From 182d919b84902eece162c63ed3d476c8016b4197 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 19 Feb 2015 23:47:31 +0000
Subject: FS-Cache: Count culled objects and objects rejected due to lack of
 space

Count the number of objects that get culled by the cache backend and the
number of objects that the cache backend declines to instantiate due to lack
of space in the cache.

These numbers are made available through /proc/fs/fscache/stats

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Steve Dickson <steved@redhat.com>
Acked-by: Jeff Layton <jeff.layton@primarydata.com>
---
 include/linux/fscache-cache.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 771484993ca7..c9dafdaf3347 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -371,6 +371,7 @@ struct fscache_object {
 #define FSCACHE_OBJECT_IS_LOOKED_UP	4	/* T if object has been looked up */
 #define FSCACHE_OBJECT_IS_AVAILABLE	5	/* T if object has become active */
 #define FSCACHE_OBJECT_RETIRED		6	/* T if object was retired on relinquishment */
+#define FSCACHE_OBJECT_KILLED_BY_CACHE	7	/* T if object was killed by the cache */
 
 	struct list_head	cache_link;	/* link in cache->object_list */
 	struct hlist_node	cookie_link;	/* link in cookie->backing_objects */
@@ -551,4 +552,15 @@ extern enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
 					       const void *data,
 					       uint16_t datalen);
 
+extern void fscache_object_retrying_stale(struct fscache_object *object);
+
+enum fscache_why_object_killed {
+	FSCACHE_OBJECT_IS_STALE,
+	FSCACHE_OBJECT_NO_SPACE,
+	FSCACHE_OBJECT_WAS_RETIRED,
+	FSCACHE_OBJECT_WAS_CULLED,
+};
+extern void fscache_object_mark_killed(struct fscache_object *object,
+				       enum fscache_why_object_killed why);
+
 #endif /* _LINUX_FSCACHE_CACHE_H */
-- 
cgit v1.2.3


From 30ceec6284129662efc3a1e7675b2bd857a046fe Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 24 Feb 2015 10:05:27 +0000
Subject: FS-Cache: When submitting an op, cancel it if the target object is
 dying

When submitting an operation, prefer to cancel the operation immediately
rather than queuing it for later processing if the object is marked as dying
(ie. the object state machine has reached the KILL_OBJECT state).

Whilst we're at it, change the series of related test_bit() calls into a
READ_ONCE() and bitwise-AND operators to reduce the number of load
instructions (test_bit() has a volatile address).

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Steve Dickson <steved@redhat.com>
Acked-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/fscache/operation.c        | 47 +++++++++++++++++++++++++++----------------
 include/linux/fscache-cache.h |  9 +++++++--
 2 files changed, 37 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 68ead8482617..dec6defe3be3 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -120,6 +120,8 @@ static void fscache_report_unexpected_submission(struct fscache_object *object,
 int fscache_submit_exclusive_op(struct fscache_object *object,
 				struct fscache_operation *op)
 {
+	const struct fscache_state *ostate;
+	unsigned long flags;
 	int ret;
 
 	_enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
@@ -132,8 +134,19 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
 	ASSERTCMP(object->n_ops, >=, object->n_exclusive);
 	ASSERT(list_empty(&op->pend_link));
 
+	ostate = object->state;
+	smp_rmb();
+
 	op->state = FSCACHE_OP_ST_PENDING;
-	if (fscache_object_is_active(object)) {
+	flags = READ_ONCE(object->flags);
+	if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) {
+		fscache_stat(&fscache_n_op_rejected);
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		ret = -ENOBUFS;
+	} else if (unlikely(fscache_cache_is_broken(object))) {
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		ret = -EIO;
+	} else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) {
 		op->object = object;
 		object->n_ops++;
 		object->n_exclusive++;	/* reads and writes must wait */
@@ -155,7 +168,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
 		/* need to issue a new write op after this */
 		clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
 		ret = 0;
-	} else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
+	} else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) {
 		op->object = object;
 		object->n_ops++;
 		object->n_exclusive++;	/* reads and writes must wait */
@@ -164,11 +177,9 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
 		fscache_stat(&fscache_n_op_pend);
 		ret = 0;
 	} else {
-		/* If we're in any other state, there must have been an I/O
-		 * error of some nature.
-		 */
-		ASSERT(test_bit(FSCACHE_IOERROR, &object->cache->flags));
-		ret = -EIO;
+		fscache_report_unexpected_submission(object, op, ostate);
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		ret = -ENOBUFS;
 	}
 
 	spin_unlock(&object->lock);
@@ -187,6 +198,7 @@ int fscache_submit_op(struct fscache_object *object,
 		      struct fscache_operation *op)
 {
 	const struct fscache_state *ostate;
+	unsigned long flags;
 	int ret;
 
 	_enter("{OBJ%x OP%x},{%u}",
@@ -204,7 +216,15 @@ int fscache_submit_op(struct fscache_object *object,
 	smp_rmb();
 
 	op->state = FSCACHE_OP_ST_PENDING;
-	if (fscache_object_is_active(object)) {
+	flags = READ_ONCE(object->flags);
+	if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) {
+		fscache_stat(&fscache_n_op_rejected);
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		ret = -ENOBUFS;
+	} else if (unlikely(fscache_cache_is_broken(object))) {
+		op->state = FSCACHE_OP_ST_CANCELLED;
+		ret = -EIO;
+	} else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) {
 		op->object = object;
 		object->n_ops++;
 
@@ -222,25 +242,18 @@ int fscache_submit_op(struct fscache_object *object,
 			fscache_run_op(object, op);
 		}
 		ret = 0;
-	} else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) {
+	} else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) {
 		op->object = object;
 		object->n_ops++;
 		atomic_inc(&op->usage);
 		list_add_tail(&op->pend_link, &object->pending_ops);
 		fscache_stat(&fscache_n_op_pend);
 		ret = 0;
-	} else if (fscache_object_is_dying(object)) {
-		fscache_stat(&fscache_n_op_rejected);
-		op->state = FSCACHE_OP_ST_CANCELLED;
-		ret = -ENOBUFS;
-	} else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
+	} else {
 		fscache_report_unexpected_submission(object, op, ostate);
 		ASSERT(!fscache_object_is_active(object));
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -ENOBUFS;
-	} else {
-		op->state = FSCACHE_OP_ST_CANCELLED;
-		ret = -ENOBUFS;
 	}
 
 	spin_unlock(&object->lock);
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index c9dafdaf3347..2e83a141e465 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -411,17 +411,22 @@ static inline bool fscache_object_is_available(struct fscache_object *object)
 	return test_bit(FSCACHE_OBJECT_IS_AVAILABLE, &object->flags);
 }
 
+static inline bool fscache_cache_is_broken(struct fscache_object *object)
+{
+	return test_bit(FSCACHE_IOERROR, &object->cache->flags);
+}
+
 static inline bool fscache_object_is_active(struct fscache_object *object)
 {
 	return fscache_object_is_available(object) &&
 		fscache_object_is_live(object) &&
-		!test_bit(FSCACHE_IOERROR, &object->cache->flags);
+		!fscache_cache_is_broken(object);
 }
 
 static inline bool fscache_object_is_dead(struct fscache_object *object)
 {
 	return fscache_object_is_dying(object) &&
-		test_bit(FSCACHE_IOERROR, &object->cache->flags);
+		fscache_cache_is_broken(object);
 }
 
 /**
-- 
cgit v1.2.3


From 87021526300f1a292dd966e141e183630ac95317 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 24 Feb 2015 10:52:51 +0000
Subject: FS-Cache: fscache_object_is_dead() has wrong logic, kill it

fscache_object_is_dead() returns true only if the object is marked dead and
the cache got an I/O error.  This should be a logical OR instead.  Since two
of the callers got split up into handling for separate subcases, expand the
other callers and kill the function.  This is probably the right thing to do
anyway since one of the subcases isn't about the object at all, but rather
about the cache.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Steve Dickson <steved@redhat.com>
Acked-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/fscache/cookie.c           | 3 ++-
 fs/fscache/page.c             | 6 ++++--
 include/linux/fscache-cache.h | 6 ------
 3 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 89acec742e0b..8de22164f5fb 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -327,7 +327,8 @@ static int fscache_alloc_object(struct fscache_cache *cache,
 
 object_already_extant:
 	ret = -ENOBUFS;
-	if (fscache_object_is_dead(object)) {
+	if (fscache_object_is_dying(object) ||
+	    fscache_cache_is_broken(object)) {
 		spin_unlock(&cookie->lock);
 		goto error;
 	}
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index de33b3fccca6..d0805e31361c 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -377,11 +377,13 @@ check_if_dead:
 		_leave(" = -ENOBUFS [cancelled]");
 		return -ENOBUFS;
 	}
-	if (unlikely(fscache_object_is_dead(object))) {
-		pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->state);
+	if (unlikely(fscache_object_is_dying(object) ||
+		     fscache_cache_is_broken(object))) {
+		enum fscache_operation_state state = op->state;
 		fscache_cancel_op(op, do_cancel);
 		if (stat_object_dead)
 			fscache_stat(stat_object_dead);
+		_leave(" = -ENOBUFS [obj dead %d]", state);
 		return -ENOBUFS;
 	}
 	return 0;
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 2e83a141e465..ca3d550da11e 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -423,12 +423,6 @@ static inline bool fscache_object_is_active(struct fscache_object *object)
 		!fscache_cache_is_broken(object);
 }
 
-static inline bool fscache_object_is_dead(struct fscache_object *object)
-{
-	return fscache_object_is_dying(object) &&
-		fscache_cache_is_broken(object);
-}
-
 /**
  * fscache_object_destroyed - Note destruction of an object in a cache
  * @cache: The cache from which the object came
-- 
cgit v1.2.3


From 1339ec98e32b4bc8efb6fbb71c006a465130aaba Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 25 Feb 2015 13:26:39 +0000
Subject: FS-Cache: Out of line fscache_operation_init()

Out of line fscache_operation_init() so that it can access internal FS-Cache
features, such as stats, in a later commit.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Steve Dickson <steved@redhat.com>
Acked-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/fscache/operation.c        | 22 ++++++++++++++++++++++
 include/linux/fscache-cache.h | 24 +++---------------------
 2 files changed, 25 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 67594a8d791a..61a6e78b85fa 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -20,6 +20,28 @@
 atomic_t fscache_op_debug_id;
 EXPORT_SYMBOL(fscache_op_debug_id);
 
+/**
+ * fscache_operation_init - Do basic initialisation of an operation
+ * @op: The operation to initialise
+ * @release: The release function to assign
+ *
+ * Do basic initialisation of an operation.  The caller must still set flags,
+ * object and processor if needed.
+ */
+void fscache_operation_init(struct fscache_operation *op,
+			    fscache_operation_processor_t processor,
+			    fscache_operation_release_t release)
+{
+	INIT_WORK(&op->work, fscache_op_work_func);
+	atomic_set(&op->usage, 1);
+	op->state = FSCACHE_OP_ST_INITIALISED;
+	op->debug_id = atomic_inc_return(&fscache_op_debug_id);
+	op->processor = processor;
+	op->release = release;
+	INIT_LIST_HEAD(&op->pend_link);
+}
+EXPORT_SYMBOL(fscache_operation_init);
+
 /**
  * fscache_enqueue_operation - Enqueue an operation for processing
  * @op: The operation to enqueue
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index ca3d550da11e..0e26d49972e3 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -119,27 +119,9 @@ extern void fscache_op_work_func(struct work_struct *work);
 extern void fscache_enqueue_operation(struct fscache_operation *);
 extern void fscache_op_complete(struct fscache_operation *, bool);
 extern void fscache_put_operation(struct fscache_operation *);
-
-/**
- * fscache_operation_init - Do basic initialisation of an operation
- * @op: The operation to initialise
- * @release: The release function to assign
- *
- * Do basic initialisation of an operation.  The caller must still set flags,
- * object and processor if needed.
- */
-static inline void fscache_operation_init(struct fscache_operation *op,
-					fscache_operation_processor_t processor,
-					fscache_operation_release_t release)
-{
-	INIT_WORK(&op->work, fscache_op_work_func);
-	atomic_set(&op->usage, 1);
-	op->state = FSCACHE_OP_ST_INITIALISED;
-	op->debug_id = atomic_inc_return(&fscache_op_debug_id);
-	op->processor = processor;
-	op->release = release;
-	INIT_LIST_HEAD(&op->pend_link);
-}
+extern void fscache_operation_init(struct fscache_operation *,
+				   fscache_operation_processor_t,
+				   fscache_operation_release_t);
 
 /*
  * data read operation
-- 
cgit v1.2.3


From d3b97ca4a99e4e6c78f5a21c968eadf5c8ba9971 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 24 Feb 2015 10:05:29 +0000
Subject: FS-Cache: The operation cancellation method needs calling in more
 places

Any time an incomplete operation is cancelled, the operation cancellation
function needs to be called to clean up.  This is currently being passed
directly to some of the functions that might want to call it, but not all.

Instead, pass the cancellation method pointer to the fscache_operation_init()
and have that cache it in the operation struct.  Further, plug in a dummy
cancellation handler if the caller declines to set one as this allows us to
call the function unconditionally (the extra overhead isn't worth bothering
about as we don't expect to be calling this typically).

The cancellation method must thence be called everywhere the CANCELLED state
is set.  Note that we call it *before* setting the CANCELLED state such that
the method can use the old state value to guide its operation.

fscache_do_cancel_retrieval() needs moving higher up in the sources so that
the init function can use it now.

Without this, the following oops may be seen:

	FS-Cache: Assertion failed
	FS-Cache: 3 == 0 is false
	------------[ cut here ]------------
	kernel BUG at ../fs/fscache/page.c:261!
	...
	RIP: 0010:[<ffffffffa0089c1b>]  fscache_release_retrieval_op+0x77/0x100
	 [<ffffffffa008853d>] fscache_put_operation+0x114/0x2da
	 [<ffffffffa008b8c2>] __fscache_read_or_alloc_pages+0x358/0x3b3
	 [<ffffffffa00b761f>] __nfs_readpages_from_fscache+0x59/0xbf [nfs]
	 [<ffffffffa00b06c5>] nfs_readpages+0x10c/0x185 [nfs]
	 [<ffffffff81124925>] ? alloc_pages_current+0x119/0x13e
	 [<ffffffff810ee5fd>] ? __page_cache_alloc+0xfb/0x10a
	 [<ffffffff810f87f8>] __do_page_cache_readahead+0x188/0x22c
	 [<ffffffff810f8b3a>] ondemand_readahead+0x29e/0x2af
	 [<ffffffff810f8c92>] page_cache_sync_readahead+0x38/0x3a
	 [<ffffffff810ef337>] generic_file_read_iter+0x1a2/0x55a
	 [<ffffffffa00a9dff>] ? nfs_revalidate_mapping+0xd6/0x288 [nfs]
	 [<ffffffffa00a6a23>] nfs_file_read+0x49/0x70 [nfs]
	 [<ffffffff811363be>] new_sync_read+0x78/0x9c
	 [<ffffffff81137164>] __vfs_read+0x13/0x38
	 [<ffffffff8113721e>] vfs_read+0x95/0x121
	 [<ffffffff811372f6>] SyS_read+0x4c/0x8a
	 [<ffffffff81557a52>] system_call_fastpath+0x12/0x17

The assertion is showing that the remaining number of pages (n_pages) is not 0
when the operation is being released.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Steve Dickson <steved@redhat.com>
Acked-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/fscache/cookie.c           |  5 ++---
 fs/fscache/internal.h         |  7 ++-----
 fs/fscache/object.c           |  3 ++-
 fs/fscache/operation.c        | 31 ++++++++++++++++++++++-------
 fs/fscache/page.c             | 46 +++++++++++++++++++++----------------------
 include/linux/fscache-cache.h |  5 +++++
 6 files changed, 57 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 8de22164f5fb..d403c69bee08 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -672,7 +672,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
 	if (!op)
 		return -ENOMEM;
 
-	fscache_operation_init(op, NULL, NULL);
+	fscache_operation_init(op, NULL, NULL, NULL);
 	op->flags = FSCACHE_OP_MYTHREAD |
 		(1 << FSCACHE_OP_WAITING) |
 		(1 << FSCACHE_OP_UNUSE_COOKIE);
@@ -696,8 +696,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
 	/* the work queue now carries its own ref on the object */
 	spin_unlock(&cookie->lock);
 
-	ret = fscache_wait_for_operation_activation(object, op,
-						    NULL, NULL, NULL);
+	ret = fscache_wait_for_operation_activation(object, op, NULL, NULL);
 	if (ret == 0) {
 		/* ask the cache to honour the operation */
 		ret = object->cache->ops->check_consistency(op);
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index a63225116db6..97ec45110957 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -124,9 +124,7 @@ extern int fscache_submit_exclusive_op(struct fscache_object *,
 				       struct fscache_operation *);
 extern int fscache_submit_op(struct fscache_object *,
 			     struct fscache_operation *);
-extern int fscache_cancel_op(struct fscache_operation *,
-			     void (*)(struct fscache_operation *),
-			     bool);
+extern int fscache_cancel_op(struct fscache_operation *, bool);
 extern void fscache_cancel_all_ops(struct fscache_object *);
 extern void fscache_abort_object(struct fscache_object *);
 extern void fscache_start_operations(struct fscache_object *);
@@ -139,8 +137,7 @@ extern int fscache_wait_for_deferred_lookup(struct fscache_cookie *);
 extern int fscache_wait_for_operation_activation(struct fscache_object *,
 						 struct fscache_operation *,
 						 atomic_t *,
-						 atomic_t *,
-						 void (*)(struct fscache_operation *));
+						 atomic_t *);
 extern void fscache_invalidate_writes(struct fscache_cookie *);
 
 /*
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 40049f7505f0..9e792e30f4db 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -961,7 +961,8 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj
 	if (!op)
 		goto nomem;
 
-	fscache_operation_init(op, object->cache->ops->invalidate_object, NULL);
+	fscache_operation_init(op, object->cache->ops->invalidate_object,
+			       NULL, NULL);
 	op->flags = FSCACHE_OP_ASYNC |
 		(1 << FSCACHE_OP_EXCLUSIVE) |
 		(1 << FSCACHE_OP_UNUSE_COOKIE);
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index c76c09730768..57d4abb68656 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -20,6 +20,10 @@
 atomic_t fscache_op_debug_id;
 EXPORT_SYMBOL(fscache_op_debug_id);
 
+static void fscache_operation_dummy_cancel(struct fscache_operation *op)
+{
+}
+
 /**
  * fscache_operation_init - Do basic initialisation of an operation
  * @op: The operation to initialise
@@ -30,6 +34,7 @@ EXPORT_SYMBOL(fscache_op_debug_id);
  */
 void fscache_operation_init(struct fscache_operation *op,
 			    fscache_operation_processor_t processor,
+			    fscache_operation_cancel_t cancel,
 			    fscache_operation_release_t release)
 {
 	INIT_WORK(&op->work, fscache_op_work_func);
@@ -37,6 +42,7 @@ void fscache_operation_init(struct fscache_operation *op,
 	op->state = FSCACHE_OP_ST_INITIALISED;
 	op->debug_id = atomic_inc_return(&fscache_op_debug_id);
 	op->processor = processor;
+	op->cancel = cancel ?: fscache_operation_dummy_cancel;
 	op->release = release;
 	INIT_LIST_HEAD(&op->pend_link);
 	fscache_stat(&fscache_n_op_initialised);
@@ -164,9 +170,11 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
 	flags = READ_ONCE(object->flags);
 	if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) {
 		fscache_stat(&fscache_n_op_rejected);
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -ENOBUFS;
 	} else if (unlikely(fscache_cache_is_broken(object))) {
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -EIO;
 	} else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) {
@@ -200,10 +208,12 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
 		fscache_stat(&fscache_n_op_pend);
 		ret = 0;
 	} else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) {
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -ENOBUFS;
 	} else {
 		fscache_report_unexpected_submission(object, op, ostate);
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -ENOBUFS;
 	}
@@ -245,9 +255,11 @@ int fscache_submit_op(struct fscache_object *object,
 	flags = READ_ONCE(object->flags);
 	if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) {
 		fscache_stat(&fscache_n_op_rejected);
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -ENOBUFS;
 	} else if (unlikely(fscache_cache_is_broken(object))) {
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -EIO;
 	} else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) {
@@ -276,11 +288,13 @@ int fscache_submit_op(struct fscache_object *object,
 		fscache_stat(&fscache_n_op_pend);
 		ret = 0;
 	} else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) {
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -ENOBUFS;
 	} else {
 		fscache_report_unexpected_submission(object, op, ostate);
 		ASSERT(!fscache_object_is_active(object));
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		ret = -ENOBUFS;
 	}
@@ -335,7 +349,6 @@ void fscache_start_operations(struct fscache_object *object)
  * cancel an operation that's pending on an object
  */
 int fscache_cancel_op(struct fscache_operation *op,
-		      void (*do_cancel)(struct fscache_operation *),
 		      bool cancel_in_progress_op)
 {
 	struct fscache_object *object = op->object;
@@ -355,9 +368,9 @@ int fscache_cancel_op(struct fscache_operation *op,
 		ASSERT(!list_empty(&op->pend_link));
 		list_del_init(&op->pend_link);
 		put = true;
+
 		fscache_stat(&fscache_n_op_cancelled);
-		if (do_cancel)
-			do_cancel(op);
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
 			object->n_exclusive--;
@@ -373,8 +386,7 @@ int fscache_cancel_op(struct fscache_operation *op,
 			fscache_start_operations(object);
 
 		fscache_stat(&fscache_n_op_cancelled);
-		if (do_cancel)
-			do_cancel(op);
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 		if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
 			object->n_exclusive--;
@@ -408,6 +420,7 @@ void fscache_cancel_all_ops(struct fscache_object *object)
 		list_del_init(&op->pend_link);
 
 		ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING);
+		op->cancel(op);
 		op->state = FSCACHE_OP_ST_CANCELLED;
 
 		if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
@@ -440,8 +453,12 @@ void fscache_op_complete(struct fscache_operation *op, bool cancelled)
 
 	spin_lock(&object->lock);
 
-	op->state = cancelled ?
-		FSCACHE_OP_ST_CANCELLED : FSCACHE_OP_ST_COMPLETE;
+	if (!cancelled) {
+		op->state = FSCACHE_OP_ST_COMPLETE;
+	} else {
+		op->cancel(op);
+		op->state = FSCACHE_OP_ST_CANCELLED;
+	}
 
 	if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
 		object->n_exclusive--;
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 7db9752252ef..d1b23a67c031 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -213,7 +213,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 		return -ENOMEM;
 	}
 
-	fscache_operation_init(op, fscache_attr_changed_op, NULL);
+	fscache_operation_init(op, fscache_attr_changed_op, NULL, NULL);
 	op->flags = FSCACHE_OP_ASYNC |
 		(1 << FSCACHE_OP_EXCLUSIVE) |
 		(1 << FSCACHE_OP_UNUSE_COOKIE);
@@ -248,6 +248,17 @@ nobufs:
 }
 EXPORT_SYMBOL(__fscache_attr_changed);
 
+/*
+ * Handle cancellation of a pending retrieval op
+ */
+static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
+{
+	struct fscache_retrieval *op =
+		container_of(_op, struct fscache_retrieval, op);
+
+	atomic_set(&op->n_pages, 0);
+}
+
 /*
  * release a retrieval op reference
  */
@@ -285,7 +296,9 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
 		return NULL;
 	}
 
-	fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
+	fscache_operation_init(&op->op, NULL,
+			       fscache_do_cancel_retrieval,
+			       fscache_release_retrieval_op);
 	op->op.flags	= FSCACHE_OP_MYTHREAD |
 		(1UL << FSCACHE_OP_WAITING) |
 		(1UL << FSCACHE_OP_UNUSE_COOKIE);
@@ -329,25 +342,13 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
 	return 0;
 }
 
-/*
- * Handle cancellation of a pending retrieval op
- */
-static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
-{
-	struct fscache_retrieval *op =
-		container_of(_op, struct fscache_retrieval, op);
-
-	atomic_set(&op->n_pages, 0);
-}
-
 /*
  * wait for an object to become active (or dead)
  */
 int fscache_wait_for_operation_activation(struct fscache_object *object,
 					  struct fscache_operation *op,
 					  atomic_t *stat_op_waits,
-					  atomic_t *stat_object_dead,
-					  void (*do_cancel)(struct fscache_operation *))
+					  atomic_t *stat_object_dead)
 {
 	int ret;
 
@@ -359,7 +360,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object,
 		fscache_stat(stat_op_waits);
 	if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
 			TASK_INTERRUPTIBLE) != 0) {
-		ret = fscache_cancel_op(op, do_cancel, false);
+		ret = fscache_cancel_op(op, false);
 		if (ret == 0)
 			return -ERESTARTSYS;
 
@@ -380,7 +381,7 @@ check_if_dead:
 	if (unlikely(fscache_object_is_dying(object) ||
 		     fscache_cache_is_broken(object))) {
 		enum fscache_operation_state state = op->state;
-		fscache_cancel_op(op, do_cancel, true);
+		fscache_cancel_op(op, true);
 		if (stat_object_dead)
 			fscache_stat(stat_object_dead);
 		_leave(" = -ENOBUFS [obj dead %d]", state);
@@ -464,8 +465,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 	ret = fscache_wait_for_operation_activation(
 		object, &op->op,
 		__fscache_stat(&fscache_n_retrieval_op_waits),
-		__fscache_stat(&fscache_n_retrievals_object_dead),
-		fscache_do_cancel_retrieval);
+		__fscache_stat(&fscache_n_retrievals_object_dead));
 	if (ret < 0)
 		goto error;
 
@@ -595,8 +595,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 	ret = fscache_wait_for_operation_activation(
 		object, &op->op,
 		__fscache_stat(&fscache_n_retrieval_op_waits),
-		__fscache_stat(&fscache_n_retrievals_object_dead),
-		fscache_do_cancel_retrieval);
+		__fscache_stat(&fscache_n_retrievals_object_dead));
 	if (ret < 0)
 		goto error;
 
@@ -702,8 +701,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
 	ret = fscache_wait_for_operation_activation(
 		object, &op->op,
 		__fscache_stat(&fscache_n_alloc_op_waits),
-		__fscache_stat(&fscache_n_allocs_object_dead),
-		fscache_do_cancel_retrieval);
+		__fscache_stat(&fscache_n_allocs_object_dead));
 	if (ret < 0)
 		goto error;
 
@@ -946,7 +944,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 	if (!op)
 		goto nomem;
 
-	fscache_operation_init(&op->op, fscache_write_op,
+	fscache_operation_init(&op->op, fscache_write_op, NULL,
 			       fscache_release_write_op);
 	op->op.flags = FSCACHE_OP_ASYNC |
 		(1 << FSCACHE_OP_WAITING) |
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 0e26d49972e3..69c6eb10d858 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -74,6 +74,7 @@ extern wait_queue_head_t fscache_cache_cleared_wq;
  */
 typedef void (*fscache_operation_release_t)(struct fscache_operation *op);
 typedef void (*fscache_operation_processor_t)(struct fscache_operation *op);
+typedef void (*fscache_operation_cancel_t)(struct fscache_operation *op);
 
 enum fscache_operation_state {
 	FSCACHE_OP_ST_BLANK,		/* Op is not yet submitted */
@@ -109,6 +110,9 @@ struct fscache_operation {
 	 *   the op in a non-pool thread */
 	fscache_operation_processor_t processor;
 
+	/* Operation cancellation cleanup (optional) */
+	fscache_operation_cancel_t cancel;
+
 	/* operation releaser */
 	fscache_operation_release_t release;
 };
@@ -121,6 +125,7 @@ extern void fscache_op_complete(struct fscache_operation *, bool);
 extern void fscache_put_operation(struct fscache_operation *);
 extern void fscache_operation_init(struct fscache_operation *,
 				   fscache_operation_processor_t,
+				   fscache_operation_cancel_t,
 				   fscache_operation_release_t);
 
 /*
-- 
cgit v1.2.3


From 4a47132ff472a0c2c5441baeb50cf97f2580bc43 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 24 Feb 2015 10:05:29 +0000
Subject: FS-Cache: Retain the netfs context in the retrieval op earlier

Now that the retrieval operation may be disposed of by fscache_put_operation()
before we actually set the context, the retrieval-specific cleanup operation
can produce a NULL-pointer dereference when it tries to unconditionally clean
up the netfs context.

Given that it is expected that we'll get at least as far as the place where we
currently set the context pointer and it is unlikely we'll go through the
error handling paths prior to that point, retain the context right from the
point that the retrieval op is allocated.

Concomitant to this, we need to retain the cookie pointer in the retrieval op
also so that we can call the netfs to release its context in the release
method.

In addition, we might now get into fscache_release_retrieval_op() with the op
only initialised.  To this end, set the operation to DEAD only after the
release method has been called and skip the n_pages test upon cleanup if the
op is still in the INITIALISED state.

Without these changes, the following oops might be seen:

	BUG: unable to handle kernel NULL pointer dereference at 00000000000000b8
	...
	RIP: 0010:[<ffffffffa0089c98>] fscache_release_retrieval_op+0xae/0x100
	...
	Call Trace:
	 [<ffffffffa0088560>] fscache_put_operation+0x117/0x2e0
	 [<ffffffffa008b8f5>] __fscache_read_or_alloc_pages+0x351/0x3ac
	 [<ffffffffa00b761f>] __nfs_readpages_from_fscache+0x59/0xbf [nfs]
	 [<ffffffffa00b06c5>] nfs_readpages+0x10c/0x185 [nfs]
	 [<ffffffff81124925>] ? alloc_pages_current+0x119/0x13e
	 [<ffffffff810ee5fd>] ? __page_cache_alloc+0xfb/0x10a
	 [<ffffffff810f87f8>] __do_page_cache_readahead+0x188/0x22c
	 [<ffffffff810f8b3a>] ondemand_readahead+0x29e/0x2af
	 [<ffffffff810f8c92>] page_cache_sync_readahead+0x38/0x3a
	 [<ffffffff810ef337>] generic_file_read_iter+0x1a2/0x55a
	 [<ffffffffa00a9dff>] ? nfs_revalidate_mapping+0xd6/0x288 [nfs]
	 [<ffffffffa00a6a23>] nfs_file_read+0x49/0x70 [nfs]
	 [<ffffffff811363be>] new_sync_read+0x78/0x9c
	 [<ffffffff81137164>] __vfs_read+0x13/0x38
	 [<ffffffff8113721e>] vfs_read+0x95/0x121
	 [<ffffffff811372f6>] SyS_read+0x4c/0x8a
	 [<ffffffff81557a52>] system_call_fastpath+0x12/0x17

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Steve Dickson <steved@redhat.com>
Acked-by: Jeff Layton <jeff.layton@primarydata.com>
---
 fs/fscache/operation.c        |  2 +-
 fs/fscache/page.c             | 20 ++++++++++----------
 include/linux/fscache-cache.h |  1 +
 3 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 57d4abb68656..de67745e1cd7 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -492,7 +492,6 @@ void fscache_put_operation(struct fscache_operation *op)
 	ASSERTIFCMP(op->state != FSCACHE_OP_ST_INITIALISED &&
 		    op->state != FSCACHE_OP_ST_COMPLETE,
 		    op->state, ==, FSCACHE_OP_ST_CANCELLED);
-	op->state = FSCACHE_OP_ST_DEAD;
 
 	fscache_stat(&fscache_n_op_release);
 
@@ -500,6 +499,7 @@ void fscache_put_operation(struct fscache_operation *op)
 		op->release(op);
 		op->release = NULL;
 	}
+	op->state = FSCACHE_OP_ST_DEAD;
 
 	object = op->object;
 	if (likely(object)) {
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index d1b23a67c031..483bbc613bf0 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -269,11 +269,12 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
 
 	_enter("{OP%x}", op->op.debug_id);
 
-	ASSERTCMP(atomic_read(&op->n_pages), ==, 0);
+	ASSERTIFCMP(op->op.state != FSCACHE_OP_ST_INITIALISED,
+		    atomic_read(&op->n_pages), ==, 0);
 
 	fscache_hist(fscache_retrieval_histogram, op->start_time);
 	if (op->context)
-		fscache_put_context(op->op.object->cookie, op->context);
+		fscache_put_context(op->cookie, op->context);
 
 	_leave("");
 }
@@ -302,11 +303,18 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
 	op->op.flags	= FSCACHE_OP_MYTHREAD |
 		(1UL << FSCACHE_OP_WAITING) |
 		(1UL << FSCACHE_OP_UNUSE_COOKIE);
+	op->cookie	= cookie;
 	op->mapping	= mapping;
 	op->end_io_func	= end_io_func;
 	op->context	= context;
 	op->start_time	= jiffies;
 	INIT_LIST_HEAD(&op->to_do);
+
+	/* Pin the netfs read context in case we need to do the actual netfs
+	 * read because we've encountered a cache read failure.
+	 */
+	if (context)
+		fscache_get_context(op->cookie, context);
 	return op;
 }
 
@@ -456,10 +464,6 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
 
 	fscache_stat(&fscache_n_retrieval_ops);
 
-	/* pin the netfs read context in case we need to do the actual netfs
-	 * read because we've encountered a cache read failure */
-	fscache_get_context(object->cookie, op->context);
-
 	/* we wait for the operation to become active, and then process it
 	 * *here*, in this thread, and not in the thread pool */
 	ret = fscache_wait_for_operation_activation(
@@ -586,10 +590,6 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
 
 	fscache_stat(&fscache_n_retrieval_ops);
 
-	/* pin the netfs read context in case we need to do the actual netfs
-	 * read because we've encountered a cache read failure */
-	fscache_get_context(object->cookie, op->context);
-
 	/* we wait for the operation to become active, and then process it
 	 * *here*, in this thread, and not in the thread pool */
 	ret = fscache_wait_for_operation_activation(
diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h
index 69c6eb10d858..604e1526cd00 100644
--- a/include/linux/fscache-cache.h
+++ b/include/linux/fscache-cache.h
@@ -133,6 +133,7 @@ extern void fscache_operation_init(struct fscache_operation *,
  */
 struct fscache_retrieval {
 	struct fscache_operation op;
+	struct fscache_cookie	*cookie;	/* The netfs cookie */
 	struct address_space	*mapping;	/* netfs pages */
 	fscache_rw_complete_t	end_io_func;	/* function to call on I/O completion */
 	void			*context;	/* netfs read context (pinned) */
-- 
cgit v1.2.3


From fb7737e949e31d8a71acee6bbb670f32dbd2a2c0 Mon Sep 17 00:00:00 2001
From: Suman Anna <s-anna@ti.com>
Date: Wed, 4 Mar 2015 20:01:14 -0600
Subject: hwspinlock/core: add device tree support

This patch adds a new OF-friendly API of_hwspin_lock_get_id()
for hwspinlock clients to use/request locks from a hwspinlock
device instantiated through a device-tree blob. This new API
can be used by hwspinlock clients to get the id for a specific
lock using the phandle + args specifier, so that it can be
requested using the available hwspin_lock_request_specific()
API.

Signed-off-by: Suman Anna <s-anna@ti.com>
Reviewed-by: Bjorn Andersson <bjorn.andersson@sonymobile.com>
[small comment clarification]
Signed-off-by: Ohad Ben-Cohen <ohad@wizery.com>
---
 Documentation/hwspinlock.txt         | 10 +++++
 drivers/hwspinlock/hwspinlock_core.c | 79 ++++++++++++++++++++++++++++++++++++
 include/linux/hwspinlock.h           |  7 ++++
 3 files changed, 96 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/hwspinlock.txt b/Documentation/hwspinlock.txt
index 62f7d4ea6e26..61c1ee98e59f 100644
--- a/Documentation/hwspinlock.txt
+++ b/Documentation/hwspinlock.txt
@@ -48,6 +48,16 @@ independent, drivers.
      ids for predefined purposes.
      Should be called from a process context (might sleep).
 
+  int of_hwspin_lock_get_id(struct device_node *np, int index);
+   - retrieve the global lock id for an OF phandle-based specific lock.
+     This function provides a means for DT users of a hwspinlock module
+     to get the global lock id of a specific hwspinlock, so that it can
+     be requested using the normal hwspin_lock_request_specific() API.
+     The function returns a lock id number on success, -EPROBE_DEFER if
+     the hwspinlock device is not yet registered with the core, or other
+     error values.
+     Should be called from a process context (might sleep).
+
   int hwspin_lock_free(struct hwspinlock *hwlock);
    - free a previously-assigned hwspinlock; returns 0 on success, or an
      appropriate error code on failure (e.g. -EINVAL if the hwspinlock
diff --git a/drivers/hwspinlock/hwspinlock_core.c b/drivers/hwspinlock/hwspinlock_core.c
index 461a0d739d75..52f708bcf77f 100644
--- a/drivers/hwspinlock/hwspinlock_core.c
+++ b/drivers/hwspinlock/hwspinlock_core.c
@@ -27,6 +27,7 @@
 #include <linux/hwspinlock.h>
 #include <linux/pm_runtime.h>
 #include <linux/mutex.h>
+#include <linux/of.h>
 
 #include "hwspinlock_internal.h"
 
@@ -257,6 +258,84 @@ void __hwspin_unlock(struct hwspinlock *hwlock, int mode, unsigned long *flags)
 }
 EXPORT_SYMBOL_GPL(__hwspin_unlock);
 
+/**
+ * of_hwspin_lock_simple_xlate - translate hwlock_spec to return a lock id
+ * @bank: the hwspinlock device bank
+ * @hwlock_spec: hwlock specifier as found in the device tree
+ *
+ * This is a simple translation function, suitable for hwspinlock platform
+ * drivers that only has a lock specifier length of 1.
+ *
+ * Returns a relative index of the lock within a specified bank on success,
+ * or -EINVAL on invalid specifier cell count.
+ */
+static inline int
+of_hwspin_lock_simple_xlate(const struct of_phandle_args *hwlock_spec)
+{
+	if (WARN_ON(hwlock_spec->args_count != 1))
+		return -EINVAL;
+
+	return hwlock_spec->args[0];
+}
+
+/**
+ * of_hwspin_lock_get_id() - get lock id for an OF phandle-based specific lock
+ * @np: device node from which to request the specific hwlock
+ * @index: index of the hwlock in the list of values
+ *
+ * This function provides a means for DT users of the hwspinlock module to
+ * get the global lock id of a specific hwspinlock using the phandle of the
+ * hwspinlock device, so that it can be requested using the normal
+ * hwspin_lock_request_specific() API.
+ *
+ * Returns the global lock id number on success, -EPROBE_DEFER if the hwspinlock
+ * device is not yet registered, -EINVAL on invalid args specifier value or an
+ * appropriate error as returned from the OF parsing of the DT client node.
+ */
+int of_hwspin_lock_get_id(struct device_node *np, int index)
+{
+	struct of_phandle_args args;
+	struct hwspinlock *hwlock;
+	struct radix_tree_iter iter;
+	void **slot;
+	int id;
+	int ret;
+
+	ret = of_parse_phandle_with_args(np, "hwlocks", "#hwlock-cells", index,
+					 &args);
+	if (ret)
+		return ret;
+
+	/* Find the hwspinlock device: we need its base_id */
+	ret = -EPROBE_DEFER;
+	rcu_read_lock();
+	radix_tree_for_each_slot(slot, &hwspinlock_tree, &iter, 0) {
+		hwlock = radix_tree_deref_slot(slot);
+		if (unlikely(!hwlock))
+			continue;
+
+		if (hwlock->bank->dev->of_node == args.np) {
+			ret = 0;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	if (ret < 0)
+		goto out;
+
+	id = of_hwspin_lock_simple_xlate(&args);
+	if (id < 0 || id >= hwlock->bank->num_locks) {
+		ret = -EINVAL;
+		goto out;
+	}
+	id += hwlock->bank->base_id;
+
+out:
+	of_node_put(args.np);
+	return ret ? ret : id;
+}
+EXPORT_SYMBOL_GPL(of_hwspin_lock_get_id);
+
 static int hwspin_lock_register_single(struct hwspinlock *hwlock, int id)
 {
 	struct hwspinlock *tmp;
diff --git a/include/linux/hwspinlock.h b/include/linux/hwspinlock.h
index 3343298e40e8..859d673d98c8 100644
--- a/include/linux/hwspinlock.h
+++ b/include/linux/hwspinlock.h
@@ -26,6 +26,7 @@
 #define HWLOCK_IRQ	0x02	/* Disable interrupts, don't save state */
 
 struct device;
+struct device_node;
 struct hwspinlock;
 struct hwspinlock_device;
 struct hwspinlock_ops;
@@ -66,6 +67,7 @@ int hwspin_lock_unregister(struct hwspinlock_device *bank);
 struct hwspinlock *hwspin_lock_request(void);
 struct hwspinlock *hwspin_lock_request_specific(unsigned int id);
 int hwspin_lock_free(struct hwspinlock *hwlock);
+int of_hwspin_lock_get_id(struct device_node *np, int index);
 int hwspin_lock_get_id(struct hwspinlock *hwlock);
 int __hwspin_lock_timeout(struct hwspinlock *, unsigned int, int,
 							unsigned long *);
@@ -120,6 +122,11 @@ void __hwspin_unlock(struct hwspinlock *hwlock, int mode, unsigned long *flags)
 {
 }
 
+static inline int of_hwspin_lock_get_id(struct device_node *np, int index)
+{
+	return 0;
+}
+
 static inline int hwspin_lock_get_id(struct hwspinlock *hwlock)
 {
 	return 0;
-- 
cgit v1.2.3


From 1b852bceb0d111e510d1a15826ecc4a19358d512 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 8 May 2015 23:22:29 -0500
Subject: mnt: Refactor the logic for mounting sysfs and proc in a user
 namespace

Fresh mounts of proc and sysfs are a very special case that works very
much like a bind mount.  Unfortunately the current structure can not
preserve the MNT_LOCK... mount flags.  Therefore refactor the logic
into a form that can be modified to preserve those lock bits.

Add a new filesystem flag FS_USERNS_VISIBLE that requires some mount
of the filesystem be fully visible in the current mount namespace,
before the filesystem may be mounted.

Move the logic for calling fs_fully_visible from proc and sysfs into
fs/namespace.c where it has greater access to mount namespace state.

Cc: stable@vger.kernel.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/namespace.c     | 8 +++++++-
 fs/proc/root.c     | 5 +----
 fs/sysfs/mount.c   | 5 +----
 include/linux/fs.h | 2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index 1b9e11167bae..8e7edaf60fe1 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2332,6 +2332,8 @@ unlock:
 	return err;
 }
 
+static bool fs_fully_visible(struct file_system_type *fs_type);
+
 /*
  * create a new mount for userspace and request it to be added into the
  * namespace's tree
@@ -2363,6 +2365,10 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
 			flags |= MS_NODEV;
 			mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
 		}
+		if (type->fs_flags & FS_USERNS_VISIBLE) {
+			if (!fs_fully_visible(type))
+				return -EPERM;
+		}
 	}
 
 	mnt = vfs_kern_mount(type, flags, name, data);
@@ -3164,7 +3170,7 @@ bool current_chrooted(void)
 	return chrooted;
 }
 
-bool fs_fully_visible(struct file_system_type *type)
+static bool fs_fully_visible(struct file_system_type *type)
 {
 	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 	struct mount *mnt;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index b7fa4bfe896a..64e1ab64bde6 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -112,9 +112,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 		ns = task_active_pid_ns(current);
 		options = data;
 
-		if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
-			return ERR_PTR(-EPERM);
-
 		/* Does the mounter have privilege over the pid namespace? */
 		if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
 			return ERR_PTR(-EPERM);
@@ -159,7 +156,7 @@ static struct file_system_type proc_fs_type = {
 	.name		= "proc",
 	.mount		= proc_mount,
 	.kill_sb	= proc_kill_sb,
-	.fs_flags	= FS_USERNS_MOUNT,
+	.fs_flags	= FS_USERNS_VISIBLE | FS_USERNS_MOUNT,
 };
 
 void __init proc_root_init(void)
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 8a49486bf30c..1c6ac6fcee9f 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -31,9 +31,6 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	bool new_sb;
 
 	if (!(flags & MS_KERNMOUNT)) {
-		if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
-			return ERR_PTR(-EPERM);
-
 		if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
 			return ERR_PTR(-EPERM);
 	}
@@ -58,7 +55,7 @@ static struct file_system_type sysfs_fs_type = {
 	.name		= "sysfs",
 	.mount		= sysfs_mount,
 	.kill_sb	= sysfs_kill_sb,
-	.fs_flags	= FS_USERNS_MOUNT,
+	.fs_flags	= FS_USERNS_VISIBLE | FS_USERNS_MOUNT,
 };
 
 int __init sysfs_init(void)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 35ec87e490b1..2d24eeb8e59c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1897,6 +1897,7 @@ struct file_system_type {
 #define FS_HAS_SUBTYPE		4
 #define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
 #define FS_USERNS_DEV_MOUNT	16 /* A userns mount does not imply MNT_NODEV */
+#define FS_USERNS_VISIBLE	32	/* FS must already be visible */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	struct dentry *(*mount) (struct file_system_type *, int,
 		       const char *, void *);
@@ -1984,7 +1985,6 @@ extern int vfs_ustat(dev_t, struct kstatfs *);
 extern int freeze_super(struct super_block *super);
 extern int thaw_super(struct super_block *super);
 extern bool our_mnt(struct vfsmount *mnt);
-extern bool fs_fully_visible(struct file_system_type *);
 
 extern int current_umask(void);
 
-- 
cgit v1.2.3


From 10081fb532a2a2216b7d8e4ad585c985075b6f60 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Fri, 1 May 2015 15:23:50 +0900
Subject: lib: introduce crc_t10dif_update()

This introduces crc_t10dif_update() which enables to calculate CRC
for a block which straddles multiple SG elements by calling multiple
times.  This also converts crc_t10dif() to use crc_t10dif_update() as
they are almost same.

(remove extra function call in crc_t10dif() and crc_t10dif_update -
 Tim + Herbert)

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: linux-crypto@vger.kernel.org
Cc: Nicholas Bellinger <nab@linux-iscsi.org>
Cc: Sagi Grimberg <sagig@mellanox.com>
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: target-devel@vger.kernel.org
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 include/linux/crc-t10dif.h |  1 +
 lib/crc-t10dif.c           | 12 +++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crc-t10dif.h b/include/linux/crc-t10dif.h
index cf53d0773ce3..d81961e9e37d 100644
--- a/include/linux/crc-t10dif.h
+++ b/include/linux/crc-t10dif.h
@@ -9,5 +9,6 @@
 extern __u16 crc_t10dif_generic(__u16 crc, const unsigned char *buffer,
 				size_t len);
 extern __u16 crc_t10dif(unsigned char const *, size_t);
+extern __u16 crc_t10dif_update(__u16 crc, unsigned char const *, size_t);
 
 #endif
diff --git a/lib/crc-t10dif.c b/lib/crc-t10dif.c
index dfe6ec17c0a5..1ad33e555805 100644
--- a/lib/crc-t10dif.c
+++ b/lib/crc-t10dif.c
@@ -19,7 +19,7 @@
 static struct crypto_shash *crct10dif_tfm;
 static struct static_key crct10dif_fallback __read_mostly;
 
-__u16 crc_t10dif(const unsigned char *buffer, size_t len)
+__u16 crc_t10dif_update(__u16 crc, const unsigned char *buffer, size_t len)
 {
 	struct {
 		struct shash_desc shash;
@@ -28,17 +28,23 @@ __u16 crc_t10dif(const unsigned char *buffer, size_t len)
 	int err;
 
 	if (static_key_false(&crct10dif_fallback))
-		return crc_t10dif_generic(0, buffer, len);
+		return crc_t10dif_generic(crc, buffer, len);
 
 	desc.shash.tfm = crct10dif_tfm;
 	desc.shash.flags = 0;
-	*(__u16 *)desc.ctx = 0;
+	*(__u16 *)desc.ctx = crc;
 
 	err = crypto_shash_update(&desc.shash, buffer, len);
 	BUG_ON(err);
 
 	return *(__u16 *)desc.ctx;
 }
+EXPORT_SYMBOL(crc_t10dif_update);
+
+__u16 crc_t10dif(const unsigned char *buffer, size_t len)
+{
+	return crc_t10dif_update(0, buffer, len);
+}
 EXPORT_SYMBOL(crc_t10dif);
 
 static int __init crc_t10dif_mod_init(void)
-- 
cgit v1.2.3


From cf561f0d2eb74574ad9985a2feab134267a9d298 Mon Sep 17 00:00:00 2001
From: Greg Kurz <gkurz@linux.vnet.ibm.com>
Date: Fri, 24 Apr 2015 14:24:27 +0200
Subject: virtio: introduce virtio_is_little_endian() helper

Signed-off-by: Greg Kurz <gkurz@linux.vnet.ibm.com>

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 include/linux/virtio_config.h | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 1e306f727edc..170947eb11b5 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -205,35 +205,40 @@ int virtqueue_set_affinity(struct virtqueue *vq, int cpu)
 	return 0;
 }
 
+static inline bool virtio_is_little_endian(struct virtio_device *vdev)
+{
+	return virtio_has_feature(vdev, VIRTIO_F_VERSION_1);
+}
+
 /* Memory accessors */
 static inline u16 virtio16_to_cpu(struct virtio_device *vdev, __virtio16 val)
 {
-	return __virtio16_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val);
+	return __virtio16_to_cpu(virtio_is_little_endian(vdev), val);
 }
 
 static inline __virtio16 cpu_to_virtio16(struct virtio_device *vdev, u16 val)
 {
-	return __cpu_to_virtio16(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val);
+	return __cpu_to_virtio16(virtio_is_little_endian(vdev), val);
 }
 
 static inline u32 virtio32_to_cpu(struct virtio_device *vdev, __virtio32 val)
 {
-	return __virtio32_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val);
+	return __virtio32_to_cpu(virtio_is_little_endian(vdev), val);
 }
 
 static inline __virtio32 cpu_to_virtio32(struct virtio_device *vdev, u32 val)
 {
-	return __cpu_to_virtio32(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val);
+	return __cpu_to_virtio32(virtio_is_little_endian(vdev), val);
 }
 
 static inline u64 virtio64_to_cpu(struct virtio_device *vdev, __virtio64 val)
 {
-	return __virtio64_to_cpu(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val);
+	return __virtio64_to_cpu(virtio_is_little_endian(vdev), val);
 }
 
 static inline __virtio64 cpu_to_virtio64(struct virtio_device *vdev, u64 val)
 {
-	return __cpu_to_virtio64(virtio_has_feature(vdev, VIRTIO_F_VERSION_1), val);
+	return __cpu_to_virtio64(virtio_is_little_endian(vdev), val);
 }
 
 /* Config space accessors. */
-- 
cgit v1.2.3


From 5da7b160b36a42f161980c5e20d3960d6d076fb8 Mon Sep 17 00:00:00 2001
From: Greg Kurz <gkurz@linux.vnet.ibm.com>
Date: Fri, 24 Apr 2015 14:24:58 +0200
Subject: vringh: introduce vringh_is_little_endian() helper

Signed-off-by: Greg Kurz <gkurz@linux.vnet.ibm.com>

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 include/linux/vringh.h | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vringh.h b/include/linux/vringh.h
index a3fa537e717a..3ed62efc2bc5 100644
--- a/include/linux/vringh.h
+++ b/include/linux/vringh.h
@@ -226,33 +226,38 @@ static inline void vringh_notify(struct vringh *vrh)
 		vrh->notify(vrh);
 }
 
+static inline bool vringh_is_little_endian(const struct vringh *vrh)
+{
+	return vrh->little_endian;
+}
+
 static inline u16 vringh16_to_cpu(const struct vringh *vrh, __virtio16 val)
 {
-	return __virtio16_to_cpu(vrh->little_endian, val);
+	return __virtio16_to_cpu(vringh_is_little_endian(vrh), val);
 }
 
 static inline __virtio16 cpu_to_vringh16(const struct vringh *vrh, u16 val)
 {
-	return __cpu_to_virtio16(vrh->little_endian, val);
+	return __cpu_to_virtio16(vringh_is_little_endian(vrh), val);
 }
 
 static inline u32 vringh32_to_cpu(const struct vringh *vrh, __virtio32 val)
 {
-	return __virtio32_to_cpu(vrh->little_endian, val);
+	return __virtio32_to_cpu(vringh_is_little_endian(vrh), val);
 }
 
 static inline __virtio32 cpu_to_vringh32(const struct vringh *vrh, u32 val)
 {
-	return __cpu_to_virtio32(vrh->little_endian, val);
+	return __cpu_to_virtio32(vringh_is_little_endian(vrh), val);
 }
 
 static inline u64 vringh64_to_cpu(const struct vringh *vrh, __virtio64 val)
 {
-	return __virtio64_to_cpu(vrh->little_endian, val);
+	return __virtio64_to_cpu(vringh_is_little_endian(vrh), val);
 }
 
 static inline __virtio64 cpu_to_vringh64(const struct vringh *vrh, u64 val)
 {
-	return __cpu_to_virtio64(vrh->little_endian, val);
+	return __cpu_to_virtio64(vringh_is_little_endian(vrh), val);
 }
 #endif /* _LINUX_VRINGH_H */
-- 
cgit v1.2.3


From 7d82410950aa74adccf035c332e409af2bb93e92 Mon Sep 17 00:00:00 2001
From: Greg Kurz <gkurz@linux.vnet.ibm.com>
Date: Fri, 24 Apr 2015 14:26:24 +0200
Subject: virtio: add explicit big-endian support to memory accessors

The current memory accessors logic is:
- little endian if little_endian
- native endian (i.e. no byteswap) if !little_endian

If we want to fully support cross-endian vhost, we also need to be
able to convert to big endian.

Instead of changing the little_endian argument to some 3-value enum, this
patch changes the logic to:
- little endian if little_endian
- big endian if !little_endian

The native endian case is handled by all users with a trivial helper. This
patch doesn't change any functionality, nor it does add overhead.

Signed-off-by: Greg Kurz <gkurz@linux.vnet.ibm.com>

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 drivers/net/macvtap.c            |  3 ++-
 drivers/net/tun.c                |  3 ++-
 drivers/vhost/vhost.h            |  3 ++-
 include/linux/virtio_byteorder.h | 24 ++++++++++++++----------
 include/linux/virtio_config.h    |  3 ++-
 include/linux/vringh.h           |  3 ++-
 6 files changed, 24 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 4fdb88102082..072ccbaf7c45 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -51,7 +51,8 @@ struct macvtap_queue {
 
 static inline bool macvtap_is_little_endian(struct macvtap_queue *q)
 {
-	return q->flags & MACVTAP_VNET_LE;
+	return q->flags & MACVTAP_VNET_LE ||
+		virtio_legacy_is_little_endian();
 }
 
 static inline u16 macvtap16_to_cpu(struct macvtap_queue *q, __virtio16 val)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 301e87c89a9f..b210139bef8f 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -208,7 +208,8 @@ struct tun_struct {
 
 static inline bool tun_is_little_endian(struct tun_struct *tun)
 {
-	return tun->flags & TUN_VNET_LE;
+	return tun->flags & TUN_VNET_LE ||
+		virtio_legacy_is_little_endian();
 }
 
 static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val)
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 6a499603d856..a4fa33a79bf2 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -175,7 +175,8 @@ static inline bool vhost_has_feature(struct vhost_virtqueue *vq, int bit)
 
 static inline bool vhost_is_little_endian(struct vhost_virtqueue *vq)
 {
-	return vhost_has_feature(vq, VIRTIO_F_VERSION_1);
+	return vhost_has_feature(vq, VIRTIO_F_VERSION_1) ||
+		virtio_legacy_is_little_endian();
 }
 
 /* Memory accessors */
diff --git a/include/linux/virtio_byteorder.h b/include/linux/virtio_byteorder.h
index 51865d05b267..ce63a2c3a612 100644
--- a/include/linux/virtio_byteorder.h
+++ b/include/linux/virtio_byteorder.h
@@ -3,17 +3,21 @@
 #include <linux/types.h>
 #include <uapi/linux/virtio_types.h>
 
-/*
- * Low-level memory accessors for handling virtio in modern little endian and in
- * compatibility native endian format.
- */
+static inline bool virtio_legacy_is_little_endian(void)
+{
+#ifdef __LITTLE_ENDIAN
+	return true;
+#else
+	return false;
+#endif
+}
 
 static inline u16 __virtio16_to_cpu(bool little_endian, __virtio16 val)
 {
 	if (little_endian)
 		return le16_to_cpu((__force __le16)val);
 	else
-		return (__force u16)val;
+		return be16_to_cpu((__force __be16)val);
 }
 
 static inline __virtio16 __cpu_to_virtio16(bool little_endian, u16 val)
@@ -21,7 +25,7 @@ static inline __virtio16 __cpu_to_virtio16(bool little_endian, u16 val)
 	if (little_endian)
 		return (__force __virtio16)cpu_to_le16(val);
 	else
-		return (__force __virtio16)val;
+		return (__force __virtio16)cpu_to_be16(val);
 }
 
 static inline u32 __virtio32_to_cpu(bool little_endian, __virtio32 val)
@@ -29,7 +33,7 @@ static inline u32 __virtio32_to_cpu(bool little_endian, __virtio32 val)
 	if (little_endian)
 		return le32_to_cpu((__force __le32)val);
 	else
-		return (__force u32)val;
+		return be32_to_cpu((__force __be32)val);
 }
 
 static inline __virtio32 __cpu_to_virtio32(bool little_endian, u32 val)
@@ -37,7 +41,7 @@ static inline __virtio32 __cpu_to_virtio32(bool little_endian, u32 val)
 	if (little_endian)
 		return (__force __virtio32)cpu_to_le32(val);
 	else
-		return (__force __virtio32)val;
+		return (__force __virtio32)cpu_to_be32(val);
 }
 
 static inline u64 __virtio64_to_cpu(bool little_endian, __virtio64 val)
@@ -45,7 +49,7 @@ static inline u64 __virtio64_to_cpu(bool little_endian, __virtio64 val)
 	if (little_endian)
 		return le64_to_cpu((__force __le64)val);
 	else
-		return (__force u64)val;
+		return be64_to_cpu((__force __be64)val);
 }
 
 static inline __virtio64 __cpu_to_virtio64(bool little_endian, u64 val)
@@ -53,7 +57,7 @@ static inline __virtio64 __cpu_to_virtio64(bool little_endian, u64 val)
 	if (little_endian)
 		return (__force __virtio64)cpu_to_le64(val);
 	else
-		return (__force __virtio64)val;
+		return (__force __virtio64)cpu_to_be64(val);
 }
 
 #endif /* _LINUX_VIRTIO_BYTEORDER */
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 170947eb11b5..e5ce8ab0b8b0 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -207,7 +207,8 @@ int virtqueue_set_affinity(struct virtqueue *vq, int cpu)
 
 static inline bool virtio_is_little_endian(struct virtio_device *vdev)
 {
-	return virtio_has_feature(vdev, VIRTIO_F_VERSION_1);
+	return virtio_has_feature(vdev, VIRTIO_F_VERSION_1) ||
+		virtio_legacy_is_little_endian();
 }
 
 /* Memory accessors */
diff --git a/include/linux/vringh.h b/include/linux/vringh.h
index 3ed62efc2bc5..bc6c28d04263 100644
--- a/include/linux/vringh.h
+++ b/include/linux/vringh.h
@@ -228,7 +228,8 @@ static inline void vringh_notify(struct vringh *vrh)
 
 static inline bool vringh_is_little_endian(const struct vringh *vrh)
 {
-	return vrh->little_endian;
+	return vrh->little_endian ||
+		virtio_legacy_is_little_endian();
 }
 
 static inline u16 vringh16_to_cpu(const struct vringh *vrh, __virtio16 val)
-- 
cgit v1.2.3


From 632dda833e28fe43049a01b4bc53e409176e7843 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 May 2015 14:04:50 -0400
Subject: SUNRPC: Clean up bc_send()

Clean up: Merge bc_send() into bc_svc_process().

Note: even thought this touches svc.c, it is a client-side change.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/bc_xprt.h |  1 -
 net/sunrpc/Makefile            |  2 +-
 net/sunrpc/bc_svc.c            | 63 ------------------------------------------
 net/sunrpc/svc.c               | 33 ++++++++++++++++------
 4 files changed, 26 insertions(+), 73 deletions(-)
 delete mode 100644 net/sunrpc/bc_svc.c

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h
index 2ca67b55e0fe..8df43c9f11dc 100644
--- a/include/linux/sunrpc/bc_xprt.h
+++ b/include/linux/sunrpc/bc_xprt.h
@@ -37,7 +37,6 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied);
 void xprt_free_bc_request(struct rpc_rqst *req);
 int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs);
 void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs);
-int bc_send(struct rpc_rqst *req);
 
 /*
  * Determine if a shared backchannel is in use
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 15e6f6c23c5d..1b8e68d0e690 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -15,6 +15,6 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
 	    sunrpc_syms.o cache.o rpc_pipe.o \
 	    svc_xprt.o
 sunrpc-$(CONFIG_SUNRPC_DEBUG) += debugfs.o
-sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o bc_svc.o
+sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o
 sunrpc-$(CONFIG_PROC_FS) += stats.o
 sunrpc-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/sunrpc/bc_svc.c b/net/sunrpc/bc_svc.c
deleted file mode 100644
index 15c7a8a1c24f..000000000000
--- a/net/sunrpc/bc_svc.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/******************************************************************************
-
-(c) 2007 Network Appliance, Inc.  All Rights Reserved.
-(c) 2009 NetApp.  All Rights Reserved.
-
-NetApp provides this source code under the GPL v2 License.
-The GPL v2 license is available at
-http://opensource.org/licenses/gpl-license.php.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-******************************************************************************/
-
-/*
- * The NFSv4.1 callback service helper routines.
- * They implement the transport level processing required to send the
- * reply over an existing open connection previously established by the client.
- */
-
-#include <linux/module.h>
-
-#include <linux/sunrpc/xprt.h>
-#include <linux/sunrpc/sched.h>
-#include <linux/sunrpc/bc_xprt.h>
-
-#define RPCDBG_FACILITY	RPCDBG_SVCDSP
-
-/* Empty callback ops */
-static const struct rpc_call_ops nfs41_callback_ops = {
-};
-
-
-/*
- * Send the callback reply
- */
-int bc_send(struct rpc_rqst *req)
-{
-	struct rpc_task *task;
-	int ret;
-
-	dprintk("RPC:       bc_send req= %p\n", req);
-	task = rpc_run_bc_task(req, &nfs41_callback_ops);
-	if (IS_ERR(task))
-		ret = PTR_ERR(task);
-	else {
-		WARN_ON_ONCE(atomic_read(&task->tk_count) != 1);
-		ret = task->tk_status;
-		rpc_put_task(task);
-	}
-	dprintk("RPC:       bc_send ret= %d\n", ret);
-	return ret;
-}
-
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 78974e4d9ad2..e144902d382e 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1350,6 +1350,11 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 {
 	struct kvec	*argv = &rqstp->rq_arg.head[0];
 	struct kvec	*resv = &rqstp->rq_res.head[0];
+	static const struct rpc_call_ops reply_ops = { };
+	struct rpc_task *task;
+	int error;
+
+	dprintk("svc: %s(%p)\n", __func__, req);
 
 	/* Build the svc_rqst used by the common processing routine */
 	rqstp->rq_xprt = serv->sv_bc_xprt;
@@ -1372,21 +1377,33 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 
 	/*
 	 * Skip the next two words because they've already been
-	 * processed in the trasport
+	 * processed in the transport
 	 */
 	svc_getu32(argv);	/* XID */
 	svc_getnl(argv);	/* CALLDIR */
 
-	/* Returns 1 for send, 0 for drop */
-	if (svc_process_common(rqstp, argv, resv)) {
-		memcpy(&req->rq_snd_buf, &rqstp->rq_res,
-						sizeof(req->rq_snd_buf));
-		return bc_send(req);
-	} else {
-		/* drop request */
+	/* Parse and execute the bc call */
+	if (!svc_process_common(rqstp, argv, resv)) {
+		/* Processing error: drop the request */
 		xprt_free_bc_request(req);
 		return 0;
 	}
+
+	/* Finally, send the reply synchronously */
+	memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
+	task = rpc_run_bc_task(req, &reply_ops);
+	if (IS_ERR(task)) {
+		error = PTR_ERR(task);
+		goto out;
+	}
+
+	WARN_ON_ONCE(atomic_read(&task->tk_count) != 1);
+	error = task->tk_status;
+	rpc_put_task(task);
+
+out:
+	dprintk("svc: %s(), error=%d\n", __func__, error);
+	return error;
 }
 EXPORT_SYMBOL_GPL(bc_svc_process);
 #endif /* CONFIG_SUNRPC_BACKCHANNEL */
-- 
cgit v1.2.3


From 0f41979164a52a1ca0f0a601f90000fc18e3a396 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 1 Jun 2015 22:59:08 -0400
Subject: SUNRPC: Remove unused argument 'tk_ops' in rpc_run_bc_task

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/sched.h | 3 +--
 net/sunrpc/clnt.c            | 6 ++----
 net/sunrpc/svc.c             | 3 +--
 3 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 5f1e6bd4c316..2aa68976a482 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -205,8 +205,7 @@ struct rpc_wait_queue {
  */
 struct rpc_task *rpc_new_task(const struct rpc_task_setup *);
 struct rpc_task *rpc_run_task(const struct rpc_task_setup *);
-struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
-				const struct rpc_call_ops *ops);
+struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req);
 void		rpc_put_task(struct rpc_task *);
 void		rpc_put_task_async(struct rpc_task *);
 void		rpc_exit_task(struct rpc_task *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index fa70f663eb92..f6717170480e 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1031,15 +1031,13 @@ EXPORT_SYMBOL_GPL(rpc_call_async);
  * rpc_run_bc_task - Allocate a new RPC task for backchannel use, then run
  * rpc_execute against it
  * @req: RPC request
- * @tk_ops: RPC call ops
  */
-struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
-				const struct rpc_call_ops *tk_ops)
+struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req)
 {
 	struct rpc_task *task;
 	struct xdr_buf *xbufp = &req->rq_snd_buf;
 	struct rpc_task_setup task_setup_data = {
-		.callback_ops = tk_ops,
+		.callback_ops = &rpc_default_ops,
 	};
 
 	dprintk("RPC: rpc_run_bc_task req= %p\n", req);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index e144902d382e..51c8cad5e765 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1350,7 +1350,6 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 {
 	struct kvec	*argv = &rqstp->rq_arg.head[0];
 	struct kvec	*resv = &rqstp->rq_res.head[0];
-	static const struct rpc_call_ops reply_ops = { };
 	struct rpc_task *task;
 	int error;
 
@@ -1391,7 +1390,7 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 
 	/* Finally, send the reply synchronously */
 	memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf));
-	task = rpc_run_bc_task(req, &reply_ops);
+	task = rpc_run_bc_task(req);
 	if (IS_ERR(task)) {
 		error = PTR_ERR(task);
 		goto out;
-- 
cgit v1.2.3


From 0d2a970d0ae55086520e1e58e572a7acd519429c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 4 Jun 2015 15:37:10 -0400
Subject: SUNRPC: Fix a backchannel race

We need to allow the server to send a new request immediately after we've
replied to the previous one. Right now, there is a window between the
send and the release of the old request in rpc_put_task(), where the
server could send us a new backchannel RPC call, and we have no
request to service it.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/xprt.h   |  3 ++-
 net/sunrpc/backchannel_rqst.c | 37 ++++++++++++++++++++++++-------------
 net/sunrpc/svc.c              |  6 +++++-
 3 files changed, 31 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 8b93ef53df3c..bc9c39d6d30d 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -212,7 +212,8 @@ struct rpc_xprt {
 #if defined(CONFIG_SUNRPC_BACKCHANNEL)
 	struct svc_serv		*bc_serv;       /* The RPC service which will */
 						/* process the callback */
-	unsigned int		bc_alloc_count;	/* Total number of preallocs */
+	int			bc_alloc_count;	/* Total number of preallocs */
+	atomic_t		bc_free_slots;
 	spinlock_t		bc_pa_lock;	/* Protects the preallocated
 						 * items */
 	struct list_head	bc_pa_list;	/* List of preallocated
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 1baa41099469..9825ff0f91d6 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -37,16 +37,18 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 static inline int xprt_need_to_requeue(struct rpc_xprt *xprt)
 {
-	return xprt->bc_alloc_count > 0;
+	return xprt->bc_alloc_count < atomic_read(&xprt->bc_free_slots);
 }
 
 static inline void xprt_inc_alloc_count(struct rpc_xprt *xprt, unsigned int n)
 {
+	atomic_add(n, &xprt->bc_free_slots);
 	xprt->bc_alloc_count += n;
 }
 
 static inline int xprt_dec_alloc_count(struct rpc_xprt *xprt, unsigned int n)
 {
+	atomic_sub(n, &xprt->bc_free_slots);
 	return xprt->bc_alloc_count -= n;
 }
 
@@ -232,9 +234,15 @@ static struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt, __be32 xid)
 	struct rpc_rqst *req = NULL;
 
 	dprintk("RPC:       allocate a backchannel request\n");
-	if (list_empty(&xprt->bc_pa_list))
+	if (atomic_read(&xprt->bc_free_slots) <= 0)
 		goto not_found;
-
+	if (list_empty(&xprt->bc_pa_list)) {
+		req = xprt_alloc_bc_req(xprt, GFP_ATOMIC);
+		if (!req)
+			goto not_found;
+		/* Note: this 'free' request adds it to xprt->bc_pa_list */
+		xprt_free_bc_request(req);
+	}
 	req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
 				rq_bc_pa_list);
 	req->rq_reply_bytes_recvd = 0;
@@ -260,11 +268,21 @@ void xprt_free_bc_request(struct rpc_rqst *req)
 
 	req->rq_connect_cookie = xprt->connect_cookie - 1;
 	smp_mb__before_atomic();
-	WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
 	clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
 	smp_mb__after_atomic();
 
-	if (!xprt_need_to_requeue(xprt)) {
+	/*
+	 * Return it to the list of preallocations so that it
+	 * may be reused by a new callback request.
+	 */
+	spin_lock_bh(&xprt->bc_pa_lock);
+	if (xprt_need_to_requeue(xprt)) {
+		list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
+		xprt->bc_alloc_count++;
+		req = NULL;
+	}
+	spin_unlock_bh(&xprt->bc_pa_lock);
+	if (req != NULL) {
 		/*
 		 * The last remaining session was destroyed while this
 		 * entry was in use.  Free the entry and don't attempt
@@ -275,14 +293,6 @@ void xprt_free_bc_request(struct rpc_rqst *req)
 		xprt_free_allocation(req);
 		return;
 	}
-
-	/*
-	 * Return it to the list of preallocations so that it
-	 * may be reused by a new callback request.
-	 */
-	spin_lock_bh(&xprt->bc_pa_lock);
-	list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
-	spin_unlock_bh(&xprt->bc_pa_lock);
 }
 
 /*
@@ -326,6 +336,7 @@ void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
 
 	spin_lock(&xprt->bc_pa_lock);
 	list_del(&req->rq_bc_pa_list);
+	xprt->bc_alloc_count--;
 	spin_unlock(&xprt->bc_pa_lock);
 
 	req->rq_private_buf.len = copied;
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 51c8cad5e765..b47f02f60b9c 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1351,6 +1351,7 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 	struct kvec	*argv = &rqstp->rq_arg.head[0];
 	struct kvec	*resv = &rqstp->rq_res.head[0];
 	struct rpc_task *task;
+	int proc_error;
 	int error;
 
 	dprintk("svc: %s(%p)\n", __func__, req);
@@ -1382,7 +1383,10 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
 	svc_getnl(argv);	/* CALLDIR */
 
 	/* Parse and execute the bc call */
-	if (!svc_process_common(rqstp, argv, resv)) {
+	proc_error = svc_process_common(rqstp, argv, resv);
+
+	atomic_inc(&req->rq_xprt->bc_free_slots);
+	if (!proc_error) {
 		/* Processing error: drop the request */
 		xprt_free_bc_request(req);
 		return 0;
-- 
cgit v1.2.3


From b1999477ed91c3c33891acfe0e18a4457e5c4915 Mon Sep 17 00:00:00 2001
From: Guo Zeng <Guo.Zeng@csr.com>
Date: Tue, 14 Apr 2015 11:55:55 +0000
Subject: ARM: prima2: move to use REGMAP APIs for rtciobrg

all devices behind rtciobrg needs a special way to access. currently they
are using a platform-specific API.
this patch moves to REGMAP, then clients can use regmap APIs to read/write.
for the moment, old APIs are still kept, once all clients move to regmap,
old APIs will be dropped.

this patch also does minor clean for comments, authors statement.

Signed-off-by: Guo Zeng <Guo.Zeng@csr.com>
Signed-off-by: Barry Song <Baohua.Song@csr.com>
---
 arch/arm/mach-prima2/Kconfig         |  1 +
 arch/arm/mach-prima2/rtciobrg.c      | 48 +++++++++++++++++++++++++++++++++---
 include/linux/rtc/sirfsoc_rtciobrg.h |  4 +++
 3 files changed, 50 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-prima2/Kconfig b/arch/arm/mach-prima2/Kconfig
index e03d8b5c9ad0..9ab8932403e5 100644
--- a/arch/arm/mach-prima2/Kconfig
+++ b/arch/arm/mach-prima2/Kconfig
@@ -4,6 +4,7 @@ menuconfig ARCH_SIRF
 	select ARCH_REQUIRE_GPIOLIB
 	select GENERIC_IRQ_CHIP
 	select NO_IOPORT_MAP
+	select REGMAP
 	select PINCTRL
 	select PINCTRL_SIRF
 	help
diff --git a/arch/arm/mach-prima2/rtciobrg.c b/arch/arm/mach-prima2/rtciobrg.c
index 8f66d8f7ca75..d4852d24dc7d 100644
--- a/arch/arm/mach-prima2/rtciobrg.c
+++ b/arch/arm/mach-prima2/rtciobrg.c
@@ -1,5 +1,5 @@
 /*
- * RTC I/O Bridge interfaces for CSR SiRFprimaII
+ * RTC I/O Bridge interfaces for CSR SiRFprimaII/atlas7
  * ARM access the registers of SYSRTC, GPSRTC and PWRC through this module
  *
  * Copyright (c) 2011 Cambridge Silicon Radio Limited, a CSR plc group company.
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/io.h>
+#include <linux/regmap.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/of_device.h>
@@ -66,6 +67,7 @@ u32 sirfsoc_rtc_iobrg_readl(u32 addr)
 {
 	unsigned long flags, val;
 
+	/* TODO: add hwspinlock to sync with M3 */
 	spin_lock_irqsave(&rtciobrg_lock, flags);
 
 	val = __sirfsoc_rtc_iobrg_readl(addr);
@@ -90,6 +92,7 @@ void sirfsoc_rtc_iobrg_writel(u32 val, u32 addr)
 {
 	unsigned long flags;
 
+	 /* TODO: add hwspinlock to sync with M3 */
 	spin_lock_irqsave(&rtciobrg_lock, flags);
 
 	sirfsoc_rtc_iobrg_pre_writel(val, addr);
@@ -102,6 +105,45 @@ void sirfsoc_rtc_iobrg_writel(u32 val, u32 addr)
 }
 EXPORT_SYMBOL_GPL(sirfsoc_rtc_iobrg_writel);
 
+
+static int regmap_iobg_regwrite(void *context, unsigned int reg,
+				   unsigned int val)
+{
+	sirfsoc_rtc_iobrg_writel(val, reg);
+	return 0;
+}
+
+static int regmap_iobg_regread(void *context, unsigned int reg,
+				  unsigned int *val)
+{
+	*val = (u32)sirfsoc_rtc_iobrg_readl(reg);
+	return 0;
+}
+
+static struct regmap_bus regmap_iobg = {
+	.reg_write = regmap_iobg_regwrite,
+	.reg_read = regmap_iobg_regread,
+};
+
+/**
+ * devm_regmap_init_iobg(): Initialise managed register map
+ *
+ * @iobg: Device that will be interacted with
+ * @config: Configuration for register map
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap.  The regmap will be automatically freed by the
+ * device management code.
+ */
+struct regmap *devm_regmap_init_iobg(struct device *dev,
+				    const struct regmap_config *config)
+{
+	const struct regmap_bus *bus = &regmap_iobg;
+
+	return devm_regmap_init(dev, bus, dev, config);
+}
+EXPORT_SYMBOL_GPL(devm_regmap_init_iobg);
+
 static const struct of_device_id rtciobrg_ids[] = {
 	{ .compatible = "sirf,prima2-rtciobg" },
 	{}
@@ -132,7 +174,7 @@ static int __init sirfsoc_rtciobrg_init(void)
 }
 postcore_initcall(sirfsoc_rtciobrg_init);
 
-MODULE_AUTHOR("Zhiwu Song <zhiwu.song@csr.com>, "
-		"Barry Song <baohua.song@csr.com>");
+MODULE_AUTHOR("Zhiwu Song <zhiwu.song@csr.com>");
+MODULE_AUTHOR("Barry Song <baohua.song@csr.com>");
 MODULE_DESCRIPTION("CSR SiRFprimaII rtc io bridge");
 MODULE_LICENSE("GPL v2");
diff --git a/include/linux/rtc/sirfsoc_rtciobrg.h b/include/linux/rtc/sirfsoc_rtciobrg.h
index 2c92e1c8e055..aefd997262e4 100644
--- a/include/linux/rtc/sirfsoc_rtciobrg.h
+++ b/include/linux/rtc/sirfsoc_rtciobrg.h
@@ -9,10 +9,14 @@
 #ifndef _SIRFSOC_RTC_IOBRG_H_
 #define _SIRFSOC_RTC_IOBRG_H_
 
+struct regmap_config;
+
 extern void sirfsoc_rtc_iobrg_besyncing(void);
 
 extern u32 sirfsoc_rtc_iobrg_readl(u32 addr);
 
 extern void sirfsoc_rtc_iobrg_writel(u32 val, u32 addr);
+struct regmap *devm_regmap_init_iobg(struct device *dev,
+				    const struct regmap_config *config);
 
 #endif
-- 
cgit v1.2.3


From 3c87ef6efb40f0e339d705c194b2224f854ec38e Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Wed, 3 Jun 2015 16:14:25 -0400
Subject: sunrpc: keep a count of swapfiles associated with the rpc_clnt

Jerome reported seeing a warning pop when working with a swapfile on
NFS. The nfs_swap_activate can end up calling sk_set_memalloc while
holding the rcu_read_lock and that function can sleep.

To fix that, we need to take a reference to the xprt while holding the
rcu_read_lock, set the socket up for swapping and then drop that
reference. But, xprt_put is not exported and having NFS deal with the
underlying xprt is a bit of layering violation anyway.

Fix this by adding a set of activate/deactivate functions that take a
rpc_clnt pointer instead of an rpc_xprt, and have nfs_swap_activate and
nfs_swap_deactivate call those.

Also, add a per-rpc_clnt atomic counter to keep track of the number of
active swapfiles associated with it. When the counter does a 0->1
transition, we enable swapping on the xprt, when we do a 1->0 transition
we disable swapping on it.

This also allows us to be a bit more selective with the RPC_TASK_SWAPPER
flag. If non-swapper and swapper clnts are sharing a xprt, then we only
need to flag the tasks from the swapper clnt with that flag.

Acked-by: Mel Gorman <mgorman@suse.de>
Reported-by: Jerome Marchand <jmarchan@redhat.com>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/file.c                | 15 ++--------
 include/linux/sunrpc/clnt.h  |  1 +
 include/linux/sunrpc/sched.h | 16 +++++++++++
 net/sunrpc/clnt.c            | 67 ++++++++++++++++++++++++++++++++++++++------
 net/sunrpc/xprtsock.c        |  3 +-
 5 files changed, 78 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 8b8d83a526ce..cc4fa1ed61fc 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -555,31 +555,22 @@ static int nfs_launder_page(struct page *page)
 	return nfs_wb_page(inode, page);
 }
 
-#ifdef CONFIG_NFS_SWAP
 static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
 						sector_t *span)
 {
-	int ret;
 	struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
 
 	*span = sis->pages;
 
-	rcu_read_lock();
-	ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
-	rcu_read_unlock();
-
-	return ret;
+	return rpc_clnt_swap_activate(clnt);
 }
 
 static void nfs_swap_deactivate(struct file *file)
 {
 	struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
 
-	rcu_read_lock();
-	xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
-	rcu_read_unlock();
+	rpc_clnt_swap_deactivate(clnt);
 }
-#endif
 
 const struct address_space_operations nfs_file_aops = {
 	.readpage = nfs_readpage,
@@ -596,10 +587,8 @@ const struct address_space_operations nfs_file_aops = {
 	.launder_page = nfs_launder_page,
 	.is_dirty_writeback = nfs_check_dirty_writeback,
 	.error_remove_page = generic_error_remove_page,
-#ifdef CONFIG_NFS_SWAP
 	.swap_activate = nfs_swap_activate,
 	.swap_deactivate = nfs_swap_deactivate,
-#endif
 };
 
 /*
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 598ba80ec30c..131032f15cc1 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -56,6 +56,7 @@ struct rpc_clnt {
 	struct rpc_rtt *	cl_rtt;		/* RTO estimator data */
 	const struct rpc_timeout *cl_timeout;	/* Timeout strategy */
 
+	atomic_t		cl_swapper;	/* swapfile count */
 	int			cl_nodelen;	/* nodename length */
 	char 			cl_nodename[UNX_MAXNODENAME+1];
 	struct rpc_pipe_dir_head cl_pipedir_objects;
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 2aa68976a482..d703f0ef37d8 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -268,4 +268,20 @@ static inline void rpc_assign_waitqueue_name(struct rpc_wait_queue *q,
 }
 #endif
 
+#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
+int rpc_clnt_swap_activate(struct rpc_clnt *clnt);
+void rpc_clnt_swap_deactivate(struct rpc_clnt *clnt);
+#else
+static inline int
+rpc_clnt_swap_activate(struct rpc_clnt *clnt)
+{
+	return -EINVAL;
+}
+
+static inline void
+rpc_clnt_swap_deactivate(struct rpc_clnt *clnt)
+{
+}
+#endif /* CONFIG_SUNRPC_SWAP */
+
 #endif /* _LINUX_SUNRPC_SCHED_H_ */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index f6717170480e..0ba65156a62b 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -891,15 +891,8 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
 			task->tk_flags |= RPC_TASK_SOFT;
 		if (clnt->cl_noretranstimeo)
 			task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT;
-		if (sk_memalloc_socks()) {
-			struct rpc_xprt *xprt;
-
-			rcu_read_lock();
-			xprt = rcu_dereference(clnt->cl_xprt);
-			if (xprt->swapper)
-				task->tk_flags |= RPC_TASK_SWAPPER;
-			rcu_read_unlock();
-		}
+		if (atomic_read(&clnt->cl_swapper))
+			task->tk_flags |= RPC_TASK_SWAPPER;
 		/* Add to the client's list of all tasks */
 		spin_lock(&clnt->cl_lock);
 		list_add_tail(&task->tk_task, &clnt->cl_tasks);
@@ -2479,3 +2472,59 @@ void rpc_show_tasks(struct net *net)
 	spin_unlock(&sn->rpc_client_lock);
 }
 #endif
+
+#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
+int
+rpc_clnt_swap_activate(struct rpc_clnt *clnt)
+{
+	int ret = 0;
+	struct rpc_xprt	*xprt;
+
+	if (atomic_inc_return(&clnt->cl_swapper) == 1) {
+retry:
+		rcu_read_lock();
+		xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+		rcu_read_unlock();
+		if (!xprt) {
+			/*
+			 * If we didn't get a reference, then we likely are
+			 * racing with a migration event. Wait for a grace
+			 * period and try again.
+			 */
+			synchronize_rcu();
+			goto retry;
+		}
+
+		ret = xs_swapper(xprt, 1);
+		xprt_put(xprt);
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_swap_activate);
+
+void
+rpc_clnt_swap_deactivate(struct rpc_clnt *clnt)
+{
+	struct rpc_xprt	*xprt;
+
+	if (atomic_dec_if_positive(&clnt->cl_swapper) == 0) {
+retry:
+		rcu_read_lock();
+		xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+		rcu_read_unlock();
+		if (!xprt) {
+			/*
+			 * If we didn't get a reference, then we likely are
+			 * racing with a migration event. Wait for a grace
+			 * period and try again.
+			 */
+			synchronize_rcu();
+			goto retry;
+		}
+
+		xs_swapper(xprt, 0);
+		xprt_put(xprt);
+	}
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_swap_deactivate);
+#endif /* CONFIG_SUNRPC_SWAP */
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index b142feef0cf4..f344c0f8974c 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1955,7 +1955,7 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 		msleep_interruptible(15000);
 }
 
-#ifdef CONFIG_SUNRPC_SWAP
+#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
 static void xs_set_memalloc(struct rpc_xprt *xprt)
 {
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
@@ -1987,7 +1987,6 @@ int xs_swapper(struct rpc_xprt *xprt, int enable)
 
 	return err;
 }
-EXPORT_SYMBOL_GPL(xs_swapper);
 #else
 static void xs_set_memalloc(struct rpc_xprt *xprt)
 {
-- 
cgit v1.2.3


From 8e2281330f9930bccf77cf04027ec60b6cc0fb34 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Wed, 3 Jun 2015 16:14:26 -0400
Subject: sunrpc: make xprt->swapper an atomic_t

Split xs_swapper into enable/disable functions and eliminate the
"enable" flag.

Currently, it's racy if you have multiple swapon/swapoff operations
running in parallel over the same xprt. Also fix it so that we only
set it to a memalloc socket on a 0->1 transition and only clear it
on a 1->0 transition.

Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/xprt.h |  5 +++--
 net/sunrpc/clnt.c           |  4 ++--
 net/sunrpc/xprtsock.c       | 38 +++++++++++++++++++++++++-------------
 3 files changed, 30 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index bc9c39d6d30d..9a3308703c15 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -180,7 +180,7 @@ struct rpc_xprt {
 	atomic_t		num_reqs;	/* total slots */
 	unsigned long		state;		/* transport state */
 	unsigned char		resvport   : 1; /* use a reserved port */
-	unsigned int		swapper;	/* we're swapping over this
+	atomic_t		swapper;	/* we're swapping over this
 						   transport */
 	unsigned int		bind_index;	/* bind function index */
 
@@ -346,7 +346,8 @@ void			xprt_release_rqst_cong(struct rpc_task *task);
 void			xprt_disconnect_done(struct rpc_xprt *xprt);
 void			xprt_force_disconnect(struct rpc_xprt *xprt);
 void			xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
-int			xs_swapper(struct rpc_xprt *xprt, int enable);
+int			xs_swapper_enable(struct rpc_xprt *xprt);
+void			xs_swapper_disable(struct rpc_xprt *xprt);
 
 bool			xprt_lock_connect(struct rpc_xprt *, struct rpc_task *, void *);
 void			xprt_unlock_connect(struct rpc_xprt *, void *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 0ba65156a62b..801044aeeedd 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2495,7 +2495,7 @@ retry:
 			goto retry;
 		}
 
-		ret = xs_swapper(xprt, 1);
+		ret = xs_swapper_enable(xprt);
 		xprt_put(xprt);
 	}
 	return ret;
@@ -2522,7 +2522,7 @@ retry:
 			goto retry;
 		}
 
-		xs_swapper(xprt, 0);
+		xs_swapper_disable(xprt);
 		xprt_put(xprt);
 	}
 }
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index f344c0f8974c..6538e6fbb7ba 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1961,31 +1961,43 @@ static void xs_set_memalloc(struct rpc_xprt *xprt)
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
 			xprt);
 
-	if (xprt->swapper)
+	if (atomic_read(&xprt->swapper))
 		sk_set_memalloc(transport->inet);
 }
 
 /**
- * xs_swapper - Tag this transport as being used for swap.
+ * xs_swapper_enable - Tag this transport as being used for swap.
  * @xprt: transport to tag
- * @enable: enable/disable
  *
+ * Take a reference to this transport on behalf of the rpc_clnt, and
+ * optionally mark it for swapping if it wasn't already.
  */
-int xs_swapper(struct rpc_xprt *xprt, int enable)
+int
+xs_swapper_enable(struct rpc_xprt *xprt)
 {
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
 			xprt);
-	int err = 0;
 
-	if (enable) {
-		xprt->swapper++;
-		xs_set_memalloc(xprt);
-	} else if (xprt->swapper) {
-		xprt->swapper--;
-		sk_clear_memalloc(transport->inet);
-	}
+	if (atomic_inc_return(&xprt->swapper) == 1)
+		sk_set_memalloc(transport->inet);
+	return 0;
+}
 
-	return err;
+/**
+ * xs_swapper_disable - Untag this transport as being used for swap.
+ * @xprt: transport to tag
+ *
+ * Drop a "swapper" reference to this xprt on behalf of the rpc_clnt. If the
+ * swapper refcount goes to 0, untag the socket as a memalloc socket.
+ */
+void
+xs_swapper_disable(struct rpc_xprt *xprt)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
+			xprt);
+
+	if (atomic_dec_and_test(&xprt->swapper))
+		sk_clear_memalloc(transport->inet);
 }
 #else
 static void xs_set_memalloc(struct rpc_xprt *xprt)
-- 
cgit v1.2.3


From d67fa4d85a2143b46052b2e9ccc6749a4c97b2de Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Wed, 3 Jun 2015 16:14:29 -0400
Subject: sunrpc: turn swapper_enable/disable functions into rpc_xprt_ops

RDMA xprts don't have a sock_xprt, but an rdma_xprt, so the
xs_swapper_enable/disable functions will likely oops when fed an RDMA
xprt. Turn these functions into rpc_xprt_ops so that that doesn't
occur. For now the RDMA versions are no-ops that just return -EINVAL
on an attempt to swapon.

Cc: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/xprt.h     | 16 ++++++++++++++--
 net/sunrpc/clnt.c               |  4 ++--
 net/sunrpc/xprtrdma/transport.c | 15 ++++++++++++++-
 net/sunrpc/xprtsock.c           | 31 +++++++++++++++++++++++++------
 4 files changed, 55 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 9a3308703c15..b6096592b1d4 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -133,6 +133,8 @@ struct rpc_xprt_ops {
 	void		(*close)(struct rpc_xprt *xprt);
 	void		(*destroy)(struct rpc_xprt *xprt);
 	void		(*print_stats)(struct rpc_xprt *xprt, struct seq_file *seq);
+	int		(*enable_swap)(struct rpc_xprt *xprt);
+	void		(*disable_swap)(struct rpc_xprt *xprt);
 };
 
 /*
@@ -328,6 +330,18 @@ static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 *
 	return p + xprt->tsh_size;
 }
 
+static inline int
+xprt_enable_swap(struct rpc_xprt *xprt)
+{
+	return xprt->ops->enable_swap(xprt);
+}
+
+static inline void
+xprt_disable_swap(struct rpc_xprt *xprt)
+{
+	xprt->ops->disable_swap(xprt);
+}
+
 /*
  * Transport switch helper functions
  */
@@ -346,8 +360,6 @@ void			xprt_release_rqst_cong(struct rpc_task *task);
 void			xprt_disconnect_done(struct rpc_xprt *xprt);
 void			xprt_force_disconnect(struct rpc_xprt *xprt);
 void			xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
-int			xs_swapper_enable(struct rpc_xprt *xprt);
-void			xs_swapper_disable(struct rpc_xprt *xprt);
 
 bool			xprt_lock_connect(struct rpc_xprt *, struct rpc_task *, void *);
 void			xprt_unlock_connect(struct rpc_xprt *, void *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 801044aeeedd..576d6ae39f25 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -2495,7 +2495,7 @@ retry:
 			goto retry;
 		}
 
-		ret = xs_swapper_enable(xprt);
+		ret = xprt_enable_swap(xprt);
 		xprt_put(xprt);
 	}
 	return ret;
@@ -2522,7 +2522,7 @@ retry:
 			goto retry;
 		}
 
-		xs_swapper_disable(xprt);
+		xprt_disable_swap(xprt);
 		xprt_put(xprt);
 	}
 }
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 54f23b1be986..ebf6fe759f0e 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -682,6 +682,17 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
 	   r_xprt->rx_stats.bad_reply_count);
 }
 
+static int
+xprt_rdma_enable_swap(struct rpc_xprt *xprt)
+{
+	return -EINVAL;
+}
+
+static void
+xprt_rdma_disable_swap(struct rpc_xprt *xprt)
+{
+}
+
 /*
  * Plumbing for rpc transport switch and kernel module
  */
@@ -700,7 +711,9 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
 	.send_request		= xprt_rdma_send_request,
 	.close			= xprt_rdma_close,
 	.destroy		= xprt_rdma_destroy,
-	.print_stats		= xprt_rdma_print_stats
+	.print_stats		= xprt_rdma_print_stats,
+	.enable_swap		= xprt_rdma_enable_swap,
+	.disable_swap		= xprt_rdma_disable_swap,
 };
 
 static struct xprt_class xprt_rdma = {
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index e0efd8514886..bf20726d4ab5 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1980,14 +1980,14 @@ static void xs_set_memalloc(struct rpc_xprt *xprt)
 }
 
 /**
- * xs_swapper_enable - Tag this transport as being used for swap.
+ * xs_enable_swap - Tag this transport as being used for swap.
  * @xprt: transport to tag
  *
  * Take a reference to this transport on behalf of the rpc_clnt, and
  * optionally mark it for swapping if it wasn't already.
  */
-int
-xs_swapper_enable(struct rpc_xprt *xprt)
+static int
+xs_enable_swap(struct rpc_xprt *xprt)
 {
 	struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt);
 
@@ -2002,14 +2002,14 @@ xs_swapper_enable(struct rpc_xprt *xprt)
 }
 
 /**
- * xs_swapper_disable - Untag this transport as being used for swap.
+ * xs_disable_swap - Untag this transport as being used for swap.
  * @xprt: transport to tag
  *
  * Drop a "swapper" reference to this xprt on behalf of the rpc_clnt. If the
  * swapper refcount goes to 0, untag the socket as a memalloc socket.
  */
-void
-xs_swapper_disable(struct rpc_xprt *xprt)
+static void
+xs_disable_swap(struct rpc_xprt *xprt)
 {
 	struct sock_xprt *xs = container_of(xprt, struct sock_xprt, xprt);
 
@@ -2025,6 +2025,17 @@ xs_swapper_disable(struct rpc_xprt *xprt)
 static void xs_set_memalloc(struct rpc_xprt *xprt)
 {
 }
+
+static int
+xs_enable_swap(struct rpc_xprt *xprt)
+{
+	return -EINVAL;
+}
+
+static void
+xs_disable_swap(struct rpc_xprt *xprt)
+{
+}
 #endif
 
 static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
@@ -2488,6 +2499,8 @@ static struct rpc_xprt_ops xs_local_ops = {
 	.close			= xs_close,
 	.destroy		= xs_destroy,
 	.print_stats		= xs_local_print_stats,
+	.enable_swap		= xs_enable_swap,
+	.disable_swap		= xs_disable_swap,
 };
 
 static struct rpc_xprt_ops xs_udp_ops = {
@@ -2507,6 +2520,8 @@ static struct rpc_xprt_ops xs_udp_ops = {
 	.close			= xs_close,
 	.destroy		= xs_destroy,
 	.print_stats		= xs_udp_print_stats,
+	.enable_swap		= xs_enable_swap,
+	.disable_swap		= xs_disable_swap,
 };
 
 static struct rpc_xprt_ops xs_tcp_ops = {
@@ -2523,6 +2538,8 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 	.close			= xs_tcp_shutdown,
 	.destroy		= xs_destroy,
 	.print_stats		= xs_tcp_print_stats,
+	.enable_swap		= xs_enable_swap,
+	.disable_swap		= xs_disable_swap,
 };
 
 /*
@@ -2540,6 +2557,8 @@ static struct rpc_xprt_ops bc_tcp_ops = {
 	.close			= bc_close,
 	.destroy		= bc_destroy,
 	.print_stats		= xs_tcp_print_stats,
+	.enable_swap		= xs_enable_swap,
+	.disable_swap		= xs_disable_swap,
 };
 
 static int xs_init_anyaddr(const int family, struct sockaddr *sap)
-- 
cgit v1.2.3


From 11598b8ff2b97cf034d0c025cf125c92b574bafc Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@netapp.com>
Date: Wed, 10 Jun 2015 16:54:28 -0400
Subject: NFS: Remove unused nfs_rw_ops->rw_release() function

This was only ever set to nfs_writeback_release_common(), a function
which is completely empty.  Let's just drop this function pointer and
simplify the code a bit.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/pagelist.c        | 2 --
 fs/nfs/write.c           | 6 ------
 include/linux/nfs_page.h | 1 -
 3 files changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 282b39369510..e9953ab1ac40 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -690,8 +690,6 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
 static void nfs_pgio_release(void *calldata)
 {
 	struct nfs_pgio_header *hdr = calldata;
-	if (hdr->rw_ops->rw_release)
-		hdr->rw_ops->rw_release(hdr);
 	nfs_pgio_data_destroy(hdr);
 	hdr->completion_ops->completion(hdr);
 }
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index dfc19f1575a1..d0f1f210342e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1347,11 +1347,6 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
 	NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
 }
 
-static void nfs_writeback_release_common(struct nfs_pgio_header *hdr)
-{
-	/* do nothing! */
-}
-
 /*
  * Special version of should_remove_suid() that ignores capabilities.
  */
@@ -2012,7 +2007,6 @@ static const struct nfs_rw_ops nfs_rw_write_ops = {
 	.rw_mode		= FMODE_WRITE,
 	.rw_alloc_header	= nfs_writehdr_alloc,
 	.rw_free_header		= nfs_writehdr_free,
-	.rw_release		= nfs_writeback_release_common,
 	.rw_done		= nfs_writeback_done,
 	.rw_result		= nfs_writeback_result,
 	.rw_initiate		= nfs_initiate_write,
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 3eb072dbce83..f2f650f136ee 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -67,7 +67,6 @@ struct nfs_rw_ops {
 	const fmode_t rw_mode;
 	struct nfs_pgio_header *(*rw_alloc_header)(void);
 	void (*rw_free_header)(struct nfs_pgio_header *);
-	void (*rw_release)(struct nfs_pgio_header *);
 	int  (*rw_done)(struct rpc_task *, struct nfs_pgio_header *,
 			struct inode *);
 	void (*rw_result)(struct rpc_task *, struct nfs_pgio_header *);
-- 
cgit v1.2.3


From 4a06825839889cc1756d0dd8a52d6b1071ee0263 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 11 May 2015 14:02:25 -0400
Subject: SUNRPC: Transport fault injection

It has been exceptionally useful to exercise the logic that handles
local immediate errors and RDMA connection loss.  To enable
developers to test this regularly and repeatably, add logic to
simulate connection loss every so often.

Fault injection is disabled by default. It is enabled with

  $ sudo echo xxx > /sys/kernel/debug/sunrpc/inject_fault/disconnect

where "xxx" is a large positive number of transport method calls
before a disconnect. A value of several thousand is usually a good
number that allows reasonable forward progress while still causing a
lot of connection drops.

These hooks are disabled when SUNRPC_DEBUG is turned off.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/sunrpc/xprt.h     | 19 ++++++++++
 net/sunrpc/clnt.c               |  1 +
 net/sunrpc/debugfs.c            | 77 +++++++++++++++++++++++++++++++++++++++++
 net/sunrpc/xprt.c               |  2 ++
 net/sunrpc/xprtrdma/transport.c | 11 ++++++
 net/sunrpc/xprtsock.c           | 10 ++++++
 6 files changed, 120 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index b6096592b1d4..0fb9acbb4780 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -135,6 +135,7 @@ struct rpc_xprt_ops {
 	void		(*print_stats)(struct rpc_xprt *xprt, struct seq_file *seq);
 	int		(*enable_swap)(struct rpc_xprt *xprt);
 	void		(*disable_swap)(struct rpc_xprt *xprt);
+	void		(*inject_disconnect)(struct rpc_xprt *xprt);
 };
 
 /*
@@ -244,6 +245,7 @@ struct rpc_xprt {
 	const char		*address_strings[RPC_DISPLAY_MAX];
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 	struct dentry		*debugfs;		/* debugfs directory */
+	atomic_t		inject_disconnect;
 #endif
 };
 
@@ -445,6 +447,23 @@ static inline int xprt_test_and_set_binding(struct rpc_xprt *xprt)
 	return test_and_set_bit(XPRT_BINDING, &xprt->state);
 }
 
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+extern unsigned int rpc_inject_disconnect;
+static inline void xprt_inject_disconnect(struct rpc_xprt *xprt)
+{
+	if (!rpc_inject_disconnect)
+		return;
+	if (atomic_dec_return(&xprt->inject_disconnect))
+		return;
+	atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect);
+	xprt->ops->inject_disconnect(xprt);
+}
+#else
+static inline void xprt_inject_disconnect(struct rpc_xprt *xprt)
+{
+}
+#endif
+
 #endif /* __KERNEL__*/
 
 #endif /* _LINUX_SUNRPC_XPRT_H */
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 576d6ae39f25..f41ed882ed3c 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1605,6 +1605,7 @@ call_allocate(struct rpc_task *task)
 					req->rq_callsize + req->rq_rcvsize);
 	if (req->rq_buffer != NULL)
 		return;
+	xprt_inject_disconnect(xprt);
 
 	dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
 
diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
index 82962f7e6e88..7cc1b8a6ef6d 100644
--- a/net/sunrpc/debugfs.c
+++ b/net/sunrpc/debugfs.c
@@ -10,9 +10,12 @@
 #include "netns.h"
 
 static struct dentry *topdir;
+static struct dentry *rpc_fault_dir;
 static struct dentry *rpc_clnt_dir;
 static struct dentry *rpc_xprt_dir;
 
+unsigned int rpc_inject_disconnect;
+
 struct rpc_clnt_iter {
 	struct rpc_clnt	*clnt;
 	loff_t		pos;
@@ -257,6 +260,8 @@ rpc_xprt_debugfs_register(struct rpc_xprt *xprt)
 		debugfs_remove_recursive(xprt->debugfs);
 		xprt->debugfs = NULL;
 	}
+
+	atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect);
 }
 
 void
@@ -266,11 +271,78 @@ rpc_xprt_debugfs_unregister(struct rpc_xprt *xprt)
 	xprt->debugfs = NULL;
 }
 
+static int
+fault_open(struct inode *inode, struct file *filp)
+{
+	filp->private_data = kmalloc(128, GFP_KERNEL);
+	if (!filp->private_data)
+		return -ENOMEM;
+	return 0;
+}
+
+static int
+fault_release(struct inode *inode, struct file *filp)
+{
+	kfree(filp->private_data);
+	return 0;
+}
+
+static ssize_t
+fault_disconnect_read(struct file *filp, char __user *user_buf,
+		      size_t len, loff_t *offset)
+{
+	char *buffer = (char *)filp->private_data;
+	size_t size;
+
+	size = sprintf(buffer, "%u\n", rpc_inject_disconnect);
+	return simple_read_from_buffer(user_buf, len, offset, buffer, size);
+}
+
+static ssize_t
+fault_disconnect_write(struct file *filp, const char __user *user_buf,
+		       size_t len, loff_t *offset)
+{
+	char buffer[16];
+
+	len = min(len, sizeof(buffer) - 1);
+	if (copy_from_user(buffer, user_buf, len))
+		return -EFAULT;
+	buffer[len] = '\0';
+	if (kstrtouint(buffer, 10, &rpc_inject_disconnect))
+		return -EINVAL;
+	return len;
+}
+
+static const struct file_operations fault_disconnect_fops = {
+	.owner		= THIS_MODULE,
+	.open		= fault_open,
+	.read		= fault_disconnect_read,
+	.write		= fault_disconnect_write,
+	.release	= fault_release,
+};
+
+static struct dentry *
+inject_fault_dir(struct dentry *topdir)
+{
+	struct dentry *faultdir;
+
+	faultdir = debugfs_create_dir("inject_fault", topdir);
+	if (!faultdir)
+		return NULL;
+
+	if (!debugfs_create_file("disconnect", S_IFREG | S_IRUSR, faultdir,
+				 NULL, &fault_disconnect_fops))
+		return NULL;
+
+	return faultdir;
+}
+
 void __exit
 sunrpc_debugfs_exit(void)
 {
 	debugfs_remove_recursive(topdir);
 	topdir = NULL;
+	rpc_fault_dir = NULL;
 	rpc_clnt_dir = NULL;
 	rpc_xprt_dir = NULL;
 }
@@ -282,6 +354,10 @@ sunrpc_debugfs_init(void)
 	if (!topdir)
 		return;
 
+	rpc_fault_dir = inject_fault_dir(topdir);
+	if (!rpc_fault_dir)
+		goto out_remove;
+
 	rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir);
 	if (!rpc_clnt_dir)
 		goto out_remove;
@@ -294,5 +370,6 @@ sunrpc_debugfs_init(void)
 out_remove:
 	debugfs_remove_recursive(topdir);
 	topdir = NULL;
+	rpc_fault_dir = NULL;
 	rpc_clnt_dir = NULL;
 }
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 1d4fe24af06a..e1fb538e10e0 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -967,6 +967,7 @@ void xprt_transmit(struct rpc_task *task)
 		task->tk_status = status;
 		return;
 	}
+	xprt_inject_disconnect(xprt);
 
 	dprintk("RPC: %5u xmit complete\n", task->tk_pid);
 	task->tk_flags |= RPC_TASK_SENT;
@@ -1285,6 +1286,7 @@ void xprt_release(struct rpc_task *task)
 	spin_unlock_bh(&xprt->transport_lock);
 	if (req->rq_buffer)
 		xprt->ops->buf_free(req->rq_buffer);
+	xprt_inject_disconnect(xprt);
 	if (req->rq_cred != NULL)
 		put_rpccred(req->rq_cred);
 	task->tk_rqstp = NULL;
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index ebf6fe759f0e..b8aac23b1b0c 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -246,6 +246,16 @@ xprt_rdma_connect_worker(struct work_struct *work)
 	xprt_clear_connecting(xprt);
 }
 
+static void
+xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
+{
+	struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt,
+						   rx_xprt);
+
+	pr_info("rpcrdma: injecting transport disconnect on xprt=%p\n", xprt);
+	rdma_disconnect(r_xprt->rx_ia.ri_id);
+}
+
 /*
  * xprt_rdma_destroy
  *
@@ -714,6 +724,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
 	.print_stats		= xprt_rdma_print_stats,
 	.enable_swap		= xprt_rdma_enable_swap,
 	.disable_swap		= xprt_rdma_disable_swap,
+	.inject_disconnect	= xprt_rdma_inject_disconnect
 };
 
 static struct xprt_class xprt_rdma = {
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index bf20726d4ab5..fda8ec8c74c0 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -866,6 +866,13 @@ static void xs_close(struct rpc_xprt *xprt)
 	xprt_disconnect_done(xprt);
 }
 
+static void xs_inject_disconnect(struct rpc_xprt *xprt)
+{
+	dprintk("RPC:       injecting transport disconnect on xprt=%p\n",
+		xprt);
+	xprt_disconnect_done(xprt);
+}
+
 static void xs_xprt_free(struct rpc_xprt *xprt)
 {
 	xs_free_peer_addresses(xprt);
@@ -2522,6 +2529,7 @@ static struct rpc_xprt_ops xs_udp_ops = {
 	.print_stats		= xs_udp_print_stats,
 	.enable_swap		= xs_enable_swap,
 	.disable_swap		= xs_disable_swap,
+	.inject_disconnect	= xs_inject_disconnect,
 };
 
 static struct rpc_xprt_ops xs_tcp_ops = {
@@ -2540,6 +2548,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
 	.print_stats		= xs_tcp_print_stats,
 	.enable_swap		= xs_enable_swap,
 	.disable_swap		= xs_disable_swap,
+	.inject_disconnect	= xs_inject_disconnect,
 };
 
 /*
@@ -2559,6 +2568,7 @@ static struct rpc_xprt_ops bc_tcp_ops = {
 	.print_stats		= xs_tcp_print_stats,
 	.enable_swap		= xs_enable_swap,
 	.disable_swap		= xs_disable_swap,
+	.inject_disconnect	= xs_inject_disconnect,
 };
 
 static int xs_init_anyaddr(const int family, struct sockaddr *sap)
-- 
cgit v1.2.3


From 7e53df111beea8db2543424d07bdee2a630698c3 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 26 May 2015 11:53:04 -0400
Subject: xprtrdma: Remove rpcrdma_ia::ri_memreg_strategy

Clean up: This field is no longer used.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Reviewed-by: Steve Wise <swise@opengridcomputing.com>
Reviewed-by: Sagi Grimberg <sagig@mellanox.com>
Reviewed-by: Devesh Sharma <devesh.sharma@avagotech.com>
Tested-By: Devesh Sharma <devesh.sharma@avagotech.com>
Reviewed-by: Doug Ledford <dledford@redhat.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
---
 include/linux/sunrpc/xprtrdma.h | 3 ++-
 net/sunrpc/xprtrdma/verbs.c     | 3 ---
 net/sunrpc/xprtrdma/xprt_rdma.h | 1 -
 3 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index c984c85981ea..b17613052cc3 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -56,7 +56,8 @@
 
 #define RPCRDMA_INLINE_PAD_THRESH  (512)/* payload threshold to pad (bytes) */
 
-/* memory registration strategies */
+/* Memory registration strategies, by number.
+ * This is part of a kernel / user space API. Do not remove. */
 enum rpcrdma_memreg {
 	RPCRDMA_BOUNCEBUFFERS = 0,
 	RPCRDMA_REGISTER,
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index db9303a6a145..cc1a52609974 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -665,9 +665,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
 	dprintk("RPC:       %s: memory registration strategy is '%s'\n",
 		__func__, ia->ri_ops->ro_displayname);
 
-	/* Else will do memory reg/dereg for each chunk */
-	ia->ri_memreg_strategy = memreg;
-
 	rwlock_init(&ia->ri_qplock);
 	return 0;
 
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index f19376d35750..3ecee38bf1a0 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -70,7 +70,6 @@ struct rpcrdma_ia {
 	int			ri_have_dma_lkey;
 	struct completion	ri_done;
 	int			ri_async_rc;
-	enum rpcrdma_memreg	ri_memreg_strategy;
 	unsigned int		ri_max_frmr_depth;
 	struct ib_device_attr	ri_devattr;
 	struct ib_qp_attr	ri_qp_attr;
-- 
cgit v1.2.3


From 764ad8ba8cd4c6f836fca9378f8c5121aece0842 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Tue, 9 Jun 2015 19:43:56 -0400
Subject: nfs: increase size of EXCHANGE_ID name string buffer

The current buffer is much too small if you have a relatively long
hostname. Bring it up to the size of the one that SETCLIENTID has.

Cc: <stable@vger.kernel.org>
Reported-by: Michael Skralivetsky <michael.skralivetsky@primarydata.com>
Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 include/linux/nfs_xdr.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 93ab6071bbe9..e9e9a8dcfb47 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1142,7 +1142,7 @@ struct nfs41_state_protection {
 	struct nfs4_op_map allow;
 };
 
-#define NFS4_EXCHANGE_ID_LEN	(48)
+#define NFS4_EXCHANGE_ID_LEN	(127)
 struct nfs41_exchange_id_args {
 	struct nfs_client		*client;
 	nfs4_verifier			*verifier;
-- 
cgit v1.2.3


From 3a6bb738792500e8b4534c0350c13a132bac0492 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Tue, 9 Jun 2015 19:43:57 -0400
Subject: nfs: convert setclientid and exchange_id encoders to use
 clp->cl_owner_id

...instead of buffers that are part of their arg structs. We already
hold a reference to the client, so we might as well use the allocated
buffer. In the event that we can't allocate the clp->cl_owner_id, then
just return -ENOMEM.

Note too that we switch from a GFP_KERNEL allocation here to GFP_NOFS.
It's possible we could end up trying to do a SETCLIENTID or EXCHANGE_ID
in order to reclaim some memory, and the GFP_KERNEL allocations in the
existing code could cause recursion back into NFS reclaim.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c       | 21 ++++++++++++++++-----
 fs/nfs/nfs4xdr.c        |  8 +++++---
 include/linux/nfs_xdr.h |  2 +-
 3 files changed, 22 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d689ea37be84..b2afe9556147 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5046,7 +5046,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 	struct nfs4_setclientid setclientid = {
 		.sc_verifier = &sc_verifier,
 		.sc_prog = program,
-		.sc_cb_ident = clp->cl_cb_ident,
+		.sc_clnt = clp,
 	};
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID],
@@ -5076,6 +5076,12 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 				nfs4_init_nonuniform_client_string(clp,
 						setclientid.sc_name,
 						sizeof(setclientid.sc_name));
+
+	if (!clp->cl_owner_id) {
+		status = -ENOMEM;
+		goto out;
+	}
+
 	/* cb_client4 */
 	setclientid.sc_netid_len =
 				nfs4_init_callback_netid(clp,
@@ -5085,9 +5091,9 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 				sizeof(setclientid.sc_uaddr), "%s.%u.%u",
 				clp->cl_ipaddr, port >> 8, port & 255);
 
-	dprintk("NFS call  setclientid auth=%s, '%.*s'\n",
+	dprintk("NFS call  setclientid auth=%s, '%s'\n",
 		clp->cl_rpcclient->cl_auth->au_ops->au_name,
-		setclientid.sc_name_len, setclientid.sc_name);
+		clp->cl_owner_id);
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task)) {
 		status = PTR_ERR(task);
@@ -6850,9 +6856,14 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 	nfs4_init_boot_verifier(clp, &verifier);
 	args.id_len = nfs4_init_uniform_client_string(clp, args.id,
 							sizeof(args.id));
-	dprintk("NFS call  exchange_id auth=%s, '%.*s'\n",
+	if (!clp->cl_owner_id) {
+		status = -ENOMEM;
+		goto out;
+	}
+
+	dprintk("NFS call  exchange_id auth=%s, '%s'\n",
 		clp->cl_rpcclient->cl_auth->au_ops->au_name,
-		args.id_len, args.id);
+		clp->cl_owner_id);
 
 	res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
 					GFP_NOFS);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 0aea97841d30..f58e8a979397 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1667,13 +1667,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
 	encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr);
 	encode_nfs4_verifier(xdr, setclientid->sc_verifier);
 
-	encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
+	encode_string(xdr, strlen(setclientid->sc_clnt->cl_owner_id),
+			setclientid->sc_clnt->cl_owner_id);
 	p = reserve_space(xdr, 4);
 	*p = cpu_to_be32(setclientid->sc_prog);
 	encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
 	encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
 	p = reserve_space(xdr, 4);
-	*p = cpu_to_be32(setclientid->sc_cb_ident);
+	*p = cpu_to_be32(setclientid->sc_clnt->cl_cb_ident);
 }
 
 static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr)
@@ -1747,7 +1748,8 @@ static void encode_exchange_id(struct xdr_stream *xdr,
 	encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr);
 	encode_nfs4_verifier(xdr, args->verifier);
 
-	encode_string(xdr, args->id_len, args->id);
+	encode_string(xdr, strlen(args->client->cl_owner_id),
+			args->client->cl_owner_id);
 
 	encode_uint32(xdr, args->flags);
 	encode_uint32(xdr, args->state_protect.how);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index e9e9a8dcfb47..6c0b423d781b 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -994,7 +994,7 @@ struct nfs4_setclientid {
 	char				sc_netid[RPCBIND_MAXNETIDLEN + 1];
 	unsigned int			sc_uaddr_len;
 	char				sc_uaddr[RPCBIND_MAXUADDRLEN + 1];
-	u32				sc_cb_ident;
+	struct nfs_client		*sc_clnt;
 	struct rpc_cred			*sc_cred;
 };
 
-- 
cgit v1.2.3


From 873e385116b2cc5c7daca8f51881371fadb90970 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Tue, 9 Jun 2015 19:44:00 -0400
Subject: nfs: make nfs4_init_uniform_client_string use a dynamically allocated
 buffer

Change the uniform client string generator to dynamically allocate the
NFSv4 client name string buffer. With this patch, we can eliminate the
buffers that are embedded within the "args" structs and simply use the
name string that is hanging off the client.

This uniform string case is a little simpler than the nonuniform since
we don't need to deal with RCU, but we do have two different cases,
depending on whether there is a uniquifier or not.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c       | 112 +++++++++++++++++++++++++++++++++---------------
 include/linux/nfs_xdr.h |   6 ---
 2 files changed, 77 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index c20322636a68..d181090124db 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5007,28 +5007,78 @@ retry:
 	return 0;
 }
 
-static unsigned int
-nfs4_init_uniform_client_string(struct nfs_client *clp,
-				char *buf, size_t len)
+static int
+nfs4_init_uniquifier_client_string(struct nfs_client *clp)
 {
-	const char *nodename = clp->cl_rpcclient->cl_nodename;
-	unsigned int result;
+	int result;
+	size_t len;
+	char *str;
+
+	len = 10 + 10 + 1 + 10 + 1 +
+		strlen(nfs4_client_id_uniquifier) + 1 +
+		strlen(clp->cl_rpcclient->cl_nodename) + 1;
+
+	if (len > NFS4_OPAQUE_LIMIT + 1)
+		return -EINVAL;
+
+	/*
+	 * Since this string is allocated at mount time, and held until the
+	 * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+	 * about a memory-reclaim deadlock.
+	 */
+	str = kmalloc(len, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
+
+	result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
+			clp->rpc_ops->version, clp->cl_minorversion,
+			nfs4_client_id_uniquifier,
+			clp->cl_rpcclient->cl_nodename);
+	if (result >= len) {
+		kfree(str);
+		return -EINVAL;
+	}
+	clp->cl_owner_id = str;
+	return 0;
+}
+
+static int
+nfs4_init_uniform_client_string(struct nfs_client *clp)
+{
+	int result;
+	size_t len;
+	char *str;
 
 	if (clp->cl_owner_id != NULL)
-		return strlcpy(buf, clp->cl_owner_id, len);
+		return 0;
 
 	if (nfs4_client_id_uniquifier[0] != '\0')
-		result = scnprintf(buf, len, "Linux NFSv%u.%u %s/%s",
-				clp->rpc_ops->version,
-				clp->cl_minorversion,
-				nfs4_client_id_uniquifier,
-				nodename);
-	else
-		result = scnprintf(buf, len, "Linux NFSv%u.%u %s",
-				clp->rpc_ops->version, clp->cl_minorversion,
-				nodename);
-	clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
-	return result;
+		return nfs4_init_uniquifier_client_string(clp);
+
+	len = 10 + 10 + 1 + 10 + 1 +
+		strlen(clp->cl_rpcclient->cl_nodename) + 1;
+
+	if (len > NFS4_OPAQUE_LIMIT + 1)
+		return -EINVAL;
+
+	/*
+	 * Since this string is allocated at mount time, and held until the
+	 * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying
+	 * about a memory-reclaim deadlock.
+	 */
+	str = kmalloc(len, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
+
+	result = scnprintf(str, len, "Linux NFSv%u.%u %s",
+			clp->rpc_ops->version, clp->cl_minorversion,
+			clp->cl_rpcclient->cl_nodename);
+	if (result >= len) {
+		kfree(str);
+		return -EINVAL;
+	}
+	clp->cl_owner_id = str;
+	return 0;
 }
 
 /*
@@ -5095,20 +5145,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
 
 	/* nfs_client_id4 */
 	nfs4_init_boot_verifier(clp, &sc_verifier);
-	if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags)) {
-		setclientid.sc_name_len =
-				nfs4_init_uniform_client_string(clp,
-						setclientid.sc_name,
-						sizeof(setclientid.sc_name));
-		if (!clp->cl_owner_id) {
-			status = -ENOMEM;
-			goto out;
-		}
-	} else {
+
+	if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags))
+		status = nfs4_init_uniform_client_string(clp);
+	else
 		status = nfs4_init_nonuniform_client_string(clp);
-		if (status)
-			goto out;
-	}
+
+	if (status)
+		goto out;
 
 	/* cb_client4 */
 	setclientid.sc_netid_len =
@@ -6882,12 +6926,10 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 	};
 
 	nfs4_init_boot_verifier(clp, &verifier);
-	args.id_len = nfs4_init_uniform_client_string(clp, args.id,
-							sizeof(args.id));
-	if (!clp->cl_owner_id) {
-		status = -ENOMEM;
+
+	status = nfs4_init_uniform_client_string(clp);
+	if (status)
 		goto out;
-	}
 
 	dprintk("NFS call  exchange_id auth=%s, '%s'\n",
 		clp->cl_rpcclient->cl_auth->au_ops->au_name,
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 6c0b423d781b..c959e7eb5bd4 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -984,11 +984,8 @@ struct nfs4_readlink_res {
 	struct nfs4_sequence_res	seq_res;
 };
 
-#define NFS4_SETCLIENTID_NAMELEN	(127)
 struct nfs4_setclientid {
 	const nfs4_verifier *		sc_verifier;
-	unsigned int			sc_name_len;
-	char				sc_name[NFS4_SETCLIENTID_NAMELEN + 1];
 	u32				sc_prog;
 	unsigned int			sc_netid_len;
 	char				sc_netid[RPCBIND_MAXNETIDLEN + 1];
@@ -1142,12 +1139,9 @@ struct nfs41_state_protection {
 	struct nfs4_op_map allow;
 };
 
-#define NFS4_EXCHANGE_ID_LEN	(127)
 struct nfs41_exchange_id_args {
 	struct nfs_client		*client;
 	nfs4_verifier			*verifier;
-	unsigned int 			id_len;
-	char 				id[NFS4_EXCHANGE_ID_LEN];
 	u32				flags;
 	struct nfs41_state_protection	state_protect;
 };
-- 
cgit v1.2.3


From fec47d863587c272d6fbf4e50066209c953d5e60 Mon Sep 17 00:00:00 2001
From: Dave Gerlach <d-gerlach@ti.com>
Date: Fri, 22 May 2015 15:45:27 -0500
Subject: remoteproc: introduce rproc_get_by_phandle API

Allow users of remoteproc the ability to get a handle to an rproc by
passing a phandle supplied in the user's device tree node. This is
useful in situations that require manual booting of the rproc.

This patch uses the code removed by commit 40e575b1d0b3 ("remoteproc:
remove the get_by_name/put API") for the ref counting but is modified
to use a simple list and locking mechanism and has rproc_get_by_name
replaced with an rproc_get_by_phandle API.

Signed-off-by: Dave Gerlach <d-gerlach@ti.com>
Signed-off-by: Suman Anna <s-anna@ti.com>
[fix order of Signed-off-by tags]
Signed-off-by: Ohad Ben-Cohen <ohad@wizery.com>
---
 Documentation/remoteproc.txt         |  6 +++++
 drivers/remoteproc/remoteproc_core.c | 50 ++++++++++++++++++++++++++++++++++++
 include/linux/remoteproc.h           |  7 ++---
 3 files changed, 60 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/remoteproc.txt b/Documentation/remoteproc.txt
index e6469fdcf89a..ef0219fa4bb4 100644
--- a/Documentation/remoteproc.txt
+++ b/Documentation/remoteproc.txt
@@ -51,6 +51,12 @@ cost.
         rproc_shutdown() returns, and users can still use it with a subsequent
         rproc_boot(), if needed.
 
+  struct rproc *rproc_get_by_phandle(phandle phandle)
+    - Find an rproc handle using a device tree phandle. Returns the rproc
+      handle on success, and NULL on failure. This function increments
+      the remote processor's refcount, so always use rproc_put() to
+      decrement it back once rproc isn't needed anymore.
+
 3. Typical usage
 
 #include <linux/remoteproc.h>
diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index e991512e04ed..e1a6d693b8bb 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -44,6 +44,9 @@
 
 #include "remoteproc_internal.h"
 
+static DEFINE_MUTEX(rproc_list_mutex);
+static LIST_HEAD(rproc_list);
+
 typedef int (*rproc_handle_resources_t)(struct rproc *rproc,
 				struct resource_table *table, int len);
 typedef int (*rproc_handle_resource_t)(struct rproc *rproc,
@@ -1144,6 +1147,43 @@ out:
 }
 EXPORT_SYMBOL(rproc_shutdown);
 
+/**
+ * rproc_get_by_phandle() - find a remote processor by phandle
+ * @phandle: phandle to the rproc
+ *
+ * Finds an rproc handle using the remote processor's phandle, and then
+ * return a handle to the rproc.
+ *
+ * This function increments the remote processor's refcount, so always
+ * use rproc_put() to decrement it back once rproc isn't needed anymore.
+ *
+ * Returns the rproc handle on success, and NULL on failure.
+ */
+struct rproc *rproc_get_by_phandle(phandle phandle)
+{
+	struct rproc *rproc = NULL, *r;
+	struct device_node *np;
+
+	np = of_find_node_by_phandle(phandle);
+	if (!np)
+		return NULL;
+
+	mutex_lock(&rproc_list_mutex);
+	list_for_each_entry(r, &rproc_list, node) {
+		if (r->dev.parent && r->dev.parent->of_node == np) {
+			rproc = r;
+			get_device(&rproc->dev);
+			break;
+		}
+	}
+	mutex_unlock(&rproc_list_mutex);
+
+	of_node_put(np);
+
+	return rproc;
+}
+EXPORT_SYMBOL(rproc_get_by_phandle);
+
 /**
  * rproc_add() - register a remote processor
  * @rproc: the remote processor handle to register
@@ -1173,6 +1213,11 @@ int rproc_add(struct rproc *rproc)
 	if (ret < 0)
 		return ret;
 
+	/* expose to rproc_get_by_phandle users */
+	mutex_lock(&rproc_list_mutex);
+	list_add(&rproc->node, &rproc_list);
+	mutex_unlock(&rproc_list_mutex);
+
 	dev_info(dev, "%s is available\n", rproc->name);
 
 	dev_info(dev, "Note: remoteproc is still under development and considered experimental.\n");
@@ -1360,6 +1405,11 @@ int rproc_del(struct rproc *rproc)
 	/* Free the copy of the resource table */
 	kfree(rproc->cached_table);
 
+	/* the rproc is downref'ed as soon as it's removed from the klist */
+	mutex_lock(&rproc_list_mutex);
+	list_del(&rproc->node);
+	mutex_unlock(&rproc_list_mutex);
+
 	device_del(&rproc->dev);
 
 	return 0;
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index 78b8a9b9d40a..56739e5df1e6 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -36,11 +36,11 @@
 #define REMOTEPROC_H
 
 #include <linux/types.h>
-#include <linux/klist.h>
 #include <linux/mutex.h>
 #include <linux/virtio.h>
 #include <linux/completion.h>
 #include <linux/idr.h>
+#include <linux/of.h>
 
 /**
  * struct resource_table - firmware resource table header
@@ -375,7 +375,7 @@ enum rproc_crash_type {
 
 /**
  * struct rproc - represents a physical remote processor device
- * @node: klist node of this rproc object
+ * @node: list node of this rproc object
  * @domain: iommu domain
  * @name: human readable name of the rproc
  * @firmware: name of firmware file to be loaded
@@ -407,7 +407,7 @@ enum rproc_crash_type {
  * @has_iommu: flag to indicate if remote processor is behind an MMU
  */
 struct rproc {
-	struct klist_node node;
+	struct list_head node;
 	struct iommu_domain *domain;
 	const char *name;
 	const char *firmware;
@@ -481,6 +481,7 @@ struct rproc_vdev {
 	u32 rsc_offset;
 };
 
+struct rproc *rproc_get_by_phandle(phandle phandle);
 struct rproc *rproc_alloc(struct device *dev, const char *name,
 				const struct rproc_ops *ops,
 				const char *firmware, int len);
-- 
cgit v1.2.3


From a01f7cd657c95941079d548ef7fbf0cc370c1259 Mon Sep 17 00:00:00 2001
From: Suman Anna <s-anna@ti.com>
Date: Fri, 22 May 2015 15:45:28 -0500
Subject: remoteproc: add a rproc ops for performing address translation

The rproc_da_to_va API is currently used to perform any device to
kernel address translations to meet the different needs of the remoteproc
core/drivers (eg: loading). The functionality is achieved within the
remoteproc core, and is limited only for carveouts allocated within the
core.

A new rproc ops, da_to_va, is added to provide flexibility to platform
implementations to perform the address translation themselves when the
above conditions cannot be met by the implementations. The rproc_da_to_va()
API is extended to invoke this ops if present, and fallback to regular
processing if the platform implementation cannot provide the translation.
This will allow any remoteproc implementations to translate addresses for
dedicated memories like internal memories.

While at this, also update the rproc_da_to_va() documentation since it
is an exported function.

Signed-off-by: Suman Anna <s-anna@ti.com>
Signed-off-by: Dave Gerlach <d-gerlach@ti.com>
Signed-off-by: Ohad Ben-Cohen <ohad@wizery.com>
---
 drivers/remoteproc/remoteproc_core.c | 31 +++++++++++++++++++++++++------
 include/linux/remoteproc.h           |  2 ++
 2 files changed, 27 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index e1a6d693b8bb..b4ab38015cf2 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -137,28 +137,46 @@ static void rproc_disable_iommu(struct rproc *rproc)
 	iommu_domain_free(domain);
 }
 
-/*
+/**
+ * rproc_da_to_va() - lookup the kernel virtual address for a remoteproc address
+ * @rproc: handle of a remote processor
+ * @da: remoteproc device address to translate
+ * @len: length of the memory region @da is pointing to
+ *
  * Some remote processors will ask us to allocate them physically contiguous
  * memory regions (which we call "carveouts"), and map them to specific
- * device addresses (which are hardcoded in the firmware).
+ * device addresses (which are hardcoded in the firmware). They may also have
+ * dedicated memory regions internal to the processors, and use them either
+ * exclusively or alongside carveouts.
  *
  * They may then ask us to copy objects into specific device addresses (e.g.
  * code/data sections) or expose us certain symbols in other device address
  * (e.g. their trace buffer).
  *
- * This function is an internal helper with which we can go over the allocated
- * carveouts and translate specific device address to kernel virtual addresses
- * so we can access the referenced memory.
+ * This function is a helper function with which we can go over the allocated
+ * carveouts and translate specific device addresses to kernel virtual addresses
+ * so we can access the referenced memory. This function also allows to perform
+ * translations on the internal remoteproc memory regions through a platform
+ * implementation specific da_to_va ops, if present.
+ *
+ * The function returns a valid kernel address on success or NULL on failure.
  *
  * Note: phys_to_virt(iommu_iova_to_phys(rproc->domain, da)) will work too,
  * but only on kernel direct mapped RAM memory. Instead, we're just using
- * here the output of the DMA API, which should be more correct.
+ * here the output of the DMA API for the carveouts, which should be more
+ * correct.
  */
 void *rproc_da_to_va(struct rproc *rproc, u64 da, int len)
 {
 	struct rproc_mem_entry *carveout;
 	void *ptr = NULL;
 
+	if (rproc->ops->da_to_va) {
+		ptr = rproc->ops->da_to_va(rproc, da, len);
+		if (ptr)
+			goto out;
+	}
+
 	list_for_each_entry(carveout, &rproc->carveouts, node) {
 		int offset = da - carveout->da;
 
@@ -175,6 +193,7 @@ void *rproc_da_to_va(struct rproc *rproc, u64 da, int len)
 		break;
 	}
 
+out:
 	return ptr;
 }
 EXPORT_SYMBOL(rproc_da_to_va);
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index 56739e5df1e6..9c4e1384f636 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -330,11 +330,13 @@ struct rproc;
  * @start:	power on the device and boot it
  * @stop:	power off the device
  * @kick:	kick a virtqueue (virtqueue id given as a parameter)
+ * @da_to_va:	optional platform hook to perform address translations
  */
 struct rproc_ops {
 	int (*start)(struct rproc *rproc);
 	int (*stop)(struct rproc *rproc);
 	void (*kick)(struct rproc *rproc, int vqid);
+	void * (*da_to_va)(struct rproc *rproc, u64 da, int len);
 };
 
 /**
-- 
cgit v1.2.3


From a01bc0d5f557bd8becd0ba75d09c39192004697e Mon Sep 17 00:00:00 2001
From: Dave Gerlach <d-gerlach@ti.com>
Date: Fri, 22 May 2015 15:45:30 -0500
Subject: remoteproc/wkup_m3: add a remoteproc driver for TI Wakeup M3

Add a remoteproc driver to load the firmware and boot a small
Wakeup M3 processor present on TI AM33xx and AM43xx SoCs. This
Wakeup M3 remote processor is an integrated Cortex M3 that allows
the SoC to enter the lowest possible power state by taking control
from the MPU after it has gone into its own low power state and
shutting off any additional peripherals.

The Wakeup M3 processor has two internal memory regions - 16 kB of
unified instruction memory called UMEM used to store executable
code, and 8 kB of data memory called DMEM used for all data sections.
The Wakeup M3 processor executes its code entirely from within the
UMEM and uses the DMEM for any data. It does not use any external
memory or any other external resources. The device address view has
the UMEM at address 0x0 and DMEM at address 0x80000, and these are
computed automatically within the driver based on relative address
calculation from the corresponding device tree IOMEM resources.
These device addresses are used to aid the core remoteproc ELF
loader code to properly translate and load the firmware segments
through the .rproc_da_to_va ops.

Signed-off-by: Dave Gerlach <d-gerlach@ti.com>
Signed-off-by: Suman Anna <s-anna@ti.com>
Signed-off-by: Ohad Ben-Cohen <ohad@wizery.com>
---
 drivers/remoteproc/Kconfig            |  13 ++
 drivers/remoteproc/Makefile           |   1 +
 drivers/remoteproc/wkup_m3_rproc.c    | 257 ++++++++++++++++++++++++++++++++++
 include/linux/platform_data/wkup_m3.h |  30 ++++
 4 files changed, 301 insertions(+)
 create mode 100644 drivers/remoteproc/wkup_m3_rproc.c
 create mode 100644 include/linux/platform_data/wkup_m3.h

(limited to 'include/linux')

diff --git a/drivers/remoteproc/Kconfig b/drivers/remoteproc/Kconfig
index 5e343bab9458..28c711f0ac6b 100644
--- a/drivers/remoteproc/Kconfig
+++ b/drivers/remoteproc/Kconfig
@@ -41,6 +41,19 @@ config STE_MODEM_RPROC
 	  This can be either built-in or a loadable module.
 	  If unsure say N.
 
+config WKUP_M3_RPROC
+	tristate "AMx3xx Wakeup M3 remoteproc support"
+	depends on SOC_AM33XX || SOC_AM43XX
+	select REMOTEPROC
+	help
+	  Say y here to support Wakeup M3 remote processor on TI AM33xx
+	  and AM43xx family of SoCs.
+
+	  Required for Suspend-to-RAM on AM33xx and AM43xx SoCs. Also needed
+	  for deep CPUIdle states on AM33xx SoCs. Allows for loading of the
+	  firmware onto these remote processors.
+	  If unsure say N.
+
 config DA8XX_REMOTEPROC
 	tristate "DA8xx/OMAP-L13x remoteproc support"
 	depends on ARCH_DAVINCI_DA8XX
diff --git a/drivers/remoteproc/Makefile b/drivers/remoteproc/Makefile
index ac2ff75686d2..81b04d1e2e58 100644
--- a/drivers/remoteproc/Makefile
+++ b/drivers/remoteproc/Makefile
@@ -9,4 +9,5 @@ remoteproc-y				+= remoteproc_virtio.o
 remoteproc-y				+= remoteproc_elf_loader.o
 obj-$(CONFIG_OMAP_REMOTEPROC)		+= omap_remoteproc.o
 obj-$(CONFIG_STE_MODEM_RPROC)	 	+= ste_modem_rproc.o
+obj-$(CONFIG_WKUP_M3_RPROC)		+= wkup_m3_rproc.o
 obj-$(CONFIG_DA8XX_REMOTEPROC)		+= da8xx_remoteproc.o
diff --git a/drivers/remoteproc/wkup_m3_rproc.c b/drivers/remoteproc/wkup_m3_rproc.c
new file mode 100644
index 000000000000..edf81819cce1
--- /dev/null
+++ b/drivers/remoteproc/wkup_m3_rproc.c
@@ -0,0 +1,257 @@
+/*
+ * TI AMx3 Wakeup M3 Remote Processor driver
+ *
+ * Copyright (C) 2014-2015 Texas Instruments, Inc.
+ *
+ * Dave Gerlach <d-gerlach@ti.com>
+ * Suman Anna <s-anna@ti.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of_device.h>
+#include <linux/of_address.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
+#include <linux/remoteproc.h>
+
+#include <linux/platform_data/wkup_m3.h>
+
+#include "remoteproc_internal.h"
+
+#define WKUPM3_MEM_MAX	2
+
+/**
+ * struct wkup_m3_mem - WkupM3 internal memory structure
+ * @cpu_addr: MPU virtual address of the memory region
+ * @bus_addr: Bus address used to access the memory region
+ * @dev_addr: Device address from Wakeup M3 view
+ * @size: Size of the memory region
+ */
+struct wkup_m3_mem {
+	void __iomem *cpu_addr;
+	phys_addr_t bus_addr;
+	u32 dev_addr;
+	size_t size;
+};
+
+/**
+ * struct wkup_m3_rproc - WkupM3 remote processor state
+ * @rproc: rproc handle
+ * @pdev: pointer to platform device
+ * @mem: WkupM3 memory information
+ */
+struct wkup_m3_rproc {
+	struct rproc *rproc;
+	struct platform_device *pdev;
+	struct wkup_m3_mem mem[WKUPM3_MEM_MAX];
+};
+
+static int wkup_m3_rproc_start(struct rproc *rproc)
+{
+	struct wkup_m3_rproc *wkupm3 = rproc->priv;
+	struct platform_device *pdev = wkupm3->pdev;
+	struct device *dev = &pdev->dev;
+	struct wkup_m3_platform_data *pdata = dev_get_platdata(dev);
+
+	if (pdata->deassert_reset(pdev, pdata->reset_name)) {
+		dev_err(dev, "Unable to reset wkup_m3!\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+static int wkup_m3_rproc_stop(struct rproc *rproc)
+{
+	struct wkup_m3_rproc *wkupm3 = rproc->priv;
+	struct platform_device *pdev = wkupm3->pdev;
+	struct device *dev = &pdev->dev;
+	struct wkup_m3_platform_data *pdata = dev_get_platdata(dev);
+
+	if (pdata->assert_reset(pdev, pdata->reset_name)) {
+		dev_err(dev, "Unable to assert reset of wkup_m3!\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+static void *wkup_m3_rproc_da_to_va(struct rproc *rproc, u64 da, int len)
+{
+	struct wkup_m3_rproc *wkupm3 = rproc->priv;
+	void *va = NULL;
+	int i;
+	u32 offset;
+
+	if (len <= 0)
+		return NULL;
+
+	for (i = 0; i < WKUPM3_MEM_MAX; i++) {
+		if (da >= wkupm3->mem[i].dev_addr && da + len <=
+		    wkupm3->mem[i].dev_addr +  wkupm3->mem[i].size) {
+			offset = da -  wkupm3->mem[i].dev_addr;
+			/* __force to make sparse happy with type conversion */
+			va = (__force void *)(wkupm3->mem[i].cpu_addr + offset);
+			break;
+		}
+	}
+
+	return va;
+}
+
+static struct rproc_ops wkup_m3_rproc_ops = {
+	.start		= wkup_m3_rproc_start,
+	.stop		= wkup_m3_rproc_stop,
+	.da_to_va	= wkup_m3_rproc_da_to_va,
+};
+
+static const struct of_device_id wkup_m3_rproc_of_match[] = {
+	{ .compatible = "ti,am3352-wkup-m3", },
+	{ .compatible = "ti,am4372-wkup-m3", },
+	{},
+};
+
+static int wkup_m3_rproc_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct wkup_m3_platform_data *pdata = dev->platform_data;
+	/* umem always needs to be processed first */
+	const char *mem_names[WKUPM3_MEM_MAX] = { "umem", "dmem" };
+	struct wkup_m3_rproc *wkupm3;
+	const char *fw_name;
+	struct rproc *rproc;
+	struct resource *res;
+	const __be32 *addrp;
+	u32 l4_offset = 0;
+	u64 size;
+	int ret;
+	int i;
+
+	if (!(pdata && pdata->deassert_reset && pdata->assert_reset &&
+	      pdata->reset_name)) {
+		dev_err(dev, "Platform data missing!\n");
+		return -ENODEV;
+	}
+
+	ret = of_property_read_string(dev->of_node, "ti,pm-firmware",
+				      &fw_name);
+	if (ret) {
+		dev_err(dev, "No firmware filename given\n");
+		return -ENODEV;
+	}
+
+	pm_runtime_enable(&pdev->dev);
+	ret = pm_runtime_get_sync(&pdev->dev);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "pm_runtime_get_sync() failed\n");
+		goto err;
+	}
+
+	rproc = rproc_alloc(dev, "wkup_m3", &wkup_m3_rproc_ops,
+			    fw_name, sizeof(*wkupm3));
+	if (!rproc) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	wkupm3 = rproc->priv;
+	wkupm3->rproc = rproc;
+	wkupm3->pdev = pdev;
+
+	for (i = 0; i < ARRAY_SIZE(mem_names); i++) {
+		res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
+						   mem_names[i]);
+		wkupm3->mem[i].cpu_addr = devm_ioremap_resource(dev, res);
+		if (IS_ERR(wkupm3->mem[i].cpu_addr)) {
+			dev_err(&pdev->dev, "devm_ioremap_resource failed for resource %d\n",
+				i);
+			ret = PTR_ERR(wkupm3->mem[i].cpu_addr);
+			goto err;
+		}
+		wkupm3->mem[i].bus_addr = res->start;
+		wkupm3->mem[i].size = resource_size(res);
+		addrp = of_get_address(dev->of_node, i, &size, NULL);
+		/*
+		 * The wkupm3 has umem at address 0 in its view, so the device
+		 * addresses for each memory region is computed as a relative
+		 * offset of the bus address for umem, and therefore needs to be
+		 * processed first.
+		 */
+		if (!strcmp(mem_names[i], "umem"))
+			l4_offset = be32_to_cpu(*addrp);
+		wkupm3->mem[i].dev_addr = be32_to_cpu(*addrp) - l4_offset;
+	}
+
+	dev_set_drvdata(dev, rproc);
+
+	ret = rproc_add(rproc);
+	if (ret) {
+		dev_err(dev, "rproc_add failed\n");
+		goto err_put_rproc;
+	}
+
+	return 0;
+
+err_put_rproc:
+	rproc_put(rproc);
+err:
+	pm_runtime_put_noidle(dev);
+	pm_runtime_disable(dev);
+	return ret;
+}
+
+static int wkup_m3_rproc_remove(struct platform_device *pdev)
+{
+	struct rproc *rproc = platform_get_drvdata(pdev);
+
+	rproc_del(rproc);
+	rproc_put(rproc);
+	pm_runtime_put_sync(&pdev->dev);
+	pm_runtime_disable(&pdev->dev);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM
+static int wkup_m3_rpm_suspend(struct device *dev)
+{
+	return -EBUSY;
+}
+
+static int wkup_m3_rpm_resume(struct device *dev)
+{
+	return 0;
+}
+#endif
+
+static const struct dev_pm_ops wkup_m3_rproc_pm_ops = {
+	SET_RUNTIME_PM_OPS(wkup_m3_rpm_suspend, wkup_m3_rpm_resume, NULL)
+};
+
+static struct platform_driver wkup_m3_rproc_driver = {
+	.probe = wkup_m3_rproc_probe,
+	.remove = wkup_m3_rproc_remove,
+	.driver = {
+		.name = "wkup_m3_rproc",
+		.of_match_table = wkup_m3_rproc_of_match,
+		.pm = &wkup_m3_rproc_pm_ops,
+	},
+};
+
+module_platform_driver(wkup_m3_rproc_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("TI Wakeup M3 remote processor control driver");
+MODULE_AUTHOR("Dave Gerlach <d-gerlach@ti.com>");
diff --git a/include/linux/platform_data/wkup_m3.h b/include/linux/platform_data/wkup_m3.h
new file mode 100644
index 000000000000..3f1d77effd71
--- /dev/null
+++ b/include/linux/platform_data/wkup_m3.h
@@ -0,0 +1,30 @@
+/*
+ * TI Wakeup M3 remote processor platform data
+ *
+ * Copyright (C) 2014-2015 Texas Instruments, Inc.
+ *
+ * Dave Gerlach <d-gerlach@ti.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _LINUX_PLATFORM_DATA_WKUP_M3_H
+#define _LINUX_PLATFORM_DATA_WKUP_M3_H
+
+struct platform_device;
+
+struct wkup_m3_platform_data {
+	const char *reset_name;
+
+	int (*assert_reset)(struct platform_device *pdev, const char *name);
+	int (*deassert_reset)(struct platform_device *pdev, const char *name);
+};
+
+#endif /* _LINUX_PLATFORM_DATA_WKUP_M3_H */
-- 
cgit v1.2.3


From 4bacc9c9234c7c8eec44f5ed4e960d9f96fa0f01 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 18 Jun 2015 14:32:31 +0100
Subject: overlayfs: Make f_path always point to the overlay and f_inode to the
 underlay

Make file->f_path always point to the overlay dentry so that the path in
/proc/pid/fd is correct and to ensure that label-based LSMs have access to the
overlay as well as the underlay (path-based LSMs probably don't need it).

Using my union testsuite to set things up, before the patch I see:

	[root@andromeda union-testsuite]# bash 5</mnt/a/foo107
	[root@andromeda union-testsuite]# ls -l /proc/$$/fd/
	...
	lr-x------. 1 root root 64 Jun  5 14:38 5 -> /a/foo107
	[root@andromeda union-testsuite]# stat /mnt/a/foo107
	...
	Device: 23h/35d Inode: 13381       Links: 1
	...
	[root@andromeda union-testsuite]# stat -L /proc/$$/fd/5
	...
	Device: 23h/35d Inode: 13381       Links: 1
	...

After the patch:

	[root@andromeda union-testsuite]# bash 5</mnt/a/foo107
	[root@andromeda union-testsuite]# ls -l /proc/$$/fd/
	...
	lr-x------. 1 root root 64 Jun  5 14:22 5 -> /mnt/a/foo107
	[root@andromeda union-testsuite]# stat /mnt/a/foo107
	...
	Device: 23h/35d Inode: 40346       Links: 1
	...
	[root@andromeda union-testsuite]# stat -L /proc/$$/fd/5
	...
	Device: 23h/35d Inode: 40346       Links: 1
	...

Note the change in where /proc/$$/fd/5 points to in the ls command.  It was
pointing to /a/foo107 (which doesn't exist) and now points to /mnt/a/foo107
(which is correct).

The inode accessed, however, is the lower layer.  The union layer is on device
25h/37d and the upper layer on 24h/36d.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c              |  5 ++++-
 fs/internal.h            |  1 +
 fs/open.c                | 49 +++++++++++++++++++++++++-----------------------
 fs/overlayfs/inode.c     | 14 ++++++--------
 fs/overlayfs/overlayfs.h |  1 +
 fs/overlayfs/super.c     |  1 +
 include/linux/dcache.h   |  2 ++
 include/linux/fs.h       |  2 --
 8 files changed, 41 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 37b5afdaf698..c4ce35110704 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1673,7 +1673,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
 				DCACHE_OP_COMPARE	|
 				DCACHE_OP_REVALIDATE	|
 				DCACHE_OP_WEAK_REVALIDATE	|
-				DCACHE_OP_DELETE ));
+				DCACHE_OP_DELETE	|
+				DCACHE_OP_SELECT_INODE));
 	dentry->d_op = op;
 	if (!op)
 		return;
@@ -1689,6 +1690,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
 		dentry->d_flags |= DCACHE_OP_DELETE;
 	if (op->d_prune)
 		dentry->d_flags |= DCACHE_OP_PRUNE;
+	if (op->d_select_inode)
+		dentry->d_flags |= DCACHE_OP_SELECT_INODE;
 
 }
 EXPORT_SYMBOL(d_set_d_op);
diff --git a/fs/internal.h b/fs/internal.h
index 01dce1d1476b..4d5af583ab03 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -107,6 +107,7 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
 extern long do_handle_open(int mountdirfd,
 			   struct file_handle __user *ufh, int open_flag);
 extern int open_check_o_direct(struct file *f);
+extern int vfs_open(const struct path *, struct file *, const struct cred *);
 
 /*
  * inode.c
diff --git a/fs/open.c b/fs/open.c
index e0250bdcc440..b1c5823b7f11 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -678,18 +678,18 @@ int open_check_o_direct(struct file *f)
 }
 
 static int do_dentry_open(struct file *f,
+			  struct inode *inode,
 			  int (*open)(struct inode *, struct file *),
 			  const struct cred *cred)
 {
 	static const struct file_operations empty_fops = {};
-	struct inode *inode;
 	int error;
 
 	f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
 				FMODE_PREAD | FMODE_PWRITE;
 
 	path_get(&f->f_path);
-	inode = f->f_inode = f->f_path.dentry->d_inode;
+	f->f_inode = inode;
 	f->f_mapping = inode->i_mapping;
 
 	if (unlikely(f->f_flags & O_PATH)) {
@@ -793,7 +793,8 @@ int finish_open(struct file *file, struct dentry *dentry,
 	BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
 
 	file->f_path.dentry = dentry;
-	error = do_dentry_open(file, open, current_cred());
+	error = do_dentry_open(file, d_backing_inode(dentry), open,
+			       current_cred());
 	if (!error)
 		*opened |= FILE_OPENED;
 
@@ -822,6 +823,28 @@ int finish_no_open(struct file *file, struct dentry *dentry)
 }
 EXPORT_SYMBOL(finish_no_open);
 
+/**
+ * vfs_open - open the file at the given path
+ * @path: path to open
+ * @file: newly allocated file with f_flag initialized
+ * @cred: credentials to use
+ */
+int vfs_open(const struct path *path, struct file *file,
+	     const struct cred *cred)
+{
+	struct dentry *dentry = path->dentry;
+	struct inode *inode = dentry->d_inode;
+
+	file->f_path = *path;
+	if (dentry->d_flags & DCACHE_OP_SELECT_INODE) {
+		inode = dentry->d_op->d_select_inode(dentry, file->f_flags);
+		if (IS_ERR(inode))
+			return PTR_ERR(inode);
+	}
+
+	return do_dentry_open(file, inode, NULL, cred);
+}
+
 struct file *dentry_open(const struct path *path, int flags,
 			 const struct cred *cred)
 {
@@ -853,26 +876,6 @@ struct file *dentry_open(const struct path *path, int flags,
 }
 EXPORT_SYMBOL(dentry_open);
 
-/**
- * vfs_open - open the file at the given path
- * @path: path to open
- * @filp: newly allocated file with f_flag initialized
- * @cred: credentials to use
- */
-int vfs_open(const struct path *path, struct file *filp,
-	     const struct cred *cred)
-{
-	struct inode *inode = path->dentry->d_inode;
-
-	if (inode->i_op->dentry_open)
-		return inode->i_op->dentry_open(path->dentry, filp, cred);
-	else {
-		filp->f_path = *path;
-		return do_dentry_open(filp, NULL, cred);
-	}
-}
-EXPORT_SYMBOL(vfs_open);
-
 static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
 {
 	int lookup_flags = 0;
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 21079d1ca2aa..f140e3dbfb7b 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -337,31 +337,30 @@ static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
 	return true;
 }
 
-static int ovl_dentry_open(struct dentry *dentry, struct file *file,
-		    const struct cred *cred)
+struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags)
 {
 	int err;
 	struct path realpath;
 	enum ovl_path_type type;
 
 	type = ovl_path_real(dentry, &realpath);
-	if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
+	if (ovl_open_need_copy_up(file_flags, type, realpath.dentry)) {
 		err = ovl_want_write(dentry);
 		if (err)
-			return err;
+			return ERR_PTR(err);
 
-		if (file->f_flags & O_TRUNC)
+		if (file_flags & O_TRUNC)
 			err = ovl_copy_up_last(dentry, NULL, true);
 		else
 			err = ovl_copy_up(dentry);
 		ovl_drop_write(dentry);
 		if (err)
-			return err;
+			return ERR_PTR(err);
 
 		ovl_path_upper(dentry, &realpath);
 	}
 
-	return vfs_open(&realpath, file, cred);
+	return d_backing_inode(realpath.dentry);
 }
 
 static const struct inode_operations ovl_file_inode_operations = {
@@ -372,7 +371,6 @@ static const struct inode_operations ovl_file_inode_operations = {
 	.getxattr	= ovl_getxattr,
 	.listxattr	= ovl_listxattr,
 	.removexattr	= ovl_removexattr,
-	.dentry_open	= ovl_dentry_open,
 };
 
 static const struct inode_operations ovl_symlink_inode_operations = {
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 17ac5afc9ffb..ea5a40b06e3a 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -173,6 +173,7 @@ ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
 		     void *value, size_t size);
 ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
 int ovl_removexattr(struct dentry *dentry, const char *name);
+struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags);
 
 struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
 			    struct ovl_entry *oe);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 5f0d1993e6e3..84c5e27fbfd9 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -275,6 +275,7 @@ static void ovl_dentry_release(struct dentry *dentry)
 
 static const struct dentry_operations ovl_dentry_operations = {
 	.d_release = ovl_dentry_release,
+	.d_select_inode = ovl_d_select_inode,
 };
 
 static struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index df334cbacc6d..167ec0934049 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -160,6 +160,7 @@ struct dentry_operations {
 	char *(*d_dname)(struct dentry *, char *, int);
 	struct vfsmount *(*d_automount)(struct path *);
 	int (*d_manage)(struct dentry *, bool);
+	struct inode *(*d_select_inode)(struct dentry *, unsigned);
 } ____cacheline_aligned;
 
 /*
@@ -225,6 +226,7 @@ struct dentry_operations {
 
 #define DCACHE_MAY_FREE			0x00800000
 #define DCACHE_FALLTHRU			0x01000000 /* Fall through to lower layer */
+#define DCACHE_OP_SELECT_INODE		0x02000000 /* Unioned entry: dcache op selects inode */
 
 extern seqlock_t rename_lock;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b577e801b4af..2bd77e10e8e5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1641,7 +1641,6 @@ struct inode_operations {
 	int (*set_acl)(struct inode *, struct posix_acl *, int);
 
 	/* WARNING: probably going away soon, do not use! */
-	int (*dentry_open)(struct dentry *, struct file *, const struct cred *);
 } ____cacheline_aligned;
 
 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
@@ -2194,7 +2193,6 @@ extern struct file *file_open_name(struct filename *, int, umode_t);
 extern struct file *filp_open(const char *, int, umode_t);
 extern struct file *file_open_root(struct dentry *, struct vfsmount *,
 				   const char *, int);
-extern int vfs_open(const struct path *, struct file *, const struct cred *);
 extern struct file * dentry_open(const struct path *, int, const struct cred *);
 extern int filp_close(struct file *, fl_owner_t id);
 
-- 
cgit v1.2.3


From 9bf39ab2adafd7cf8740859cb49e7b7952813a5d Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 19 Jun 2015 10:29:13 +0200
Subject: vfs: add file_path() helper

Turn
	d_path(&file->f_path, ...);
into
	file_path(file, ...);

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/arc/kernel/troubleshoot.c               | 10 +++-------
 arch/blackfin/kernel/trace.c                 |  2 +-
 arch/tile/kernel/stack.c                     |  2 +-
 arch/tile/mm/elf.c                           |  2 +-
 drivers/block/loop.c                         |  2 +-
 drivers/md/bitmap.c                          |  2 +-
 drivers/md/md.c                              |  2 +-
 drivers/usb/gadget/function/f_mass_storage.c |  2 +-
 drivers/usb/gadget/function/storage_common.c |  2 +-
 fs/binfmt_elf.c                              |  4 ++--
 fs/coredump.c                                |  2 +-
 fs/ext4/super.c                              |  2 +-
 fs/open.c                                    |  6 ++++++
 include/linux/fs.h                           |  2 ++
 kernel/events/core.c                         |  2 +-
 mm/memory.c                                  |  2 +-
 16 files changed, 25 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c
index e00a01879025..9f80c5adcb68 100644
--- a/arch/arc/kernel/troubleshoot.c
+++ b/arch/arc/kernel/troubleshoot.c
@@ -67,15 +67,12 @@ static void print_task_path_n_nm(struct task_struct *tsk, char *buf)
 	mmput(mm);
 
 	if (exe_file) {
-		path = exe_file->f_path;
-		path_get(&exe_file->f_path);
+		path_nm = file_path(exe_file, buf, 255);
 		fput(exe_file);
-		path_nm = d_path(&path, buf, 255);
-		path_put(&path);
 	}
 
 done:
-	pr_info("Path: %s\n", path_nm);
+	pr_info("Path: %s\n", !IS_ERR(path_nm) ? path_nm : "?");
 }
 
 static void show_faulting_vma(unsigned long address, char *buf)
@@ -99,8 +96,7 @@ static void show_faulting_vma(unsigned long address, char *buf)
 	if (vma && (vma->vm_start <= address)) {
 		struct file *file = vma->vm_file;
 		if (file) {
-			struct path *path = &file->f_path;
-			nm = d_path(path, buf, PAGE_SIZE - 1);
+			nm = file_path(file, buf, PAGE_SIZE - 1);
 			inode = file_inode(vma->vm_file);
 			dev = inode->i_sb->s_dev;
 			ino = inode->i_ino;
diff --git a/arch/blackfin/kernel/trace.c b/arch/blackfin/kernel/trace.c
index c36efa0c7163..719dd796c12c 100644
--- a/arch/blackfin/kernel/trace.c
+++ b/arch/blackfin/kernel/trace.c
@@ -136,7 +136,7 @@ void decode_address(char *buf, unsigned long address)
 				struct file *file = vma->vm_file;
 
 				if (file) {
-					char *d_name = d_path(&file->f_path, _tmpbuf,
+					char *d_name = file_path(file, _tmpbuf,
 						      sizeof(_tmpbuf));
 					if (!IS_ERR(d_name))
 						name = d_name;
diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c
index c42dce50acd8..8d62cf12c2c0 100644
--- a/arch/tile/kernel/stack.c
+++ b/arch/tile/kernel/stack.c
@@ -334,7 +334,7 @@ static void describe_addr(struct KBacktraceIterator *kbt,
 	}
 
 	if (vma->vm_file) {
-		p = d_path(&vma->vm_file->f_path, buf, bufsize);
+		p = file_path(vma->vm_file, buf, bufsize);
 		if (IS_ERR(p))
 			p = "?";
 		name = kbasename(p);
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index f7ddae3725a4..6225cc998db1 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -56,7 +56,7 @@ static int notify_exec(struct mm_struct *mm)
 	if (exe_file == NULL)
 		goto done_free;
 
-	path = d_path(&exe_file->f_path, buf, PAGE_SIZE);
+	path = file_path(exe_file, buf, PAGE_SIZE);
 	if (IS_ERR(path))
 		goto done_put;
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index d7173cb1ea76..0d8ad59413cd 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -568,7 +568,7 @@ static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
 
 	spin_lock_irq(&lo->lo_lock);
 	if (lo->lo_backing_file)
-		p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1);
+		p = file_path(lo->lo_backing_file, buf, PAGE_SIZE - 1);
 	spin_unlock_irq(&lo->lo_lock);
 
 	if (IS_ERR_OR_NULL(p))
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 2bc56e2a3526..dda33d648354 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -834,7 +834,7 @@ static void bitmap_file_kick(struct bitmap *bitmap)
 		if (bitmap->storage.file) {
 			path = kmalloc(PAGE_SIZE, GFP_KERNEL);
 			if (path)
-				ptr = d_path(&bitmap->storage.file->f_path,
+				ptr = file_path(bitmap->storage.file,
 					     path, PAGE_SIZE);
 
 			printk(KERN_ALERT
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 593a02476c78..e67f3ac137bf 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5741,7 +5741,7 @@ static int get_bitmap_file(struct mddev *mddev, void __user * arg)
 	/* bitmap disabled, zero the first byte and copy out */
 	if (!mddev->bitmap_info.file)
 		file->pathname[0] = '\0';
-	else if ((ptr = d_path(&mddev->bitmap_info.file->f_path,
+	else if ((ptr = file_path(mddev->bitmap_info.file,
 			       file->pathname, sizeof(file->pathname))),
 		 IS_ERR(ptr))
 		err = PTR_ERR(ptr);
diff --git a/drivers/usb/gadget/function/f_mass_storage.c b/drivers/usb/gadget/function/f_mass_storage.c
index 3cc109f3c9c8..d2259c663996 100644
--- a/drivers/usb/gadget/function/f_mass_storage.c
+++ b/drivers/usb/gadget/function/f_mass_storage.c
@@ -2936,7 +2936,7 @@ int fsg_common_create_lun(struct fsg_common *common, struct fsg_lun_config *cfg,
 	if (fsg_lun_is_open(lun)) {
 		p = "(error)";
 		if (pathbuf) {
-			p = d_path(&lun->filp->f_path, pathbuf, PATH_MAX);
+			p = file_path(lun->filp, pathbuf, PATH_MAX);
 			if (IS_ERR(p))
 				p = "(error)";
 		}
diff --git a/drivers/usb/gadget/function/storage_common.c b/drivers/usb/gadget/function/storage_common.c
index 648f9e489b39..d62683017cf3 100644
--- a/drivers/usb/gadget/function/storage_common.c
+++ b/drivers/usb/gadget/function/storage_common.c
@@ -341,7 +341,7 @@ ssize_t fsg_show_file(struct fsg_lun *curlun, struct rw_semaphore *filesem,
 
 	down_read(filesem);
 	if (fsg_lun_is_open(curlun)) {	/* Get the complete pathname */
-		p = d_path(&curlun->filp->f_path, buf, PAGE_SIZE - 1);
+		p = file_path(curlun->filp, buf, PAGE_SIZE - 1);
 		if (IS_ERR(p))
 			rc = PTR_ERR(p);
 		else {
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 241ef68d2893..5046b6247103 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1530,7 +1530,7 @@ static int fill_files_note(struct memelfnote *note)
 		file = vma->vm_file;
 		if (!file)
 			continue;
-		filename = d_path(&file->f_path, name_curpos, remaining);
+		filename = file_path(file, name_curpos, remaining);
 		if (IS_ERR(filename)) {
 			if (PTR_ERR(filename) == -ENAMETOOLONG) {
 				vfree(data);
@@ -1540,7 +1540,7 @@ static int fill_files_note(struct memelfnote *note)
 			continue;
 		}
 
-		/* d_path() fills at the end, move name down */
+		/* file_path() fills at the end, move name down */
 		/* n = strlen(filename) + 1: */
 		n = (name_curpos + remaining) - filename;
 		remaining = filename - name_curpos;
diff --git a/fs/coredump.c b/fs/coredump.c
index bbbe139ab280..5b771b36cc6e 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -138,7 +138,7 @@ static int cn_print_exe_file(struct core_name *cn)
 		goto put_exe_file;
 	}
 
-	path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
+	path = file_path(exe_file, pathbuf, PATH_MAX);
 	if (IS_ERR(path)) {
 		ret = PTR_ERR(path);
 		goto free_buf;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f06d0589ddba..0ae853d2e1f1 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -449,7 +449,7 @@ void __ext4_error_file(struct file *file, const char *function,
 	es = EXT4_SB(inode->i_sb)->s_es;
 	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
 	if (ext4_error_ratelimit(inode->i_sb)) {
-		path = d_path(&(file->f_path), pathname, sizeof(pathname));
+		path = file_path(file, pathname, sizeof(pathname));
 		if (IS_ERR(path))
 			path = "(unknown)";
 		va_start(args, fmt);
diff --git a/fs/open.c b/fs/open.c
index b1c5823b7f11..1dbc79358d59 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -823,6 +823,12 @@ int finish_no_open(struct file *file, struct dentry *dentry)
 }
 EXPORT_SYMBOL(finish_no_open);
 
+char *file_path(struct file *filp, char *buf, int buflen)
+{
+	return d_path(&filp->f_path, buf, buflen);
+}
+EXPORT_SYMBOL(file_path);
+
 /**
  * vfs_open - open the file at the given path
  * @path: path to open
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2bd77e10e8e5..2c135ad741a9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2500,6 +2500,8 @@ extern struct file * open_exec(const char *);
 extern int is_subdir(struct dentry *, struct dentry *);
 extern int path_is_under(struct path *, struct path *);
 
+extern char *file_path(struct file *, char *, int);
+
 #include <linux/err.h>
 
 /* needed for stackable file system support */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 81aa3a4ece9f..5c964e845483 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5791,7 +5791,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 		 * need to add enough zero bytes after the string to handle
 		 * the 64bit alignment we do later.
 		 */
-		name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
+		name = file_path(file, buf, PATH_MAX - sizeof(u64));
 		if (IS_ERR(name)) {
 			name = "//toolong";
 			goto cpy_name;
diff --git a/mm/memory.c b/mm/memory.c
index 22e037e3364e..28c10da1efbc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3724,7 +3724,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
 		if (buf) {
 			char *p;
 
-			p = d_path(&f->f_path, buf, PAGE_SIZE);
+			p = file_path(f, buf, PAGE_SIZE);
 			if (IS_ERR(p))
 				p = "?";
 			printk("%s%s[%lx+%lx]", prefix, kbasename(p),
-- 
cgit v1.2.3


From 2726d56620ce71f40dd583d51391b86e1ab8cc57 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 19 Jun 2015 10:30:28 +0200
Subject: vfs: add seq_file_path() helper

Turn
	seq_path(..., &file->f_path, ...);
into
	seq_file_path(..., file, ...);

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/md/bitmap.c      |  2 +-
 fs/proc/nommu.c          |  2 +-
 fs/proc/task_mmu.c       |  4 ++--
 fs/proc/task_nommu.c     |  2 +-
 fs/seq_file.c            | 14 ++++++++++++++
 include/linux/seq_file.h |  1 +
 mm/swapfile.c            |  2 +-
 7 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index dda33d648354..3813fdfee4be 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1922,7 +1922,7 @@ void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
 		   chunk_kb ? "KB" : "B");
 	if (bitmap->storage.file) {
 		seq_printf(seq, ", file: ");
-		seq_path(seq, &bitmap->storage.file->f_path, " \t\n");
+		seq_file_path(seq, bitmap->storage.file, " \t\n");
 	}
 
 	seq_printf(seq, "\n");
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index d4a35746cab9..f8595e8b5cd0 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -64,7 +64,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 
 	if (file) {
 		seq_pad(m, ' ');
-		seq_path(m, &file->f_path, "");
+		seq_file_path(m, file, "");
 	}
 
 	seq_putc(m, '\n');
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6dee68d013ff..ca1e091881d4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -310,7 +310,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 	 */
 	if (file) {
 		seq_pad(m, ' ');
-		seq_path(m, &file->f_path, "\n");
+		seq_file_path(m, file, "\n");
 		goto done;
 	}
 
@@ -1509,7 +1509,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 
 	if (file) {
 		seq_puts(m, " file=");
-		seq_path(m, &file->f_path, "\n\t= ");
+		seq_file_path(m, file, "\n\t= ");
 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
 		seq_puts(m, " heap");
 	} else {
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 599ec2e20104..e0d64c92e4f6 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -180,7 +180,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
 
 	if (file) {
 		seq_pad(m, ' ');
-		seq_path(m, &file->f_path, "");
+		seq_file_path(m, file, "");
 	} else if (mm) {
 		pid_t tid = pid_of_stack(priv, vma, is_pid);
 
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 555f82155be8..d8a0545ad7ea 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -487,6 +487,20 @@ int seq_path(struct seq_file *m, const struct path *path, const char *esc)
 }
 EXPORT_SYMBOL(seq_path);
 
+/**
+ * seq_file_path - seq_file interface to print a pathname of a file
+ * @m: the seq_file handle
+ * @file: the struct file to print
+ * @esc: set of characters to escape in the output
+ *
+ * return the absolute path to the file.
+ */
+int seq_file_path(struct seq_file *m, struct file *file, const char *esc)
+{
+	return seq_path(m, &file->f_path, esc);
+}
+EXPORT_SYMBOL(seq_file_path);
+
 /*
  * Same as seq_path, but relative to supplied root.
  */
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index afbb1fd77c77..912a7c482649 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -123,6 +123,7 @@ __printf(2, 3) int seq_printf(struct seq_file *, const char *, ...);
 __printf(2, 0) int seq_vprintf(struct seq_file *, const char *, va_list args);
 
 int seq_path(struct seq_file *, const struct path *, const char *);
+int seq_file_path(struct seq_file *, struct file *, const char *);
 int seq_dentry(struct seq_file *, struct dentry *, const char *);
 int seq_path_root(struct seq_file *m, const struct path *path,
 		  const struct path *root, const char *esc);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a7e72103f23b..41e4581af7c5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2032,7 +2032,7 @@ static int swap_show(struct seq_file *swap, void *v)
 	}
 
 	file = si->swap_file;
-	len = seq_path(swap, &file->f_path, " \t\n\\");
+	len = seq_file_path(swap, file, " \t\n\\");
 	seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
 			len < 40 ? 40 - len : 1, " ",
 			S_ISBLK(file_inode(file)->i_mode) ?
-- 
cgit v1.2.3


From 5fa8e0a1c6a762857ae67d1628c58b9a02362003 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 21 May 2015 16:05:53 +0200
Subject: fs: Rename file_remove_suid() to file_remove_privs()

file_remove_suid() is a misnomer since it removes also file capabilities
stored in xattrs and sets S_NOSEC flag. Also should_remove_suid() tells
something else than whether file_remove_suid() call is necessary which
leads to bugs.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/file.c    |  2 +-
 fs/ceph/file.c     |  2 +-
 fs/fuse/file.c     |  2 +-
 fs/inode.c         | 13 ++++++++-----
 fs/ntfs/file.c     |  2 +-
 fs/xfs/xfs_file.c  |  2 +-
 include/linux/fs.h |  2 +-
 mm/filemap.c       |  2 +-
 8 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b072e17479aa..86f97282779a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1748,7 +1748,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	}
 
 	current->backing_dev_info = inode_to_bdi(inode);
-	err = file_remove_suid(file);
+	err = file_remove_privs(file);
 	if (err) {
 		mutex_unlock(&inode->i_mutex);
 		goto out;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3b6b522b4b31..e55fe32c6224 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -959,7 +959,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 	pos = iocb->ki_pos;
 	count = iov_iter_count(from);
-	err = file_remove_suid(file);
+	err = file_remove_privs(file);
 	if (err)
 		goto out;
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5ef05b5c4cff..1344647965dc 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1169,7 +1169,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (err <= 0)
 		goto out;
 
-	err = file_remove_suid(file);
+	err = file_remove_privs(file);
 	if (err)
 		goto out;
 
diff --git a/fs/inode.c b/fs/inode.c
index 07f4cb5eab4b..849210c155dc 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1685,7 +1685,11 @@ static int __remove_suid(struct dentry *dentry, int kill)
 	return notify_change(dentry, &newattrs, NULL);
 }
 
-int file_remove_suid(struct file *file)
+/*
+ * Remove special file priviledges (suid, capabilities) when file is written
+ * to or truncated.
+ */
+int file_remove_privs(struct file *file)
 {
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = d_inode(dentry);
@@ -1711,7 +1715,7 @@ int file_remove_suid(struct file *file)
 
 	return error;
 }
-EXPORT_SYMBOL(file_remove_suid);
+EXPORT_SYMBOL(file_remove_privs);
 
 /**
  *	file_update_time	-	update mtime and ctime time
@@ -1966,9 +1970,8 @@ EXPORT_SYMBOL(inode_dio_wait);
  * inode is being instantiated).  The reason for the cmpxchg() loop
  * --- which wouldn't be necessary if all code paths which modify
  * i_flags actually followed this rule, is that there is at least one
- * code path which doesn't today --- for example,
- * __generic_file_aio_write() calls file_remove_suid() without holding
- * i_mutex --- so we use cmpxchg() out of an abundance of caution.
+ * code path which doesn't today so we use cmpxchg() out of an abundance
+ * of caution.
  *
  * In the long run, i_mutex is overkill, and we should probably look
  * at using the i_lock spinlock to protect i_flags, and then make sure
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 7bb487e663b4..182bb93aa79c 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -382,7 +382,7 @@ static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb,
 	base_ni = ni;
 	if (NInoAttr(ni))
 		base_ni = ni->ext.base_ntfs_ino;
-	err = file_remove_suid(file);
+	err = file_remove_privs(file);
 	if (unlikely(err))
 		goto out;
 	/*
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 8121e75352ee..f3e4fbb59985 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -623,7 +623,7 @@ restart:
 	 * setgid bits if the process is not being run by root.  This keeps
 	 * people from modifying setuid and setgid binaries.
 	 */
-	return file_remove_suid(file);
+	return file_remove_privs(file);
 }
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2c135ad741a9..641e68d850cf 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2553,7 +2553,7 @@ extern struct inode *new_inode_pseudo(struct super_block *sb);
 extern struct inode *new_inode(struct super_block *sb);
 extern void free_inode_nonrcu(struct inode *inode);
 extern int should_remove_suid(struct dentry *);
-extern int file_remove_suid(struct file *);
+extern int file_remove_privs(struct file *);
 
 extern void __insert_inode_hash(struct inode *, unsigned long hashval);
 static inline void insert_inode_hash(struct inode *inode)
diff --git a/mm/filemap.c b/mm/filemap.c
index 6bf5e42d560a..f851e36802d5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2536,7 +2536,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = inode_to_bdi(inode);
-	err = file_remove_suid(file);
+	err = file_remove_privs(file);
 	if (err)
 		goto out;
 
-- 
cgit v1.2.3


From dbfae0cdcd87602737101d4417811f4323156b54 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 21 May 2015 16:05:54 +0200
Subject: fs: Provide function telling whether file_remove_privs() will do
 anything

Provide function telling whether file_remove_privs() will do anything.
Currently we only have should_remove_suid() and that does something
slightly different.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c         | 44 ++++++++++++++++++++++++++++++++------------
 include/linux/fs.h |  1 +
 2 files changed, 33 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 849210c155dc..8c2dd74455c9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1673,7 +1673,32 @@ int should_remove_suid(struct dentry *dentry)
 }
 EXPORT_SYMBOL(should_remove_suid);
 
-static int __remove_suid(struct dentry *dentry, int kill)
+/*
+ * Return mask of changes for notify_change() that need to be done as a
+ * response to write or truncate. Return 0 if nothing has to be changed.
+ * Negative value on error (change should be denied).
+ */
+int file_needs_remove_privs(struct file *file)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct inode *inode = d_inode(dentry);
+	int mask = 0;
+	int ret;
+
+	if (IS_NOSEC(inode))
+		return 0;
+
+	mask = should_remove_suid(dentry);
+	ret = security_inode_need_killpriv(dentry);
+	if (ret < 0)
+		return ret;
+	if (ret)
+		mask |= ATTR_KILL_PRIV;
+	return mask;
+}
+EXPORT_SYMBOL(file_needs_remove_privs);
+
+static int __remove_privs(struct dentry *dentry, int kill)
 {
 	struct iattr newattrs;
 
@@ -1693,23 +1718,18 @@ int file_remove_privs(struct file *file)
 {
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = d_inode(dentry);
-	int killsuid;
-	int killpriv;
+	int kill;
 	int error = 0;
 
 	/* Fast path for nothing security related */
 	if (IS_NOSEC(inode))
 		return 0;
 
-	killsuid = should_remove_suid(dentry);
-	killpriv = security_inode_need_killpriv(dentry);
-
-	if (killpriv < 0)
-		return killpriv;
-	if (killpriv)
-		error = security_inode_killpriv(dentry);
-	if (!error && killsuid)
-		error = __remove_suid(dentry, killsuid);
+	kill = file_needs_remove_privs(file);
+	if (kill < 0)
+		return kill;
+	if (kill)
+		error = __remove_privs(dentry, kill);
 	if (!error)
 		inode_has_no_xattr(inode);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 641e68d850cf..ee60e8ab210f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2554,6 +2554,7 @@ extern struct inode *new_inode(struct super_block *sb);
 extern void free_inode_nonrcu(struct inode *inode);
 extern int should_remove_suid(struct dentry *);
 extern int file_remove_privs(struct file *);
+extern int file_needs_remove_privs(struct file *file);
 
 extern void __insert_inode_hash(struct inode *, unsigned long hashval);
 static inline void insert_inode_hash(struct inode *inode)
-- 
cgit v1.2.3


From 45f147a1bc97c743c6101a8d2741c69a51f583e4 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 21 May 2015 16:05:55 +0200
Subject: fs: Call security_ops->inode_killpriv on truncate

Comment in include/linux/security.h says that ->inode_killpriv() should
be called when setuid bit is being removed and that similar security
labels (in fact this applies only to file capabilities) should be
removed at this time as well. However we don't call ->inode_killpriv()
when we remove suid bit on truncate.

We fix the problem by calling ->inode_need_killpriv() and subsequently
->inode_killpriv() on truncate the same way as we do it on file write.

After this patch there's only one user of should_remove_suid() - ocfs2 -
and indeed it's buggy because it doesn't call ->inode_killpriv() on
write. However fixing it is difficult because of special locking
constraints.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c         | 5 ++---
 fs/open.c          | 6 ++++--
 include/linux/fs.h | 6 +++++-
 3 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 8c2dd74455c9..0401d2c6d087 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1678,9 +1678,8 @@ EXPORT_SYMBOL(should_remove_suid);
  * response to write or truncate. Return 0 if nothing has to be changed.
  * Negative value on error (change should be denied).
  */
-int file_needs_remove_privs(struct file *file)
+int dentry_needs_remove_privs(struct dentry *dentry)
 {
-	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = d_inode(dentry);
 	int mask = 0;
 	int ret;
@@ -1696,7 +1695,7 @@ int file_needs_remove_privs(struct file *file)
 		mask |= ATTR_KILL_PRIV;
 	return mask;
 }
-EXPORT_SYMBOL(file_needs_remove_privs);
+EXPORT_SYMBOL(dentry_needs_remove_privs);
 
 static int __remove_privs(struct dentry *dentry, int kill)
 {
diff --git a/fs/open.c b/fs/open.c
index 1dbc79358d59..e33dab287fa0 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -51,8 +51,10 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 		newattrs.ia_valid |= ATTR_FILE;
 	}
 
-	/* Remove suid/sgid on truncate too */
-	ret = should_remove_suid(dentry);
+	/* Remove suid, sgid, and file capabilities on truncate too */
+	ret = dentry_needs_remove_privs(dentry);
+	if (ret < 0)
+		return ret;
 	if (ret)
 		newattrs.ia_valid |= ret | ATTR_FORCE;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ee60e8ab210f..1e658b11c265 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2554,7 +2554,11 @@ extern struct inode *new_inode(struct super_block *sb);
 extern void free_inode_nonrcu(struct inode *inode);
 extern int should_remove_suid(struct dentry *);
 extern int file_remove_privs(struct file *);
-extern int file_needs_remove_privs(struct file *file);
+extern int dentry_needs_remove_privs(struct dentry *dentry);
+static inline int file_needs_remove_privs(struct file *file)
+{
+	return dentry_needs_remove_privs(file->f_path.dentry);
+}
 
 extern void __insert_inode_hash(struct inode *, unsigned long hashval);
 static inline void insert_inode_hash(struct inode *inode)
-- 
cgit v1.2.3


From b57c2cb9ea1a02c2ae08e16de8c20cc13ffbf85a Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Sun, 24 May 2015 17:19:41 +0200
Subject: pagemap.h: move dir_pages() over there

That function was declared in a lot of filesystems to calculate
directory pages.

Signed-off-by: Fabian Frederick <fabf@skynet.be>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exofs/dir.c            | 6 ------
 fs/ext2/dir.c             | 5 -----
 fs/freevxfs/vxfs_lookup.c | 7 -------
 fs/minix/dir.c            | 5 -----
 fs/nilfs2/dir.c           | 5 -----
 fs/qnx6/dir.c             | 5 -----
 fs/sysv/dir.c             | 5 -----
 include/linux/pagemap.h   | 6 ++++++
 8 files changed, 6 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 4deb0b05b011..e5bb2abf77f9 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -44,12 +44,6 @@ static inline void exofs_put_page(struct page *page)
 	page_cache_release(page);
 }
 
-/* Accesses dir's inode->i_size must be called under inode lock */
-static inline unsigned long dir_pages(struct inode *inode)
-{
-	return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-}
-
 static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
 {
 	loff_t last_byte = inode->i_size;
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 796b491e6978..0c6638b40f21 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -70,11 +70,6 @@ static inline void ext2_put_page(struct page *page)
 	page_cache_release(page);
 }
 
-static inline unsigned long dir_pages(struct inode *inode)
-{
-	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
 /*
  * Return the offset into page `page_nr' of the last valid
  * byte in that page, plus one.
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 99c7f0a37af4..484b32d3234a 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -61,13 +61,6 @@ const struct file_operations vxfs_dir_operations = {
 	.iterate =		vxfs_readdir,
 };
 
- 
-static inline u_long
-dir_pages(struct inode *inode)
-{
-	return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-}
- 
 static inline u_long
 dir_blocks(struct inode *ip)
 {
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 118e4e7bc935..d19ac258105a 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -45,11 +45,6 @@ minix_last_byte(struct inode *inode, unsigned long page_nr)
 	return last_byte;
 }
 
-static inline unsigned long dir_pages(struct inode *inode)
-{
-	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
 static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
 {
 	struct address_space *mapping = page->mapping;
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 0ee0bed3649b..6b8b92b19cec 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -61,11 +61,6 @@ static inline void nilfs_put_page(struct page *page)
 	page_cache_release(page);
 }
 
-static inline unsigned long dir_pages(struct inode *inode)
-{
-	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
 /*
  * Return the offset into page `page_nr' of the last valid
  * byte in that page, plus one.
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index 8d64bb5366bf..e1f37278cf97 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c
@@ -32,11 +32,6 @@ static struct page *qnx6_get_page(struct inode *dir, unsigned long n)
 	return page;
 }
 
-static inline unsigned long dir_pages(struct inode *inode)
-{
-	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
 static unsigned last_entry(struct inode *inode, unsigned long page_nr)
 {
 	unsigned long last_byte = inode->i_size;
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 8f3555f00c54..63c1bcb224ee 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -33,11 +33,6 @@ static inline void dir_put_page(struct page *page)
 	page_cache_release(page);
 }
 
-static inline unsigned long dir_pages(struct inode *inode)
-{
-	return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
-}
-
 static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
 {
 	struct address_space *mapping = page->mapping;
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4b3736f7065c..808942d31062 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -670,4 +670,10 @@ static inline int add_to_page_cache(struct page *page,
 	return error;
 }
 
+static inline unsigned long dir_pages(struct inode *inode)
+{
+	return (unsigned long)(inode->i_size + PAGE_CACHE_SIZE - 1) >>
+			       PAGE_CACHE_SHIFT;
+}
+
 #endif /* _LINUX_PAGEMAP_H */
-- 
cgit v1.2.3


From dc3f4198eac14e52a98dfc79cd84b45e280f59cd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 18 May 2015 10:10:34 -0400
Subject: make simple_positive() public

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/powerpc/platforms/cell/spufs/inode.c |  2 +-
 arch/s390/hypfs/inode.c                   |  7 +------
 drivers/block/drbd/drbd_debugfs.c         | 10 +---------
 drivers/infiniband/hw/ipath/ipath_fs.c    |  2 +-
 drivers/infiniband/hw/qib/qib_fs.c        |  2 +-
 fs/autofs4/autofs_i.h                     |  5 -----
 fs/ceph/dir.c                             |  2 +-
 fs/configfs/inode.c                       |  2 +-
 fs/debugfs/inode.c                        | 11 +++--------
 fs/libfs.c                                |  5 -----
 fs/nfs/dir.c                              |  2 +-
 fs/tracefs/inode.c                        | 11 +++--------
 include/linux/dcache.h                    |  5 +++++
 security/inode.c                          | 19 ++++++-------------
 14 files changed, 25 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 1ba6307be4db..11634fa7ab3c 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -166,7 +166,7 @@ static void spufs_prune_dir(struct dentry *dir)
 	mutex_lock(&d_inode(dir)->i_mutex);
 	list_for_each_entry_safe(dentry, tmp, &dir->d_subdirs, d_child) {
 		spin_lock(&dentry->d_lock);
-		if (!(d_unhashed(dentry)) && d_really_is_positive(dentry)) {
+		if (simple_positive(dentry)) {
 			dget_dlock(dentry);
 			__d_drop(dentry);
 			spin_unlock(&dentry->d_lock);
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index d3f896a35b98..8ffad5437232 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -62,18 +62,13 @@ static void hypfs_add_dentry(struct dentry *dentry)
 	hypfs_last_dentry = dentry;
 }
 
-static inline int hypfs_positive(struct dentry *dentry)
-{
-	return d_really_is_positive(dentry) && !d_unhashed(dentry);
-}
-
 static void hypfs_remove(struct dentry *dentry)
 {
 	struct dentry *parent;
 
 	parent = dentry->d_parent;
 	mutex_lock(&d_inode(parent)->i_mutex);
-	if (hypfs_positive(dentry)) {
+	if (simple_positive(dentry)) {
 		if (d_is_dir(dentry))
 			simple_rmdir(d_inode(parent), dentry);
 		else
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index a6ee3d750c30..6b88a35fb048 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -419,14 +419,6 @@ static int in_flight_summary_show(struct seq_file *m, void *pos)
 	return 0;
 }
 
-/* simple_positive(file->f_path.dentry) respectively debugfs_positive(),
- * but neither is "reachable" from here.
- * So we have our own inline version of it above.  :-( */
-static inline int debugfs_positive(struct dentry *dentry)
-{
-        return d_really_is_positive(dentry) && !d_unhashed(dentry);
-}
-
 /* make sure at *open* time that the respective object won't go away. */
 static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *),
 		                void *data, struct kref *kref,
@@ -444,7 +436,7 @@ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, vo
 	/* serialize with d_delete() */
 	mutex_lock(&d_inode(parent)->i_mutex);
 	/* Make sure the object is still alive */
-	if (debugfs_positive(file->f_path.dentry)
+	if (simple_positive(file->f_path.dentry)
 	&& kref_get_unless_zero(kref))
 		ret = 0;
 	mutex_unlock(&d_inode(parent)->i_mutex);
diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c
index 1ca8e32a9592..25422a3a7238 100644
--- a/drivers/infiniband/hw/ipath/ipath_fs.c
+++ b/drivers/infiniband/hw/ipath/ipath_fs.c
@@ -277,7 +277,7 @@ static int remove_file(struct dentry *parent, char *name)
 	}
 
 	spin_lock(&tmp->d_lock);
-	if (!d_unhashed(tmp) && d_really_is_positive(tmp)) {
+	if (simple_positive(tmp)) {
 		dget_dlock(tmp);
 		__d_drop(tmp);
 		spin_unlock(&tmp->d_lock);
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
index bdd5d3857203..13ef22bd9459 100644
--- a/drivers/infiniband/hw/qib/qib_fs.c
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -455,7 +455,7 @@ static int remove_file(struct dentry *parent, char *name)
 	}
 
 	spin_lock(&tmp->d_lock);
-	if (!d_unhashed(tmp) && d_really_is_positive(tmp)) {
+	if (simple_positive(tmp)) {
 		__d_drop(tmp);
 		spin_unlock(&tmp->d_lock);
 		simple_unlink(d_inode(parent), tmp);
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 5b700ef1e59d..c37149b929be 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -238,11 +238,6 @@ static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi)
 	return d_inode(sbi->sb->s_root)->i_ino;
 }
 
-static inline int simple_positive(struct dentry *dentry)
-{
-	return d_really_is_positive(dentry) && !d_unhashed(dentry);
-}
-
 static inline void __autofs4_add_expiring(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 4248307fea90..edbb8da02a6a 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -161,7 +161,7 @@ more:
 		}
 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 		if (di->lease_shared_gen == shared_gen &&
-		    !d_unhashed(dentry) && d_really_is_positive(dentry) &&
+		    simple_positive(dentry) &&
 		    ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR &&
 		    ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH &&
 		    fpos_cmp(ctx->pos, di->offset) <= 0)
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 8d89f5fd0331..eae87575e681 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -236,7 +236,7 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent)
 
 	if (dentry) {
 		spin_lock(&dentry->d_lock);
-		if (!d_unhashed(dentry) && d_really_is_positive(dentry)) {
+		if (simple_positive(dentry)) {
 			dget_dlock(dentry);
 			__d_drop(dentry);
 			spin_unlock(&dentry->d_lock);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 7eaec88ea970..ef86ad6bdc3e 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -44,11 +44,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb)
 	return inode;
 }
 
-static inline int debugfs_positive(struct dentry *dentry)
-{
-	return d_really_is_positive(dentry) && !d_unhashed(dentry);
-}
-
 struct debugfs_mount_opts {
 	kuid_t uid;
 	kgid_t gid;
@@ -522,7 +517,7 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
 {
 	int ret = 0;
 
-	if (debugfs_positive(dentry)) {
+	if (simple_positive(dentry)) {
 		dget(dentry);
 		if (d_is_dir(dentry))
 			ret = simple_rmdir(d_inode(parent), dentry);
@@ -602,7 +597,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
 	 */
 	spin_lock(&parent->d_lock);
 	list_for_each_entry(child, &parent->d_subdirs, d_child) {
-		if (!debugfs_positive(child))
+		if (!simple_positive(child))
 			continue;
 
 		/* perhaps simple_empty(child) makes more sense */
@@ -623,7 +618,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
 		 * from d_subdirs. When releasing the parent->d_lock we can
 		 * no longer trust that the next pointer is valid.
 		 * Restart the loop. We'll skip this one with the
-		 * debugfs_positive() check.
+		 * simple_positive() check.
 		 */
 		goto loop;
 	}
diff --git a/fs/libfs.c b/fs/libfs.c
index 65e1feca8b98..4d9e6c118fe1 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -20,11 +20,6 @@
 
 #include "internal.h"
 
-static inline int simple_positive(struct dentry *dentry)
-{
-	return d_really_is_positive(dentry) && !d_unhashed(dentry);
-}
-
 int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		   struct kstat *stat)
 {
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b2c8b31b2be7..b9108f4254a7 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1771,7 +1771,7 @@ EXPORT_SYMBOL_GPL(nfs_mkdir);
 
 static void nfs_dentry_handle_enoent(struct dentry *dentry)
 {
-	if (d_really_is_positive(dentry) && !d_unhashed(dentry))
+	if (simple_positive(dentry))
 		d_delete(dentry);
 }
 
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index d92bdf3b079a..6e8a1400d662 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -496,16 +496,11 @@ struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *pare
 	return dentry;
 }
 
-static inline int tracefs_positive(struct dentry *dentry)
-{
-	return dentry->d_inode && !d_unhashed(dentry);
-}
-
 static int __tracefs_remove(struct dentry *dentry, struct dentry *parent)
 {
 	int ret = 0;
 
-	if (tracefs_positive(dentry)) {
+	if (simple_positive(dentry)) {
 		if (dentry->d_inode) {
 			dget(dentry);
 			switch (dentry->d_inode->i_mode & S_IFMT) {
@@ -582,7 +577,7 @@ void tracefs_remove_recursive(struct dentry *dentry)
 	 */
 	spin_lock(&parent->d_lock);
 	list_for_each_entry(child, &parent->d_subdirs, d_child) {
-		if (!tracefs_positive(child))
+		if (!simple_positive(child))
 			continue;
 
 		/* perhaps simple_empty(child) makes more sense */
@@ -603,7 +598,7 @@ void tracefs_remove_recursive(struct dentry *dentry)
 		 * from d_subdirs. When releasing the parent->d_lock we can
 		 * no longer trust that the next pointer is valid.
 		 * Restart the loop. We'll skip this one with the
-		 * tracefs_positive() check.
+		 * simple_positive() check.
 		 */
 		goto loop;
 	}
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 167ec0934049..d2d50249b7b2 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -507,6 +507,11 @@ static inline bool d_really_is_positive(const struct dentry *dentry)
 	return dentry->d_inode != NULL;
 }
 
+static inline int simple_positive(struct dentry *dentry)
+{
+	return d_really_is_positive(dentry) && !d_unhashed(dentry);
+}
+
 extern void d_set_fallthru(struct dentry *dentry);
 
 static inline bool d_is_fallthru(const struct dentry *dentry)
diff --git a/security/inode.c b/security/inode.c
index 91503b79c5f8..6df0d8dae1e0 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -25,11 +25,6 @@
 static struct vfsmount *mount;
 static int mount_count;
 
-static inline int positive(struct dentry *dentry)
-{
-	return d_really_is_positive(dentry) && !d_unhashed(dentry);
-}
-
 static int fill_super(struct super_block *sb, void *data, int silent)
 {
 	static struct tree_descr files[] = {{""}};
@@ -201,14 +196,12 @@ void securityfs_remove(struct dentry *dentry)
 		return;
 
 	mutex_lock(&d_inode(parent)->i_mutex);
-	if (positive(dentry)) {
-		if (d_really_is_positive(dentry)) {
-			if (d_is_dir(dentry))
-				simple_rmdir(d_inode(parent), dentry);
-			else
-				simple_unlink(d_inode(parent), dentry);
-			dput(dentry);
-		}
+	if (simple_positive(dentry)) {
+		if (d_is_dir(dentry))
+			simple_rmdir(d_inode(parent), dentry);
+		else
+			simple_unlink(d_inode(parent), dentry);
+		dput(dentry);
 	}
 	mutex_unlock(&d_inode(parent)->i_mutex);
 	simple_release_fs(&mount, &mount_count);
-- 
cgit v1.2.3


From be3a5d233922d73f27002ce2767f6ec03c3f473d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Tue, 23 Jun 2015 19:51:55 +0800
Subject: NFSv.2/pnfs Add a LAYOUTSTATS rpc function

Reviewed-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs42.h          |   9 +++-
 fs/nfs/nfs42proc.c      |  27 +++++++++++
 fs/nfs/nfs42xdr.c       | 122 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/nfs4_fs.h        |   1 +
 fs/nfs/nfs4proc.c       |   4 +-
 fs/nfs/nfs4xdr.c        |   1 +
 include/linux/nfs4.h    |   1 +
 include/linux/nfs_xdr.h |  43 +++++++++++++++++
 8 files changed, 205 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index 7afb8947dfdf..ff66ae700b89 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -5,11 +5,18 @@
 #ifndef __LINUX_FS_NFS_NFS4_2_H
 #define __LINUX_FS_NFS_NFS4_2_H
 
+/*
+ * FIXME:  four LAYOUTSTATS calls per compound at most! Do we need to support
+ * more? Need to consider not to pre-alloc too much for a compound.
+ */
+#define PNFS_LAYOUTSTATS_MAXDEV (4)
+
 /* nfs4.2proc.c */
 int nfs42_proc_allocate(struct file *, loff_t, loff_t);
 int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
 loff_t nfs42_proc_llseek(struct file *, loff_t, int);
-
+int nfs42_proc_layoutstats_generic(struct nfs_server *,
+				   struct nfs42_layoutstat_data *);
 /* nfs4.2xdr.h */
 extern struct rpc_procinfo nfs4_2_procedures[];
 
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 3a9e75235f30..ac929685350b 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -165,3 +165,30 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
 
 	return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
 }
+
+static const struct rpc_call_ops nfs42_layoutstat_ops = {
+};
+
+int nfs42_proc_layoutstats_generic(struct nfs_server *server,
+				   struct nfs42_layoutstat_data *data)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTSTATS],
+		.rpc_argp = &data->args,
+		.rpc_resp = &data->res,
+	};
+	struct rpc_task_setup task_setup = {
+		.rpc_client = server->client,
+		.rpc_message = &msg,
+		.callback_ops = &nfs42_layoutstat_ops,
+		.callback_data = data,
+		.flags = RPC_TASK_ASYNC,
+	};
+	struct rpc_task *task;
+
+	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+	task = rpc_run_task(&task_setup);
+	if (IS_ERR(task))
+		return PTR_ERR(task);
+	return 0;
+}
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 1a25b27248f2..9aae0205ef11 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -4,6 +4,8 @@
 #ifndef __LINUX_FS_NFS_NFS4_2XDR_H
 #define __LINUX_FS_NFS_NFS4_2XDR_H
 
+#include "nfs42.h"
+
 #define encode_fallocate_maxsz		(encode_stateid_maxsz + \
 					 2 /* offset */ + \
 					 2 /* length */)
@@ -22,6 +24,16 @@
 					 1 /* whence */ + \
 					 2 /* offset */ + \
 					 2 /* length */)
+#define encode_io_info_maxsz		4
+#define encode_layoutstats_maxsz	(op_decode_hdr_maxsz + \
+					2 /* offset */ + \
+					2 /* length */ + \
+					encode_stateid_maxsz + \
+					encode_io_info_maxsz + \
+					encode_io_info_maxsz + \
+					1 /* opaque devaddr4 length */ + \
+					XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
+#define decode_layoutstats_maxsz	(op_decode_hdr_maxsz)
 
 #define NFS4_enc_allocate_sz		(compound_encode_hdr_maxsz + \
 					 encode_putfh_maxsz + \
@@ -45,6 +57,14 @@
 #define NFS4_dec_seek_sz		(compound_decode_hdr_maxsz + \
 					 decode_putfh_maxsz + \
 					 decode_seek_maxsz)
+#define NFS4_enc_layoutstats_sz		(compound_encode_hdr_maxsz + \
+					 encode_sequence_maxsz + \
+					 encode_putfh_maxsz + \
+					 PNFS_LAYOUTSTATS_MAXDEV * encode_layoutstats_maxsz)
+#define NFS4_dec_layoutstats_sz		(compound_decode_hdr_maxsz + \
+					 decode_sequence_maxsz + \
+					 decode_putfh_maxsz + \
+					 PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
 
 
 static void encode_fallocate(struct xdr_stream *xdr,
@@ -81,6 +101,33 @@ static void encode_seek(struct xdr_stream *xdr,
 	encode_uint32(xdr, args->sa_what);
 }
 
+static void encode_layoutstats(struct xdr_stream *xdr,
+			       struct nfs42_layoutstat_args *args,
+			       struct nfs42_layoutstat_devinfo *devinfo,
+			       struct compound_hdr *hdr)
+{
+	__be32 *p;
+
+	encode_op_hdr(xdr, OP_LAYOUTSTATS, decode_layoutstats_maxsz, hdr);
+	p = reserve_space(xdr, 8 + 8);
+	p = xdr_encode_hyper(p, devinfo->offset);
+	p = xdr_encode_hyper(p, devinfo->length);
+	encode_nfs4_stateid(xdr, &args->stateid);
+	p = reserve_space(xdr, 4*8 + NFS4_DEVICEID4_SIZE + 4);
+	p = xdr_encode_hyper(p, devinfo->read_count);
+	p = xdr_encode_hyper(p, devinfo->read_bytes);
+	p = xdr_encode_hyper(p, devinfo->write_count);
+	p = xdr_encode_hyper(p, devinfo->write_bytes);
+	p = xdr_encode_opaque_fixed(p, devinfo->dev_id.data,
+			NFS4_DEVICEID4_SIZE);
+	/* Encode layoutupdate4 */
+	*p++ = cpu_to_be32(devinfo->layout_type);
+	if (devinfo->layoutstats_encode != NULL)
+		devinfo->layoutstats_encode(xdr, args, devinfo);
+	else
+		encode_uint32(xdr, 0);
+}
+
 /*
  * Encode ALLOCATE request
  */
@@ -137,6 +184,28 @@ static void nfs4_xdr_enc_seek(struct rpc_rqst *req,
 	encode_nops(&hdr);
 }
 
+/*
+ * Encode LAYOUTSTATS request
+ */
+static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req,
+				     struct xdr_stream *xdr,
+				     struct nfs42_layoutstat_args *args)
+{
+	int i;
+
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->seq_args, &hdr);
+	encode_putfh(xdr, args->fh, &hdr);
+	WARN_ON(args->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
+	for (i = 0; i < args->num_dev; i++)
+		encode_layoutstats(xdr, args, &args->devinfo[i], &hdr);
+	encode_nops(&hdr);
+}
+
 static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
 {
 	return decode_op_hdr(xdr, OP_ALLOCATE);
@@ -169,6 +238,28 @@ out_overflow:
 	return -EIO;
 }
 
+static int decode_layoutstats(struct xdr_stream *xdr,
+			      struct nfs42_layoutstat_res *res)
+{
+	int status;
+	__be32 *p;
+
+	status = decode_op_hdr(xdr, OP_LAYOUTSTATS);
+	if (status)
+		return status;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+
+	res->rpc_status = be32_to_cpup(p++);
+	return 0;
+
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
 /*
  * Decode ALLOCATE request
  */
@@ -246,4 +337,35 @@ static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp,
 out:
 	return status;
 }
+
+/*
+ * Decode LAYOUTSTATS request
+ */
+static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp,
+				    struct xdr_stream *xdr,
+				    struct nfs42_layoutstat_res *res)
+{
+	struct compound_hdr hdr;
+	int status, i;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV);
+	for (i = 0; i < res->num_dev; i++) {
+		status = decode_layoutstats(xdr, res);
+		if (status)
+			goto out;
+	}
+out:
+	res->rpc_status = status;
+	return status;
+}
+
 #endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index fdef424b0cd3..ea3bee919a76 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -233,6 +233,7 @@ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception
 extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *,
 			  struct rpc_message *, struct nfs4_sequence_args *,
 			  struct nfs4_sequence_res *, int);
+extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int);
 extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
 extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
 extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e9cd45f1d60f..643ce3a91b22 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -476,8 +476,8 @@ struct nfs4_call_sync_data {
 	struct nfs4_sequence_res *seq_res;
 };
 
-static void nfs4_init_sequence(struct nfs4_sequence_args *args,
-			       struct nfs4_sequence_res *res, int cache_reply)
+void nfs4_init_sequence(struct nfs4_sequence_args *args,
+			struct nfs4_sequence_res *res, int cache_reply)
 {
 	args->sa_slot = NULL;
 	args->sa_cache_this = cache_reply;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index a07555d6968c..558cd65dbdb7 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -7431,6 +7431,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
 	PROC(SEEK,		enc_seek,		dec_seek),
 	PROC(ALLOCATE,		enc_allocate,		dec_allocate),
 	PROC(DEALLOCATE,	enc_deallocate,		dec_deallocate),
+	PROC(LAYOUTSTATS,	enc_layoutstats,	dec_layoutstats),
 #endif /* CONFIG_NFS_V4_2 */
 };
 
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 32201c269890..b8e72aad919c 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -500,6 +500,7 @@ enum {
 	NFSPROC4_CLNT_SEEK,
 	NFSPROC4_CLNT_ALLOCATE,
 	NFSPROC4_CLNT_DEALLOCATE,
+	NFSPROC4_CLNT_LAYOUTSTATS,
 };
 
 /* nfs41 types */
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index c959e7eb5bd4..7bbe50504211 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -316,6 +316,49 @@ struct nfs4_layoutreturn {
 	int rpc_status;
 };
 
+#define PNFS_LAYOUTSTATS_MAXSIZE 256
+
+struct nfs42_layoutstat_args;
+struct nfs42_layoutstat_devinfo;
+typedef	void (*layoutstats_encode_t)(struct xdr_stream *,
+		struct nfs42_layoutstat_args *,
+		struct nfs42_layoutstat_devinfo *);
+
+/* Per file per deviceid layoutstats */
+struct nfs42_layoutstat_devinfo {
+	struct nfs4_deviceid dev_id;
+	__u64 offset;
+	__u64 length;
+	__u64 read_count;
+	__u64 read_bytes;
+	__u64 write_count;
+	__u64 write_bytes;
+	__u32 layout_type;
+	layoutstats_encode_t layoutstats_encode;
+	void *layout_private;
+};
+
+struct nfs42_layoutstat_args {
+	struct nfs4_sequence_args seq_args;
+	struct nfs_fh *fh;
+	struct inode *inode;
+	nfs4_stateid stateid;
+	int num_dev;
+	struct nfs42_layoutstat_devinfo *devinfo;
+};
+
+struct nfs42_layoutstat_res {
+	struct nfs4_sequence_res seq_res;
+	int num_dev;
+	int rpc_status;
+};
+
+struct nfs42_layoutstat_data {
+	struct inode *inode;
+	struct nfs42_layoutstat_args args;
+	struct nfs42_layoutstat_res res;
+};
+
 struct stateowner_id {
 	__u64	create_time;
 	__u32	uniquifier;
-- 
cgit v1.2.3


From 1bfe3b259ff2e579b2acb190f874397d53274497 Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@primarydata.com>
Date: Tue, 23 Jun 2015 19:52:03 +0800
Subject: nfs42: serialize LAYOUTSTATS calls of the same file

There is no need to report concurrently.

Reviewed-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs42proc.c     | 3 +++
 fs/nfs/pnfs.c          | 7 +++++++
 include/linux/nfs_fs.h | 1 +
 3 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index ee0248340a42..06c74cd93875 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -204,6 +204,9 @@ nfs42_layoutstat_release(void *calldata)
 		nfss->pnfs_curr_ld->cleanup_layoutstats(data);
 
 	pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout);
+	smp_mb__before_atomic();
+	clear_bit(NFS_INO_LAYOUTSTATS, &NFS_I(data->args.inode)->flags);
+	smp_mb__after_atomic();
 	nfs_iput_and_deactive(data->inode);
 	kfree(data->args.devinfo);
 	kfree(data);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 6279cff7df74..5b5224341bcd 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2257,6 +2257,7 @@ pnfs_report_layoutstat(struct inode *inode)
 {
 	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
 	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_inode *nfsi = NFS_I(inode);
 	struct nfs42_layoutstat_data *data;
 	struct pnfs_layout_hdr *hdr;
 	int status = 0;
@@ -2264,6 +2265,9 @@ pnfs_report_layoutstat(struct inode *inode)
 	if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
 		goto out;
 
+	if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
+		goto out;
+
 	spin_lock(&inode->i_lock);
 	if (!NFS_I(inode)->layout) {
 		spin_unlock(&inode->i_lock);
@@ -2296,6 +2300,9 @@ out_free:
 	kfree(data);
 out_put:
 	pnfs_put_layout_hdr(hdr);
+	smp_mb__before_atomic();
+	clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
+	smp_mb__after_atomic();
 	goto out;
 }
 EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index b95f914ce083..f91b5ade30c9 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -219,6 +219,7 @@ struct nfs_inode {
 #define NFS_INO_COMMIT		(7)		/* inode is committing unstable writes */
 #define NFS_INO_LAYOUTCOMMIT	(9)		/* layoutcommit required */
 #define NFS_INO_LAYOUTCOMMITTING (10)		/* layoutcommit inflight */
+#define NFS_INO_LAYOUTSTATS	(11)		/* layoutstats inflight */
 
 static inline struct nfs_inode *NFS_I(const struct inode *inode)
 {
-- 
cgit v1.2.3


From 144cba1493fdd6e3e1980e439a31df877831ebcd Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Mon, 27 Apr 2015 11:09:54 +0800
Subject: libceph: allow setting osd_req_op's flags

Signed-off-by: Yan, Zheng <zyan@redhat.com>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 drivers/block/rbd.c             |  4 ++--
 fs/ceph/addr.c                  |  3 ++-
 fs/ceph/file.c                  |  2 +-
 include/linux/ceph/osd_client.h |  2 +-
 net/ceph/osd_client.c           | 24 +++++++++++++++---------
 5 files changed, 21 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index ec6c5c6e1ac9..349115ae3bc2 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -2376,7 +2376,7 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
 	}
 
 	if (opcode == CEPH_OSD_OP_DELETE)
-		osd_req_op_init(osd_request, num_ops, opcode);
+		osd_req_op_init(osd_request, num_ops, opcode, 0);
 	else
 		osd_req_op_extent_init(osd_request, num_ops, opcode,
 				       offset, length, 0, 0);
@@ -2848,7 +2848,7 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
 		goto out;
 	stat_request->callback = rbd_img_obj_exists_callback;
 
-	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
+	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
 	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
 					false, false);
 	rbd_osd_req_format_read(stat_request);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index e162bcd105ee..feeaf3b65fa0 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -884,7 +884,8 @@ get_more_pages:
 				}
 
 				if (do_sync)
-					osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
+					osd_req_op_init(req, 1,
+							CEPH_OSD_OP_STARTSYNC, 0);
 
 				req->r_callback = writepages_finish;
 				req->r_inode = inode;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3b6b522b4b31..777eb21d556f 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -614,7 +614,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
 			break;
 		}
 
-		osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
+		osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
 
 		n = iov_iter_get_pages_alloc(from, &pages, len, &start);
 		if (unlikely(n < 0)) {
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 61b19c46bdb3..7506b485bb6d 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -249,7 +249,7 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
 				 struct ceph_msg *msg);
 
 extern void osd_req_op_init(struct ceph_osd_request *osd_req,
-					unsigned int which, u16 opcode);
+			    unsigned int which, u16 opcode, u32 flags);
 
 extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *,
 					unsigned int which,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 72459b9df8a1..4cb4fab46e4f 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -453,7 +453,7 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
  */
 static struct ceph_osd_req_op *
 _osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
-				u16 opcode)
+		 u16 opcode, u32 flags)
 {
 	struct ceph_osd_req_op *op;
 
@@ -463,14 +463,15 @@ _osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
 	op = &osd_req->r_ops[which];
 	memset(op, 0, sizeof (*op));
 	op->op = opcode;
+	op->flags = flags;
 
 	return op;
 }
 
 void osd_req_op_init(struct ceph_osd_request *osd_req,
-				unsigned int which, u16 opcode)
+		     unsigned int which, u16 opcode, u32 flags)
 {
-	(void)_osd_req_op_init(osd_req, which, opcode);
+	(void)_osd_req_op_init(osd_req, which, opcode, flags);
 }
 EXPORT_SYMBOL(osd_req_op_init);
 
@@ -479,7 +480,8 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
 				u64 offset, u64 length,
 				u64 truncate_size, u32 truncate_seq)
 {
-	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
+						      opcode, 0);
 	size_t payload_len = 0;
 
 	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
@@ -518,7 +520,8 @@ EXPORT_SYMBOL(osd_req_op_extent_update);
 void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
 			u16 opcode, const char *class, const char *method)
 {
-	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
+						      opcode, 0);
 	struct ceph_pagelist *pagelist;
 	size_t payload_len = 0;
 	size_t size;
@@ -555,7 +558,8 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
 			  u16 opcode, const char *name, const void *value,
 			  size_t size, u8 cmp_op, u8 cmp_mode)
 {
-	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
+						      opcode, 0);
 	struct ceph_pagelist *pagelist;
 	size_t payload_len;
 
@@ -588,7 +592,8 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
 				unsigned int which, u16 opcode,
 				u64 cookie, u64 version, int flag)
 {
-	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
+	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
+						      opcode, 0);
 
 	BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
 
@@ -605,7 +610,8 @@ void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
 				u64 expected_write_size)
 {
 	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
-						      CEPH_OSD_OP_SETALLOCHINT);
+						      CEPH_OSD_OP_SETALLOCHINT,
+						      0);
 
 	op->alloc_hint.expected_object_size = expected_object_size;
 	op->alloc_hint.expected_write_size = expected_write_size;
@@ -789,7 +795,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	}
 
 	if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
-		osd_req_op_init(req, which, opcode);
+		osd_req_op_init(req, which, opcode, 0);
 	} else {
 		u32 object_size = le32_to_cpu(layout->fl_object_size);
 		u32 object_base = off - objoff;
-- 
cgit v1.2.3


From d50c97b566c5bbf990eff472e9feaa58fdebdd33 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 15 May 2015 11:52:20 +0300
Subject: libceph: nuke time_sub()

Unused since ceph got merged into mainline I guess.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 include/linux/ceph/libceph.h | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 30f92cefaa72..85ae9a889a3f 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -93,15 +93,6 @@ enum {
 	CEPH_MOUNT_SHUTDOWN,
 };
 
-/*
- * subtract jiffies
- */
-static inline unsigned long time_sub(unsigned long a, unsigned long b)
-{
-	BUG_ON(time_after(b, a));
-	return (long)a - (long)b;
-}
-
 struct ceph_mds_client;
 
 /*
-- 
cgit v1.2.3


From a319bf56a617354e62cf5f774d2ca4e1a8a3bff3 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 15 May 2015 12:02:17 +0300
Subject: libceph: store timeouts in jiffies, verify user input

There are currently three libceph-level timeouts that the user can
specify on mount: mount_timeout, osd_idle_ttl and osdkeepalive.  All of
these are in seconds and no checking is done on user input: negative
values are accepted, we multiply them all by HZ which may or may not
overflow, arbitrarily large jiffies then get added together, etc.

There is also a bug in the way mount_timeout=0 is handled.  It's
supposed to mean "infinite timeout", but that's not how wait.h APIs
treat it and so __ceph_open_session() for example will busy loop
without much chance of being interrupted if none of ceph-mons are
there.

Fix all this by verifying user input, storing timeouts capped by
msecs_to_jiffies() in jiffies and using the new ceph_timeout_jiffies()
helper for all user-specified waits to handle infinite timeouts
correctly.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Alex Elder <elder@linaro.org>
---
 drivers/block/rbd.c          |  5 +++--
 fs/ceph/dir.c                |  4 ++--
 fs/ceph/mds_client.c         | 12 ++++++------
 fs/ceph/mds_client.h         |  2 +-
 fs/ceph/super.c              |  2 +-
 include/linux/ceph/libceph.h | 17 +++++++++++------
 net/ceph/ceph_common.c       | 41 +++++++++++++++++++++++++++++++----------
 net/ceph/mon_client.c        | 11 +++++++++--
 net/ceph/osd_client.c        | 15 +++++++--------
 9 files changed, 71 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 349115ae3bc2..992683b6b299 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4963,8 +4963,8 @@ out_err:
  */
 static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
 {
+	struct ceph_options *opts = rbdc->client->options;
 	u64 newest_epoch;
-	unsigned long timeout = rbdc->client->options->mount_timeout * HZ;
 	int tries = 0;
 	int ret;
 
@@ -4979,7 +4979,8 @@ again:
 		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
 			ceph_monc_request_next_osdmap(&rbdc->client->monc);
 			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
-						     newest_epoch, timeout);
+						     newest_epoch,
+						     opts->mount_timeout);
 			goto again;
 		} else {
 			/* the osdmap we have is new enough */
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 4248307fea90..173dd4b58c71 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1259,8 +1259,8 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
 		     inode, req->r_tid, last_tid);
 		if (req->r_timeout) {
 			unsigned long time_left = wait_for_completion_timeout(
-							&req->r_safe_completion,
-							req->r_timeout);
+					&req->r_safe_completion,
+					ceph_timeout_jiffies(req->r_timeout));
 			if (time_left > 0)
 				ret = 0;
 			else
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 69a36f40517f..0b0e0a9a81c0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2268,7 +2268,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 	dout("do_request waiting\n");
 	if (req->r_timeout) {
 		err = (long)wait_for_completion_killable_timeout(
-			&req->r_completion, req->r_timeout);
+					&req->r_completion,
+					ceph_timeout_jiffies(req->r_timeout));
 		if (err == 0)
 			err = -EIO;
 	} else if (req->r_wait_for_completion) {
@@ -3424,8 +3425,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
  */
 static void wait_requests(struct ceph_mds_client *mdsc)
 {
+	struct ceph_options *opts = mdsc->fsc->client->options;
 	struct ceph_mds_request *req;
-	struct ceph_fs_client *fsc = mdsc->fsc;
 
 	mutex_lock(&mdsc->mutex);
 	if (__get_oldest_req(mdsc)) {
@@ -3433,7 +3434,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
 
 		dout("wait_requests waiting for requests\n");
 		wait_for_completion_timeout(&mdsc->safe_umount_waiters,
-				    fsc->client->options->mount_timeout * HZ);
+				    ceph_timeout_jiffies(opts->mount_timeout));
 
 		/* tear down remaining requests */
 		mutex_lock(&mdsc->mutex);
@@ -3556,10 +3557,9 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc)
  */
 void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 {
+	struct ceph_options *opts = mdsc->fsc->client->options;
 	struct ceph_mds_session *session;
 	int i;
-	struct ceph_fs_client *fsc = mdsc->fsc;
-	unsigned long timeout = fsc->client->options->mount_timeout * HZ;
 
 	dout("close_sessions\n");
 
@@ -3580,7 +3580,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 
 	dout("waiting for sessions to close\n");
 	wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
-			   timeout);
+			   ceph_timeout_jiffies(opts->mount_timeout));
 
 	/* tear down remaining sessions */
 	mutex_lock(&mdsc->mutex);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 2ef799961ebb..509d6822e9b1 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -227,7 +227,7 @@ struct ceph_mds_request {
 	int r_err;
 	bool r_aborted;
 
-	unsigned long r_timeout;  /* optional.  jiffies */
+	unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
 	unsigned long r_started;  /* start time to measure timeout against */
 	unsigned long r_request_started; /* start time for mds request only,
 					    used to measure lease durations */
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9a5350030af8..edeb83c43112 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -742,7 +742,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
 	req->r_ino1.ino = CEPH_INO_ROOT;
 	req->r_ino1.snap = CEPH_NOSNAP;
 	req->r_started = started;
-	req->r_timeout = fsc->client->options->mount_timeout * HZ;
+	req->r_timeout = fsc->client->options->mount_timeout;
 	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
 	req->r_num_caps = 2;
 	err = ceph_mdsc_do_request(mdsc, NULL, req);
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 85ae9a889a3f..d73a569f9bf5 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -43,9 +43,9 @@ struct ceph_options {
 	int flags;
 	struct ceph_fsid fsid;
 	struct ceph_entity_addr my_addr;
-	int mount_timeout;
-	int osd_idle_ttl;
-	int osd_keepalive_timeout;
+	unsigned long mount_timeout;		/* jiffies */
+	unsigned long osd_idle_ttl;		/* jiffies */
+	unsigned long osd_keepalive_timeout;	/* jiffies */
 
 	/*
 	 * any type that can't be simply compared or doesn't need need
@@ -63,9 +63,9 @@ struct ceph_options {
 /*
  * defaults
  */
-#define CEPH_MOUNT_TIMEOUT_DEFAULT  60
-#define CEPH_OSD_KEEPALIVE_DEFAULT  5
-#define CEPH_OSD_IDLE_TTL_DEFAULT    60
+#define CEPH_MOUNT_TIMEOUT_DEFAULT	msecs_to_jiffies(60 * 1000)
+#define CEPH_OSD_KEEPALIVE_DEFAULT	msecs_to_jiffies(5 * 1000)
+#define CEPH_OSD_IDLE_TTL_DEFAULT	msecs_to_jiffies(60 * 1000)
 
 #define CEPH_MSG_MAX_FRONT_LEN	(16*1024*1024)
 #define CEPH_MSG_MAX_MIDDLE_LEN	(16*1024*1024)
@@ -93,6 +93,11 @@ enum {
 	CEPH_MOUNT_SHUTDOWN,
 };
 
+static inline unsigned long ceph_timeout_jiffies(unsigned long timeout)
+{
+	return timeout ?: MAX_SCHEDULE_TIMEOUT;
+}
+
 struct ceph_mds_client;
 
 /*
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 79e8f71aef5b..a80e91c2c9a3 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -352,8 +352,8 @@ ceph_parse_options(char *options, const char *dev_name,
 	/* start with defaults */
 	opt->flags = CEPH_OPT_DEFAULT;
 	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
-	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
-	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;   /* seconds */
+	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
+	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
 
 	/* get mon ip(s) */
 	/* ip1[:port1][,ip2[:port2]...] */
@@ -439,13 +439,32 @@ ceph_parse_options(char *options, const char *dev_name,
 			pr_warn("ignoring deprecated osdtimeout option\n");
 			break;
 		case Opt_osdkeepalivetimeout:
-			opt->osd_keepalive_timeout = intval;
+			/* 0 isn't well defined right now, reject it */
+			if (intval < 1 || intval > INT_MAX / 1000) {
+				pr_err("osdkeepalive out of range\n");
+				err = -EINVAL;
+				goto out;
+			}
+			opt->osd_keepalive_timeout =
+					msecs_to_jiffies(intval * 1000);
 			break;
 		case Opt_osd_idle_ttl:
-			opt->osd_idle_ttl = intval;
+			/* 0 isn't well defined right now, reject it */
+			if (intval < 1 || intval > INT_MAX / 1000) {
+				pr_err("osd_idle_ttl out of range\n");
+				err = -EINVAL;
+				goto out;
+			}
+			opt->osd_idle_ttl = msecs_to_jiffies(intval * 1000);
 			break;
 		case Opt_mount_timeout:
-			opt->mount_timeout = intval;
+			/* 0 is "wait forever" (i.e. infinite timeout) */
+			if (intval < 0 || intval > INT_MAX / 1000) {
+				pr_err("mount_timeout out of range\n");
+				err = -EINVAL;
+				goto out;
+			}
+			opt->mount_timeout = msecs_to_jiffies(intval * 1000);
 			break;
 
 		case Opt_share:
@@ -512,12 +531,14 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
 		seq_puts(m, "notcp_nodelay,");
 
 	if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
-		seq_printf(m, "mount_timeout=%d,", opt->mount_timeout);
+		seq_printf(m, "mount_timeout=%d,",
+			   jiffies_to_msecs(opt->mount_timeout) / 1000);
 	if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
-		seq_printf(m, "osd_idle_ttl=%d,", opt->osd_idle_ttl);
+		seq_printf(m, "osd_idle_ttl=%d,",
+			   jiffies_to_msecs(opt->osd_idle_ttl) / 1000);
 	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
 		seq_printf(m, "osdkeepalivetimeout=%d,",
-			   opt->osd_keepalive_timeout);
+		    jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000);
 
 	/* drop redundant comma */
 	if (m->count != pos)
@@ -627,7 +648,7 @@ static int have_mon_and_osd_map(struct ceph_client *client)
 int __ceph_open_session(struct ceph_client *client, unsigned long started)
 {
 	int err;
-	unsigned long timeout = client->options->mount_timeout * HZ;
+	unsigned long timeout = client->options->mount_timeout;
 
 	/* open session, and wait for mon and osd maps */
 	err = ceph_monc_open_session(&client->monc);
@@ -643,7 +664,7 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started)
 		dout("mount waiting for mon_map\n");
 		err = wait_event_interruptible_timeout(client->auth_wq,
 			have_mon_and_osd_map(client) || (client->auth_err < 0),
-			timeout);
+			ceph_timeout_jiffies(timeout));
 		if (err == -EINTR || err == -ERESTARTSYS)
 			return err;
 		if (client->auth_err < 0)
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 2b3cf05e87b0..0da3bdc116f7 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -298,6 +298,12 @@ void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
 }
 EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
 
+/*
+ * Wait for an osdmap with a given epoch.
+ *
+ * @epoch: epoch to wait for
+ * @timeout: in jiffies, 0 means "wait forever"
+ */
 int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
 			  unsigned long timeout)
 {
@@ -308,11 +314,12 @@ int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
 	while (monc->have_osdmap < epoch) {
 		mutex_unlock(&monc->mutex);
 
-		if (timeout != 0 && time_after_eq(jiffies, started + timeout))
+		if (timeout && time_after_eq(jiffies, started + timeout))
 			return -ETIMEDOUT;
 
 		ret = wait_event_interruptible_timeout(monc->client->auth_wq,
-					 monc->have_osdmap >= epoch, timeout);
+						monc->have_osdmap >= epoch,
+						ceph_timeout_jiffies(timeout));
 		if (ret < 0)
 			return ret;
 
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 4cb4fab46e4f..50033677c0fa 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1097,7 +1097,7 @@ static void __move_osd_to_lru(struct ceph_osd_client *osdc,
 	BUG_ON(!list_empty(&osd->o_osd_lru));
 
 	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
-	osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
+	osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl;
 }
 
 static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc,
@@ -1208,7 +1208,7 @@ static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
 static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
 {
 	schedule_delayed_work(&osdc->timeout_work,
-			osdc->client->options->osd_keepalive_timeout * HZ);
+			      osdc->client->options->osd_keepalive_timeout);
 }
 
 static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
@@ -1576,10 +1576,9 @@ static void handle_timeout(struct work_struct *work)
 {
 	struct ceph_osd_client *osdc =
 		container_of(work, struct ceph_osd_client, timeout_work.work);
+	struct ceph_options *opts = osdc->client->options;
 	struct ceph_osd_request *req;
 	struct ceph_osd *osd;
-	unsigned long keepalive =
-		osdc->client->options->osd_keepalive_timeout * HZ;
 	struct list_head slow_osds;
 	dout("timeout\n");
 	down_read(&osdc->map_sem);
@@ -1595,7 +1594,8 @@ static void handle_timeout(struct work_struct *work)
 	 */
 	INIT_LIST_HEAD(&slow_osds);
 	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
-		if (time_before(jiffies, req->r_stamp + keepalive))
+		if (time_before(jiffies,
+				req->r_stamp + opts->osd_keepalive_timeout))
 			break;
 
 		osd = req->r_osd;
@@ -1622,8 +1622,7 @@ static void handle_osds_timeout(struct work_struct *work)
 	struct ceph_osd_client *osdc =
 		container_of(work, struct ceph_osd_client,
 			     osds_timeout_work.work);
-	unsigned long delay =
-		osdc->client->options->osd_idle_ttl * HZ >> 2;
+	unsigned long delay = osdc->client->options->osd_idle_ttl / 4;
 
 	dout("osds timeout\n");
 	down_read(&osdc->map_sem);
@@ -2628,7 +2627,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	osdc->event_count = 0;
 
 	schedule_delayed_work(&osdc->osds_timeout_work,
-	   round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
+	    round_jiffies_relative(osdc->client->options->osd_idle_ttl));
 
 	err = -ENOMEM;
 	osdc->req_mempool = mempool_create_kmalloc_pool(10,
-- 
cgit v1.2.3


From f66fd9f0952187d274c13c136b74548f792c1925 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Wed, 10 Jun 2015 17:26:13 +0800
Subject: ceph: pre-allocate data structure that tracks caps flushing

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/addr.c               | 19 +++++++++++++++----
 fs/ceph/caps.c               | 26 ++++++++++++++++++++++----
 fs/ceph/file.c               | 18 ++++++++++++++++--
 fs/ceph/inode.c              | 15 +++++++++++++--
 fs/ceph/mds_client.c         |  6 +++++-
 fs/ceph/super.c              |  8 ++++++++
 fs/ceph/super.h              |  6 +++++-
 fs/ceph/xattr.c              | 20 ++++++++++++++++++--
 include/linux/ceph/libceph.h |  1 +
 9 files changed, 103 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5f53ac0d9d7c..7edf3c49e661 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1308,12 +1308,17 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct inode *inode = file_inode(vma->vm_file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_file_info *fi = vma->vm_file->private_data;
+	struct ceph_cap_flush *prealloc_cf;
 	struct page *page = vmf->page;
 	loff_t off = page_offset(page);
 	loff_t size = i_size_read(inode);
 	size_t len;
 	int want, got, ret;
 
+	prealloc_cf = ceph_alloc_cap_flush();
+	if (!prealloc_cf)
+		return VM_FAULT_SIGBUS;
+
 	if (ci->i_inline_version != CEPH_INLINE_NONE) {
 		struct page *locked_page = NULL;
 		if (off == 0) {
@@ -1323,8 +1328,10 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		ret = ceph_uninline_data(vma->vm_file, locked_page);
 		if (locked_page)
 			unlock_page(locked_page);
-		if (ret < 0)
-			return VM_FAULT_SIGBUS;
+		if (ret < 0) {
+			ret = VM_FAULT_SIGBUS;
+			goto out_free;
+		}
 	}
 
 	if (off + PAGE_CACHE_SIZE <= size)
@@ -1346,7 +1353,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 			break;
 		if (ret != -ERESTARTSYS) {
 			WARN_ON(1);
-			return VM_FAULT_SIGBUS;
+			ret = VM_FAULT_SIGBUS;
+			goto out_free;
 		}
 	}
 	dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
@@ -1381,7 +1389,8 @@ out:
 		int dirty;
 		spin_lock(&ci->i_ceph_lock);
 		ci->i_inline_version = CEPH_INLINE_NONE;
-		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+					       &prealloc_cf);
 		spin_unlock(&ci->i_ceph_lock);
 		if (dirty)
 			__mark_inode_dirty(inode, dirty);
@@ -1390,6 +1399,8 @@ out:
 	dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n",
 	     inode, off, len, ceph_cap_string(got), ret);
 	ceph_put_cap_refs(ci, got);
+out_free:
+	ceph_free_cap_flush(prealloc_cf);
 
 	return ret;
 }
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 69a16044ec41..dd7b20adf1d4 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1356,7 +1356,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
  * Caller is then responsible for calling __mark_inode_dirty with the
  * returned flags value.
  */
-int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
+int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
+			   struct ceph_cap_flush **pcf)
 {
 	struct ceph_mds_client *mdsc =
 		ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
@@ -1376,6 +1377,9 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 	     ceph_cap_string(was | mask));
 	ci->i_dirty_caps |= mask;
 	if (was == 0) {
+		WARN_ON_ONCE(ci->i_prealloc_cap_flush);
+		swap(ci->i_prealloc_cap_flush, *pcf);
+
 		if (!ci->i_head_snapc) {
 			WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem));
 			ci->i_head_snapc = ceph_get_snap_context(
@@ -1391,6 +1395,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 			ihold(inode);
 			dirty |= I_DIRTY_SYNC;
 		}
+	} else {
+		WARN_ON_ONCE(!ci->i_prealloc_cap_flush);
 	}
 	BUG_ON(list_empty(&ci->i_dirty_item));
 	if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
@@ -1446,6 +1452,17 @@ static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
 	rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree);
 }
 
+struct ceph_cap_flush *ceph_alloc_cap_flush(void)
+{
+	return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
+}
+
+void ceph_free_cap_flush(struct ceph_cap_flush *cf)
+{
+	if (cf)
+		kmem_cache_free(ceph_cap_flush_cachep, cf);
+}
+
 static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
 {
 	struct rb_node *n = rb_first(&mdsc->cap_flush_tree);
@@ -1469,11 +1486,12 @@ static int __mark_caps_flushing(struct inode *inode,
 {
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	struct ceph_cap_flush *cf;
+	struct ceph_cap_flush *cf = NULL;
 	int flushing;
 
 	BUG_ON(ci->i_dirty_caps == 0);
 	BUG_ON(list_empty(&ci->i_dirty_item));
+	BUG_ON(!ci->i_prealloc_cap_flush);
 
 	flushing = ci->i_dirty_caps;
 	dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
@@ -1484,7 +1502,7 @@ static int __mark_caps_flushing(struct inode *inode,
 	ci->i_dirty_caps = 0;
 	dout(" inode %p now !dirty\n", inode);
 
-	cf = kmalloc(sizeof(*cf), GFP_ATOMIC);
+	swap(cf, ci->i_prealloc_cap_flush);
 	cf->caps = flushing;
 	cf->kick = false;
 
@@ -3075,7 +3093,7 @@ out:
 		cf = list_first_entry(&to_remove,
 				      struct ceph_cap_flush, list);
 		list_del(&cf->list);
-		kfree(cf);
+		ceph_free_cap_flush(cf);
 	}
 	if (drop)
 		iput(inode);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 0a76a370d798..8a4eb4d21d3c 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -939,6 +939,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_osd_client *osdc =
 		&ceph_sb_to_client(inode->i_sb)->client->osdc;
+	struct ceph_cap_flush *prealloc_cf;
 	ssize_t count, written = 0;
 	int err, want, got;
 	loff_t pos;
@@ -946,6 +947,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ceph_snap(inode) != CEPH_NOSNAP)
 		return -EROFS;
 
+	prealloc_cf = ceph_alloc_cap_flush();
+	if (!prealloc_cf)
+		return -ENOMEM;
+
 	mutex_lock(&inode->i_mutex);
 
 	/* We can write back this queue in page reclaim */
@@ -1050,7 +1055,8 @@ retry_snap:
 		int dirty;
 		spin_lock(&ci->i_ceph_lock);
 		ci->i_inline_version = CEPH_INLINE_NONE;
-		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+					       &prealloc_cf);
 		spin_unlock(&ci->i_ceph_lock);
 		if (dirty)
 			__mark_inode_dirty(inode, dirty);
@@ -1074,6 +1080,7 @@ retry_snap:
 out:
 	mutex_unlock(&inode->i_mutex);
 out_unlocked:
+	ceph_free_cap_flush(prealloc_cf);
 	current->backing_dev_info = NULL;
 	return written ? written : err;
 }
@@ -1270,6 +1277,7 @@ static long ceph_fallocate(struct file *file, int mode,
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_osd_client *osdc =
 		&ceph_inode_to_client(inode)->client->osdc;
+	struct ceph_cap_flush *prealloc_cf;
 	int want, got = 0;
 	int dirty;
 	int ret = 0;
@@ -1282,6 +1290,10 @@ static long ceph_fallocate(struct file *file, int mode,
 	if (!S_ISREG(inode->i_mode))
 		return -EOPNOTSUPP;
 
+	prealloc_cf = ceph_alloc_cap_flush();
+	if (!prealloc_cf)
+		return -ENOMEM;
+
 	mutex_lock(&inode->i_mutex);
 
 	if (ceph_snap(inode) != CEPH_NOSNAP) {
@@ -1328,7 +1340,8 @@ static long ceph_fallocate(struct file *file, int mode,
 	if (!ret) {
 		spin_lock(&ci->i_ceph_lock);
 		ci->i_inline_version = CEPH_INLINE_NONE;
-		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+					       &prealloc_cf);
 		spin_unlock(&ci->i_ceph_lock);
 		if (dirty)
 			__mark_inode_dirty(inode, dirty);
@@ -1337,6 +1350,7 @@ static long ceph_fallocate(struct file *file, int mode,
 	ceph_put_cap_refs(ci, got);
 unlock:
 	mutex_unlock(&inode->i_mutex);
+	ceph_free_cap_flush(prealloc_cf);
 	return ret;
 }
 
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 3326302f5884..e86d1a4efc46 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -416,6 +416,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	ci->i_flushing_caps = 0;
 	INIT_LIST_HEAD(&ci->i_dirty_item);
 	INIT_LIST_HEAD(&ci->i_flushing_item);
+	ci->i_prealloc_cap_flush = NULL;
 	ci->i_cap_flush_tree = RB_ROOT;
 	init_waitqueue_head(&ci->i_cap_wq);
 	ci->i_hold_caps_min = 0;
@@ -1720,6 +1721,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	const unsigned int ia_valid = attr->ia_valid;
 	struct ceph_mds_request *req;
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+	struct ceph_cap_flush *prealloc_cf;
 	int issued;
 	int release = 0, dirtied = 0;
 	int mask = 0;
@@ -1734,10 +1736,16 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	if (err != 0)
 		return err;
 
+	prealloc_cf = ceph_alloc_cap_flush();
+	if (!prealloc_cf)
+		return -ENOMEM;
+
 	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
 				       USE_AUTH_MDS);
-	if (IS_ERR(req))
+	if (IS_ERR(req)) {
+		ceph_free_cap_flush(prealloc_cf);
 		return PTR_ERR(req);
+	}
 
 	spin_lock(&ci->i_ceph_lock);
 	issued = __ceph_caps_issued(ci, NULL);
@@ -1895,7 +1903,8 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 		dout("setattr %p ATTR_FILE ... hrm!\n", inode);
 
 	if (dirtied) {
-		inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied);
+		inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
+							   &prealloc_cf);
 		inode->i_ctime = CURRENT_TIME;
 	}
 
@@ -1927,9 +1936,11 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 	ceph_mdsc_put_request(req);
 	if (mask & CEPH_SETATTR_SIZE)
 		__ceph_do_pending_vmtruncate(inode);
+	ceph_free_cap_flush(prealloc_cf);
 	return err;
 out_put:
 	ceph_mdsc_put_request(req);
+	ceph_free_cap_flush(prealloc_cf);
 	return err;
 }
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 89e4305a94d4..8d73fe9d488b 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1189,6 +1189,10 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 		}
 		spin_unlock(&mdsc->cap_dirty_lock);
 
+		if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
+			list_add(&ci->i_prealloc_cap_flush->list, &to_remove);
+			ci->i_prealloc_cap_flush = NULL;
+		}
 	}
 	spin_unlock(&ci->i_ceph_lock);
 	while (!list_empty(&to_remove)) {
@@ -1196,7 +1200,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 		cf = list_first_entry(&to_remove,
 				      struct ceph_cap_flush, list);
 		list_del(&cf->list);
-		kfree(cf);
+		ceph_free_cap_flush(cf);
 	}
 	while (drop--)
 		iput(inode);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index edeb83c43112..d1c833c321b9 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -622,6 +622,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
  */
 struct kmem_cache *ceph_inode_cachep;
 struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_cap_flush_cachep;
 struct kmem_cache *ceph_dentry_cachep;
 struct kmem_cache *ceph_file_cachep;
 
@@ -647,6 +648,10 @@ static int __init init_caches(void)
 				     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
 	if (ceph_cap_cachep == NULL)
 		goto bad_cap;
+	ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
+					   SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+	if (ceph_cap_flush_cachep == NULL)
+		goto bad_cap_flush;
 
 	ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
 					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
@@ -665,6 +670,8 @@ static int __init init_caches(void)
 bad_file:
 	kmem_cache_destroy(ceph_dentry_cachep);
 bad_dentry:
+	kmem_cache_destroy(ceph_cap_flush_cachep);
+bad_cap_flush:
 	kmem_cache_destroy(ceph_cap_cachep);
 bad_cap:
 	kmem_cache_destroy(ceph_inode_cachep);
@@ -681,6 +688,7 @@ static void destroy_caches(void)
 
 	kmem_cache_destroy(ceph_inode_cachep);
 	kmem_cache_destroy(ceph_cap_cachep);
+	kmem_cache_destroy(ceph_cap_flush_cachep);
 	kmem_cache_destroy(ceph_dentry_cachep);
 	kmem_cache_destroy(ceph_file_cachep);
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index e7f13f742357..4415e977d72b 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -309,6 +309,7 @@ struct ceph_inode_info {
 	/* we need to track cap writeback on a per-cap-bit basis, to allow
 	 * overlapping, pipelined cap flushes to the mds.  we can probably
 	 * reduce the tid to 8 bits if we're concerned about inode size. */
+	struct ceph_cap_flush *i_prealloc_cap_flush;
 	struct rb_root i_cap_flush_tree;
 	wait_queue_head_t i_cap_wq;      /* threads waiting on a capability */
 	unsigned long i_hold_caps_min; /* jiffies */
@@ -578,7 +579,10 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
 {
 	return ci->i_dirty_caps | ci->i_flushing_caps;
 }
-extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
+extern struct ceph_cap_flush *ceph_alloc_cap_flush(void);
+extern void ceph_free_cap_flush(struct ceph_cap_flush *cf);
+extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
+				  struct ceph_cap_flush **pcf);
 
 extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
 				      struct ceph_cap *ocap, int mask);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index c6f7d9b82085..819163d8313b 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -912,6 +912,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
 	struct ceph_vxattr *vxattr;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+	struct ceph_cap_flush *prealloc_cf = NULL;
 	int issued;
 	int err;
 	int dirty = 0;
@@ -950,6 +951,10 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
 	if (!xattr)
 		goto out;
 
+	prealloc_cf = ceph_alloc_cap_flush();
+	if (!prealloc_cf)
+		goto out;
+
 	spin_lock(&ci->i_ceph_lock);
 retry:
 	issued = __ceph_caps_issued(ci, NULL);
@@ -991,7 +996,8 @@ retry:
 			  flags, value ? 1 : -1, &xattr);
 
 	if (!err) {
-		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
+					       &prealloc_cf);
 		ci->i_xattrs.dirty = true;
 		inode->i_ctime = CURRENT_TIME;
 	}
@@ -1001,6 +1007,7 @@ retry:
 		up_read(&mdsc->snap_rwsem);
 	if (dirty)
 		__mark_inode_dirty(inode, dirty);
+	ceph_free_cap_flush(prealloc_cf);
 	return err;
 
 do_sync:
@@ -1010,6 +1017,7 @@ do_sync_unlocked:
 		up_read(&mdsc->snap_rwsem);
 	err = ceph_sync_setxattr(dentry, name, value, size, flags);
 out:
+	ceph_free_cap_flush(prealloc_cf);
 	kfree(newname);
 	kfree(newval);
 	kfree(xattr);
@@ -1062,6 +1070,7 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
 	struct ceph_vxattr *vxattr;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
+	struct ceph_cap_flush *prealloc_cf = NULL;
 	int issued;
 	int err;
 	int required_blob_size;
@@ -1079,6 +1088,10 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
 	if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
 		goto do_sync_unlocked;
 
+	prealloc_cf = ceph_alloc_cap_flush();
+	if (!prealloc_cf)
+		return -ENOMEM;
+
 	err = -ENOMEM;
 	spin_lock(&ci->i_ceph_lock);
 retry:
@@ -1120,7 +1133,8 @@ retry:
 
 	err = __remove_xattr_by_name(ceph_inode(inode), name);
 
-	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+	dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
+				       &prealloc_cf);
 	ci->i_xattrs.dirty = true;
 	inode->i_ctime = CURRENT_TIME;
 	spin_unlock(&ci->i_ceph_lock);
@@ -1128,12 +1142,14 @@ retry:
 		up_read(&mdsc->snap_rwsem);
 	if (dirty)
 		__mark_inode_dirty(inode, dirty);
+	ceph_free_cap_flush(prealloc_cf);
 	return err;
 do_sync:
 	spin_unlock(&ci->i_ceph_lock);
 do_sync_unlocked:
 	if (lock_snap_rwsem)
 		up_read(&mdsc->snap_rwsem);
+	ceph_free_cap_flush(prealloc_cf);
 	err = ceph_send_removexattr(dentry, name);
 	return err;
 }
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index d73a569f9bf5..9ebee53d3bf5 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -174,6 +174,7 @@ static inline int calc_pages_for(u64 off, u64 len)
 
 extern struct kmem_cache *ceph_inode_cachep;
 extern struct kmem_cache *ceph_cap_cachep;
+extern struct kmem_cache *ceph_cap_flush_cachep;
 extern struct kmem_cache *ceph_dentry_cachep;
 extern struct kmem_cache *ceph_file_cachep;
 
-- 
cgit v1.2.3


From b459be739f97e2062b2ba77cfe8ea198dbd58904 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 12 Jun 2015 13:21:07 +0300
Subject: crush: sync up with userspace

.. up to ceph.git commit 1db1abc8328d ("crush: eliminate ad hoc diff
between kernel and userspace").  This fixes a bunch of recently pulled
coding style issues and makes includes a bit cleaner.

A patch "crush:Make the function crush_ln static" from Nicholas Krause
<xerofoify@gmail.com> is folded in as crush_ln() has been made static
in userspace as well.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/crush/crush.h     |  40 +++++++++++-
 include/linux/crush/hash.h      |   6 ++
 include/linux/crush/mapper.h    |   2 +-
 net/ceph/crush/crush.c          |  13 ++--
 net/ceph/crush/crush_ln_table.h |  32 +++++-----
 net/ceph/crush/hash.c           |   8 ++-
 net/ceph/crush/mapper.c         | 137 ++++++++++++++++++++++++++--------------
 7 files changed, 160 insertions(+), 78 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 48a1a7d100f1..48b49305716b 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -1,7 +1,11 @@
 #ifndef CEPH_CRUSH_CRUSH_H
 #define CEPH_CRUSH_CRUSH_H
 
-#include <linux/types.h>
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include "crush_compat.h"
+#endif
 
 /*
  * CRUSH is a pseudo-random data distribution algorithm that
@@ -20,7 +24,11 @@
 #define CRUSH_MAGIC 0x00010000ul   /* for detecting algorithm revisions */
 
 #define CRUSH_MAX_DEPTH 10  /* max crush hierarchy depth */
+#define CRUSH_MAX_RULESET (1<<8)  /* max crush ruleset number */
+#define CRUSH_MAX_RULES CRUSH_MAX_RULESET  /* should be the same as max rulesets */
 
+#define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u)
+#define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u)
 
 #define CRUSH_ITEM_UNDEF  0x7ffffffe  /* undefined result (internal use only) */
 #define CRUSH_ITEM_NONE   0x7fffffff  /* no result */
@@ -108,6 +116,15 @@ enum {
 };
 extern const char *crush_bucket_alg_name(int alg);
 
+/*
+ * although tree was a legacy algorithm, it has been buggy, so
+ * exclude it.
+ */
+#define CRUSH_LEGACY_ALLOWED_BUCKET_ALGS (	\
+		(1 << CRUSH_BUCKET_UNIFORM) |	\
+		(1 << CRUSH_BUCKET_LIST) |	\
+		(1 << CRUSH_BUCKET_STRAW))
+
 struct crush_bucket {
 	__s32 id;        /* this'll be negative */
 	__u16 type;      /* non-zero; type=0 is reserved for devices */
@@ -174,7 +191,7 @@ struct crush_map {
 	/* choose local attempts using a fallback permutation before
 	 * re-descent */
 	__u32 choose_local_fallback_tries;
-	/* choose attempts before giving up */ 
+	/* choose attempts before giving up */
 	__u32 choose_total_tries;
 	/* attempt chooseleaf inner descent once for firstn mode; on
 	 * reject retry outer descent.  Note that this does *not*
@@ -187,6 +204,25 @@ struct crush_map {
 	 * that want to limit reshuffling, a value of 3 or 4 will make the
 	 * mappings line up a bit better with previous mappings. */
 	__u8 chooseleaf_vary_r;
+
+#ifndef __KERNEL__
+	/*
+	 * version 0 (original) of straw_calc has various flaws.  version 1
+	 * fixes a few of them.
+	 */
+	__u8 straw_calc_version;
+
+	/*
+	 * allowed bucket algs is a bitmask, here the bit positions
+	 * are CRUSH_BUCKET_*.  note that these are *bits* and
+	 * CRUSH_BUCKET_* values are not, so we need to or together (1
+	 * << CRUSH_BUCKET_WHATEVER).  The 0th bit is not used to
+	 * minimize confusion (bucket type values start at 1).
+	 */
+	__u32 allowed_bucket_algs;
+
+	__u32 *choose_tries;
+#endif
 };
 
 
diff --git a/include/linux/crush/hash.h b/include/linux/crush/hash.h
index 91e884230d5d..d1d90258242e 100644
--- a/include/linux/crush/hash.h
+++ b/include/linux/crush/hash.h
@@ -1,6 +1,12 @@
 #ifndef CEPH_CRUSH_HASH_H
 #define CEPH_CRUSH_HASH_H
 
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include "crush_compat.h"
+#endif
+
 #define CRUSH_HASH_RJENKINS1   0
 
 #define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h
index eab367446eea..5dfd5b1125d2 100644
--- a/include/linux/crush/mapper.h
+++ b/include/linux/crush/mapper.h
@@ -8,7 +8,7 @@
  * LGPL2
  */
 
-#include <linux/crush/crush.h>
+#include "crush.h"
 
 extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size);
 extern int crush_do_rule(const struct crush_map *map,
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 9d84ce4ea0df..80d7c3a97cb8 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -1,15 +1,11 @@
-
 #ifdef __KERNEL__
 # include <linux/slab.h>
+# include <linux/crush/crush.h>
 #else
-# include <stdlib.h>
-# include <assert.h>
-# define kfree(x) do { if (x) free(x); } while (0)
-# define BUG_ON(x) assert(!(x))
+# include "crush_compat.h"
+# include "crush.h"
 #endif
 
-#include <linux/crush/crush.h>
-
 const char *crush_bucket_alg_name(int alg)
 {
 	switch (alg) {
@@ -134,6 +130,9 @@ void crush_destroy(struct crush_map *map)
 		kfree(map->rules);
 	}
 
+#ifndef __KERNEL__
+	kfree(map->choose_tries);
+#endif
 	kfree(map);
 }
 
diff --git a/net/ceph/crush/crush_ln_table.h b/net/ceph/crush/crush_ln_table.h
index 6192c7fc958c..aae534c901a4 100644
--- a/net/ceph/crush/crush_ln_table.h
+++ b/net/ceph/crush/crush_ln_table.h
@@ -10,20 +10,20 @@
  *
  */
 
-#if defined(__linux__)
-#include <linux/types.h>
-#elif defined(__FreeBSD__)
-#include <sys/types.h>
-#endif
-
 #ifndef CEPH_CRUSH_LN_H
 #define CEPH_CRUSH_LN_H
 
+#ifdef __KERNEL__
+# include <linux/types.h>
+#else
+# include "crush_compat.h"
+#endif
 
-// RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0)
-// RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0)
-
-static int64_t __RH_LH_tbl[128*2+2] = {
+/*
+ * RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0)
+ * RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0)
+ */
+static __s64 __RH_LH_tbl[128*2+2] = {
   0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll,
   0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all,
   0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll,
@@ -89,11 +89,12 @@ static int64_t __RH_LH_tbl[128*2+2] = {
   0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll,
   0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll,
   0x0000800000000000ll, 0x0000ffff00000000ll,
-  };
-
+};
 
-    // LL_tbl[k] = 2^48*log2(1.0+k/2^15);
-static int64_t __LL_tbl[256] = {
+/*
+ * LL_tbl[k] = 2^48*log2(1.0+k/2^15)
+ */
+static __s64 __LL_tbl[256] = {
   0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull,
   0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull,
   0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull,
@@ -160,7 +161,4 @@ static int64_t __LL_tbl[256] = {
   0x000002d4562d2ec6ull, 0x000002d73330209dull, 0x000002da102d63b0ull, 0x000002dced24f814ull,
 };
 
-
-
-
 #endif
diff --git a/net/ceph/crush/hash.c b/net/ceph/crush/hash.c
index 5bb63e37a8a1..ed123af49eba 100644
--- a/net/ceph/crush/hash.c
+++ b/net/ceph/crush/hash.c
@@ -1,6 +1,8 @@
-
-#include <linux/types.h>
-#include <linux/crush/hash.h>
+#ifdef __KERNEL__
+# include <linux/crush/hash.h>
+#else
+# include "hash.h"
+#endif
 
 /*
  * Robert Jenkins' function for mixing 32-bit values
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 7568cb59b9e5..393bfb22d5bb 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -1,27 +1,31 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Intel Corporation All Rights Reserved
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
 
 #ifdef __KERNEL__
 # include <linux/string.h>
 # include <linux/slab.h>
 # include <linux/bug.h>
 # include <linux/kernel.h>
-# ifndef dprintk
-#  define dprintk(args...)
-# endif
+# include <linux/crush/crush.h>
+# include <linux/crush/hash.h>
 #else
-# include <string.h>
-# include <stdio.h>
-# include <stdlib.h>
-# include <assert.h>
-# define BUG_ON(x) assert(!(x))
-# define dprintk(args...) /* printf(args) */
-# define kmalloc(x, f) malloc(x)
-# define kfree(x) free(x)
+# include "crush_compat.h"
+# include "crush.h"
+# include "hash.h"
 #endif
-
-#include <linux/crush/crush.h>
-#include <linux/crush/hash.h>
 #include "crush_ln_table.h"
 
+#define dprintk(args...) /* printf(args) */
+
 /*
  * Implement the core CRUSH mapping algorithm.
  */
@@ -139,7 +143,7 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
 	int i;
 
 	for (i = bucket->h.size-1; i >= 0; i--) {
-		__u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
+		__u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
 					 r, bucket->h.id);
 		w &= 0xffff;
 		dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
@@ -238,43 +242,46 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
 	return bucket->h.items[high];
 }
 
-// compute 2^44*log2(input+1)
-uint64_t crush_ln(unsigned xin)
+/* compute 2^44*log2(input+1) */
+static __u64 crush_ln(unsigned int xin)
 {
-    unsigned x=xin, x1;
-    int iexpon, index1, index2;
-    uint64_t RH, LH, LL, xl64, result;
+	unsigned int x = xin, x1;
+	int iexpon, index1, index2;
+	__u64 RH, LH, LL, xl64, result;
 
-    x++;
+	x++;
 
-    // normalize input
-    iexpon = 15;
-    while(!(x&0x18000)) { x<<=1; iexpon--; }
+	/* normalize input */
+	iexpon = 15;
+	while (!(x & 0x18000)) {
+		x <<= 1;
+		iexpon--;
+	}
 
-    index1 = (x>>8)<<1;
-    // RH ~ 2^56/index1
-    RH = __RH_LH_tbl[index1 - 256];
-    // LH ~ 2^48 * log2(index1/256)
-    LH = __RH_LH_tbl[index1 + 1 - 256];
+	index1 = (x >> 8) << 1;
+	/* RH ~ 2^56/index1 */
+	RH = __RH_LH_tbl[index1 - 256];
+	/* LH ~ 2^48 * log2(index1/256) */
+	LH = __RH_LH_tbl[index1 + 1 - 256];
 
-    // RH*x ~ 2^48 * (2^15 + xf), xf<2^8
-    xl64 = (int64_t)x * RH;
-    xl64 >>= 48;
-    x1 = xl64;
+	/* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
+	xl64 = (__s64)x * RH;
+	xl64 >>= 48;
+	x1 = xl64;
 
-    result = iexpon;
-    result <<= (12 + 32);
+	result = iexpon;
+	result <<= (12 + 32);
 
-    index2 = x1 & 0xff;
-    // LL ~ 2^48*log2(1.0+index2/2^15)
-    LL = __LL_tbl[index2];
+	index2 = x1 & 0xff;
+	/* LL ~ 2^48*log2(1.0+index2/2^15) */
+	LL = __LL_tbl[index2];
 
-    LH = LH + LL;
+	LH = LH + LL;
 
-    LH >>= (48-12 - 32);
-    result += LH;
+	LH >>= (48 - 12 - 32);
+	result += LH;
 
-    return result;
+	return result;
 }
 
 
@@ -290,9 +297,9 @@ uint64_t crush_ln(unsigned xin)
 static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
 				int x, int r)
 {
-	unsigned i, high = 0;
-	unsigned u;
-	unsigned w;
+	unsigned int i, high = 0;
+	unsigned int u;
+	unsigned int w;
 	__s64 ln, draw, high_draw = 0;
 
 	for (i = 0; i < bucket->h.size; i++) {
@@ -567,6 +574,10 @@ reject:
 		out[outpos] = item;
 		outpos++;
 		count--;
+#ifndef __KERNEL__
+		if (map->choose_tries && ftotal <= map->choose_total_tries)
+			map->choose_tries[ftotal]++;
+#endif
 	}
 
 	dprintk("CHOOSE returns %d\n", outpos);
@@ -610,6 +621,20 @@ static void crush_choose_indep(const struct crush_map *map,
 	}
 
 	for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
+#ifdef DEBUG_INDEP
+		if (out2 && ftotal) {
+			dprintk("%u %d a: ", ftotal, left);
+			for (rep = outpos; rep < endpos; rep++) {
+				dprintk(" %d", out[rep]);
+			}
+			dprintk("\n");
+			dprintk("%u %d b: ", ftotal, left);
+			for (rep = outpos; rep < endpos; rep++) {
+				dprintk(" %d", out2[rep]);
+			}
+			dprintk("\n");
+		}
+#endif
 		for (rep = outpos; rep < endpos; rep++) {
 			if (out[rep] != CRUSH_ITEM_UNDEF)
 				continue;
@@ -726,6 +751,24 @@ static void crush_choose_indep(const struct crush_map *map,
 			out2[rep] = CRUSH_ITEM_NONE;
 		}
 	}
+#ifndef __KERNEL__
+	if (map->choose_tries && ftotal <= map->choose_total_tries)
+		map->choose_tries[ftotal]++;
+#endif
+#ifdef DEBUG_INDEP
+	if (out2) {
+		dprintk("%u %d a: ", ftotal, left);
+		for (rep = outpos; rep < endpos; rep++) {
+			dprintk(" %d", out[rep]);
+		}
+		dprintk("\n");
+		dprintk("%u %d b: ", ftotal, left);
+		for (rep = outpos; rep < endpos; rep++) {
+			dprintk(" %d", out2[rep]);
+		}
+		dprintk("\n");
+	}
+#endif
 }
 
 /**
@@ -884,7 +927,7 @@ int crush_do_rule(const struct crush_map *map,
 						0);
 				} else {
 					out_size = ((numrep < (result_max-osize)) ?
-                                                    numrep : (result_max-osize));
+						    numrep : (result_max-osize));
 					crush_choose_indep(
 						map,
 						map->buckets[-1-w[i]],
@@ -930,5 +973,3 @@ int crush_do_rule(const struct crush_map *map,
 	}
 	return result_len;
 }
-
-
-- 
cgit v1.2.3


From 7c494375b773497228502133879072a9e959c063 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Date: Mon, 1 Jun 2015 10:35:16 -0700
Subject: Input: improve parsing OF parameters for touchscreens

When applying touchscreen parameters specified in device tree let's make
sure we keep whatever setup was done by the driver and not reset the
missing values to zero.

Reported-by: Pavel Machek <pavel@ucw.cz>
Tested-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/touchscreen/edt-ft5x06.c     |  2 +-
 drivers/input/touchscreen/of_touchscreen.c | 69 +++++++++++++++++++-----------
 drivers/input/touchscreen/tsc2005.c        |  2 +-
 include/linux/input/touchscreen.h          |  5 ++-
 4 files changed, 49 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/touchscreen/edt-ft5x06.c b/drivers/input/touchscreen/edt-ft5x06.c
index 29d179a1953b..394b1de9a2a3 100644
--- a/drivers/input/touchscreen/edt-ft5x06.c
+++ b/drivers/input/touchscreen/edt-ft5x06.c
@@ -1041,7 +1041,7 @@ static int edt_ft5x06_ts_probe(struct i2c_client *client,
 			     0, tsdata->num_y * 64 - 1, 0, 0);
 
 	if (!pdata)
-		touchscreen_parse_of_params(input);
+		touchscreen_parse_of_params(input, true);
 
 	error = input_mt_init_slots(input, MAX_SUPPORT_POINTS, INPUT_MT_DIRECT);
 	if (error) {
diff --git a/drivers/input/touchscreen/of_touchscreen.c b/drivers/input/touchscreen/of_touchscreen.c
index b82b5207c78b..806cd0ad160f 100644
--- a/drivers/input/touchscreen/of_touchscreen.c
+++ b/drivers/input/touchscreen/of_touchscreen.c
@@ -14,14 +14,22 @@
 #include <linux/input/mt.h>
 #include <linux/input/touchscreen.h>
 
-static u32 of_get_optional_u32(struct device_node *np,
-			       const char *property)
+static bool touchscreen_get_prop_u32(struct device_node *np,
+				     const char *property,
+				     unsigned int default_value,
+				     unsigned int *value)
 {
-	u32 val = 0;
+	u32 val;
+	int error;
 
-	of_property_read_u32(np, property, &val);
+	error = of_property_read_u32(np, property, &val);
+	if (error) {
+		*value = default_value;
+		return false;
+	}
 
-	return val;
+	*value = val;
+	return true;
 }
 
 static void touchscreen_set_params(struct input_dev *dev,
@@ -54,34 +62,45 @@ static void touchscreen_set_params(struct input_dev *dev,
  * input device accordingly. The function keeps previously setuped default
  * values if no value is specified via DT.
  */
-void touchscreen_parse_of_params(struct input_dev *dev)
+void touchscreen_parse_of_params(struct input_dev *dev, bool multitouch)
 {
 	struct device_node *np = dev->dev.parent->of_node;
-	u32 maximum, fuzz;
+	unsigned int axis;
+	unsigned int maximum, fuzz;
+	bool data_present;
 
 	input_alloc_absinfo(dev);
 	if (!dev->absinfo)
 		return;
 
-	maximum = of_get_optional_u32(np, "touchscreen-size-x");
-	fuzz = of_get_optional_u32(np, "touchscreen-fuzz-x");
-	if (maximum || fuzz) {
-		touchscreen_set_params(dev, ABS_X, maximum, fuzz);
-		touchscreen_set_params(dev, ABS_MT_POSITION_X, maximum, fuzz);
-	}
+	axis = multitouch ? ABS_MT_POSITION_X : ABS_X;
+	data_present = touchscreen_get_prop_u32(np, "touchscreen-size-x",
+						input_abs_get_max(dev, axis),
+						&maximum) |
+		       touchscreen_get_prop_u32(np, "touchscreen-fuzz-x",
+						input_abs_get_fuzz(dev, axis),
+						&fuzz);
+	if (data_present)
+		touchscreen_set_params(dev, axis, maximum, fuzz);
 
-	maximum = of_get_optional_u32(np, "touchscreen-size-y");
-	fuzz = of_get_optional_u32(np, "touchscreen-fuzz-y");
-	if (maximum || fuzz) {
-		touchscreen_set_params(dev, ABS_Y, maximum, fuzz);
-		touchscreen_set_params(dev, ABS_MT_POSITION_Y, maximum, fuzz);
-	}
+	axis = multitouch ? ABS_MT_POSITION_Y : ABS_Y;
+	data_present = touchscreen_get_prop_u32(np, "touchscreen-size-y",
+						input_abs_get_max(dev, axis),
+						&maximum) |
+		       touchscreen_get_prop_u32(np, "touchscreen-fuzz-y",
+						input_abs_get_fuzz(dev, axis),
+						&fuzz);
+	if (data_present)
+		touchscreen_set_params(dev, axis, maximum, fuzz);
 
-	maximum = of_get_optional_u32(np, "touchscreen-max-pressure");
-	fuzz = of_get_optional_u32(np, "touchscreen-fuzz-pressure");
-	if (maximum || fuzz) {
-		touchscreen_set_params(dev, ABS_PRESSURE, maximum, fuzz);
-		touchscreen_set_params(dev, ABS_MT_PRESSURE, maximum, fuzz);
-	}
+	axis = multitouch ? ABS_MT_PRESSURE : ABS_PRESSURE;
+	data_present = touchscreen_get_prop_u32(np, "touchscreen-max-pressure",
+						input_abs_get_max(dev, axis),
+						&maximum) |
+		       touchscreen_get_prop_u32(np, "touchscreen-fuzz-pressure",
+						input_abs_get_fuzz(dev, axis),
+						&fuzz);
+	if (data_present)
+		touchscreen_set_params(dev, axis, maximum, fuzz);
 }
 EXPORT_SYMBOL(touchscreen_parse_of_params);
diff --git a/drivers/input/touchscreen/tsc2005.c b/drivers/input/touchscreen/tsc2005.c
index 72657c579430..d8c025b0f88c 100644
--- a/drivers/input/touchscreen/tsc2005.c
+++ b/drivers/input/touchscreen/tsc2005.c
@@ -709,7 +709,7 @@ static int tsc2005_probe(struct spi_device *spi)
 	input_set_abs_params(input_dev, ABS_PRESSURE, 0, max_p, fudge_p, 0);
 
 	if (np)
-		touchscreen_parse_of_params(input_dev);
+		touchscreen_parse_of_params(input_dev, false);
 
 	input_dev->open = tsc2005_open;
 	input_dev->close = tsc2005_close;
diff --git a/include/linux/input/touchscreen.h b/include/linux/input/touchscreen.h
index 08a5ef6e8f25..eecc9ea6cd58 100644
--- a/include/linux/input/touchscreen.h
+++ b/include/linux/input/touchscreen.h
@@ -12,9 +12,10 @@
 #include <linux/input.h>
 
 #ifdef CONFIG_OF
-void touchscreen_parse_of_params(struct input_dev *dev);
+void touchscreen_parse_of_params(struct input_dev *dev, bool multitouch);
 #else
-static inline void touchscreen_parse_of_params(struct input_dev *dev)
+static inline void touchscreen_parse_of_params(struct input_dev *dev,
+					       bool multitouch)
 {
 }
 #endif
-- 
cgit v1.2.3


From 6c5a0d891543873aefc3aaf846c1e7afe0982ff9 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sat, 27 Jun 2015 11:45:46 -0400
Subject: NFSv4.2: LAYOUTSTATS is optional to implement

Make it so, by checking the return value for NFS4ERR_MOTSUPP and
caching the information as a server capability.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs42proc.c        | 10 ++++++++--
 fs/nfs/nfs4proc.c         |  3 ++-
 fs/nfs/pnfs.c             |  3 +++
 include/linux/nfs_fs_sb.h |  1 +
 4 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 06c74cd93875..f486b80f927a 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -189,9 +189,15 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
 	if (!nfs4_sequence_done(task, &data->res.seq_res))
 		return;
 
-	/* well, we don't care about errors at all! */
-	if (task->tk_status)
+	switch (task->tk_status) {
+	case 0:
+		break;
+	case -ENOTSUPP:
+	case -EOPNOTSUPP:
+		NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
+	default:
 		dprintk("%s server returns %d\n", __func__, task->tk_status);
+	}
 }
 
 static void
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 643ce3a91b22..8eab42407d39 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -8635,7 +8635,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
 		| NFS_CAP_ATOMIC_OPEN_V1
 		| NFS_CAP_ALLOCATE
 		| NFS_CAP_DEALLOCATE
-		| NFS_CAP_SEEK,
+		| NFS_CAP_SEEK
+		| NFS_CAP_LAYOUTSTATS,
 	.init_client = nfs41_init_client,
 	.shutdown_client = nfs41_shutdown_client,
 	.match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 40bacebb5b97..0ba9a02c9566 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2266,6 +2266,9 @@ pnfs_report_layoutstat(struct inode *inode)
 	if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
 		goto out;
 
+	if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS))
+		goto out;
+
 	if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
 		goto out;
 
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 5e1273d4de14..a2ea1491d3df 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -237,5 +237,6 @@ struct nfs_server {
 #define NFS_CAP_SEEK		(1U << 19)
 #define NFS_CAP_ALLOCATE	(1U << 20)
 #define NFS_CAP_DEALLOCATE	(1U << 21)
+#define NFS_CAP_LAYOUTSTATS	(1U << 22)
 
 #endif
-- 
cgit v1.2.3


From 31f02455455d405320e2f749696bef4e02903b35 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 30 Jun 2015 12:07:17 -0400
Subject: sparse: fix misplaced __pmem definition

Move the definition of __pmem outside of CONFIG_SPARSE_RCU_POINTER to fix:

drivers/nvdimm/pmem.c:198:17: sparse: too many arguments for function __builtin_expect
drivers/nvdimm/pmem.c:36:33: sparse: expected ; at end of declaration
drivers/nvdimm/pmem.c:48:21: sparse: void declaration

...due to __pmem failing to be defined in some configurations when
CONFIG_SPARSE_RCU_POINTER=y.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/compiler.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 26fc8bc77f85..d8fbd500330e 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -17,11 +17,11 @@
 # define __release(x)	__context__(x,-1)
 # define __cond_lock(x,c)	((c) ? ({ __acquire(x); 1; }) : 0)
 # define __percpu	__attribute__((noderef, address_space(3)))
+# define __pmem		__attribute__((noderef, address_space(5)))
 #ifdef CONFIG_SPARSE_RCU_POINTER
 # define __rcu		__attribute__((noderef, address_space(4)))
 #else
 # define __rcu
-# define __pmem		__attribute__((noderef, address_space(5)))
 #endif
 extern void __chk_user_ptr(const volatile void __user *);
 extern void __chk_io_ptr(const volatile void __iomem *);
-- 
cgit v1.2.3


From 8a81252b774b53e628a8a0fe18e2b8fc236d92cc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 30 Jun 2015 15:54:08 +0200
Subject: fs/file.c: don't acquire files->file_lock in fd_install()

Mateusz Guzik reported :

 Currently obtaining a new file descriptor results in locking fdtable
 twice - once in order to reserve a slot and second time to fill it.

Holding the spinlock in __fd_install() is needed in case a resize is
done, or to prevent a resize.

Mateusz provided an RFC patch and a micro benchmark :
  http://people.redhat.com/~mguzik/pipebench.c

A resize is an unlikely operation in a process lifetime,
as table size is at least doubled at every resize.

We can use RCU instead of the spinlock.

__fd_install() must wait if a resize is in progress.

The resize must block new __fd_install() callers from starting,
and wait that ongoing install are finished (synchronize_sched())

resize should be attempted by a single thread to not waste resources.

rcu_sched variant is used, as __fd_install() and expand_fdtable() run
from process context.

It gives us a ~30% speedup using pipebench on a dual Intel(R) Xeon(R)
CPU E5-2696 v2 @ 2.50GHz

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Mateusz Guzik <mguzik@redhat.com>
Acked-by: Mateusz Guzik <mguzik@redhat.com>
Tested-by: Mateusz Guzik <mguzik@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting |  4 +++
 fs/file.c                         | 67 ++++++++++++++++++++++++++++-----------
 include/linux/fdtable.h           |  3 ++
 3 files changed, 55 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 3eae250254d5..ec5456113072 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -500,3 +500,7 @@ in your dentry operations instead.
 	dentry,  it does not get nameidata at all and it gets called only when cookie
 	is non-NULL.  Note that link body isn't available anymore, so if you need it,
 	store it as cookie.
+--
+[mandatory]
+	__fd_install() & fd_install() can now sleep. Callers should not
+	hold a spinlock	or other resources that do not allow a schedule.
diff --git a/fs/file.c b/fs/file.c
index 93c5f89c248b..3d2eb4c542a4 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -147,6 +147,13 @@ static int expand_fdtable(struct files_struct *files, int nr)
 
 	spin_unlock(&files->file_lock);
 	new_fdt = alloc_fdtable(nr);
+
+	/* make sure all __fd_install() have seen resize_in_progress
+	 * or have finished their rcu_read_lock_sched() section.
+	 */
+	if (atomic_read(&files->count) > 1)
+		synchronize_sched();
+
 	spin_lock(&files->file_lock);
 	if (!new_fdt)
 		return -ENOMEM;
@@ -158,21 +165,14 @@ static int expand_fdtable(struct files_struct *files, int nr)
 		__free_fdtable(new_fdt);
 		return -EMFILE;
 	}
-	/*
-	 * Check again since another task may have expanded the fd table while
-	 * we dropped the lock
-	 */
 	cur_fdt = files_fdtable(files);
-	if (nr >= cur_fdt->max_fds) {
-		/* Continue as planned */
-		copy_fdtable(new_fdt, cur_fdt);
-		rcu_assign_pointer(files->fdt, new_fdt);
-		if (cur_fdt != &files->fdtab)
-			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
-	} else {
-		/* Somebody else expanded, so undo our attempt */
-		__free_fdtable(new_fdt);
-	}
+	BUG_ON(nr < cur_fdt->max_fds);
+	copy_fdtable(new_fdt, cur_fdt);
+	rcu_assign_pointer(files->fdt, new_fdt);
+	if (cur_fdt != &files->fdtab)
+		call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
+	/* coupled with smp_rmb() in __fd_install() */
+	smp_wmb();
 	return 1;
 }
 
@@ -185,21 +185,38 @@ static int expand_fdtable(struct files_struct *files, int nr)
  * The files->file_lock should be held on entry, and will be held on exit.
  */
 static int expand_files(struct files_struct *files, int nr)
+	__releases(files->file_lock)
+	__acquires(files->file_lock)
 {
 	struct fdtable *fdt;
+	int expanded = 0;
 
+repeat:
 	fdt = files_fdtable(files);
 
 	/* Do we need to expand? */
 	if (nr < fdt->max_fds)
-		return 0;
+		return expanded;
 
 	/* Can we expand? */
 	if (nr >= sysctl_nr_open)
 		return -EMFILE;
 
+	if (unlikely(files->resize_in_progress)) {
+		spin_unlock(&files->file_lock);
+		expanded = 1;
+		wait_event(files->resize_wait, !files->resize_in_progress);
+		spin_lock(&files->file_lock);
+		goto repeat;
+	}
+
 	/* All good, so we try */
-	return expand_fdtable(files, nr);
+	files->resize_in_progress = true;
+	expanded = expand_fdtable(files, nr);
+	files->resize_in_progress = false;
+
+	wake_up_all(&files->resize_wait);
+	return expanded;
 }
 
 static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
@@ -256,6 +273,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	atomic_set(&newf->count, 1);
 
 	spin_lock_init(&newf->file_lock);
+	newf->resize_in_progress = false;
+	init_waitqueue_head(&newf->resize_wait);
 	newf->next_fd = 0;
 	new_fdt = &newf->fdtab;
 	new_fdt->max_fds = NR_OPEN_DEFAULT;
@@ -553,11 +572,21 @@ void __fd_install(struct files_struct *files, unsigned int fd,
 		struct file *file)
 {
 	struct fdtable *fdt;
-	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
+
+	might_sleep();
+	rcu_read_lock_sched();
+
+	while (unlikely(files->resize_in_progress)) {
+		rcu_read_unlock_sched();
+		wait_event(files->resize_wait, !files->resize_in_progress);
+		rcu_read_lock_sched();
+	}
+	/* coupled with smp_wmb() in expand_fdtable() */
+	smp_rmb();
+	fdt = rcu_dereference_sched(files->fdt);
 	BUG_ON(fdt->fd[fd] != NULL);
 	rcu_assign_pointer(fdt->fd[fd], file);
-	spin_unlock(&files->file_lock);
+	rcu_read_unlock_sched();
 }
 
 void fd_install(unsigned int fd, struct file *file)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 230f87bdf5ad..fbb88740634a 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -47,6 +47,9 @@ struct files_struct {
    * read mostly part
    */
 	atomic_t count;
+	bool resize_in_progress;
+	wait_queue_head_t resize_wait;
+
 	struct fdtable __rcu *fdt;
 	struct fdtable fdtab;
   /*
-- 
cgit v1.2.3


From fbabfd0f4ee2e8847bf56edf481249ad1bb8c44d Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 9 May 2015 15:54:49 -0500
Subject: fs: Add helper functions for permanently empty directories.

To ensure it is safe to mount proc and sysfs I need to check if
filesystems that are mounted on top of them are mounted on truly empty
directories.  Given that some directories can gain entries over time,
knowing that a directory is empty right now is insufficient.

Therefore add supporting infrastructure for permantently empty
directories that proc and sysfs can use when they create mount points
for filesystems and fs_fully_visible can use to test for permanently
empty directories to ensure that nothing will be gained by mounting a
fresh copy of proc or sysfs.

Cc: stable@vger.kernel.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/libfs.c         | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  2 ++
 2 files changed, 98 insertions(+)

(limited to 'include/linux')

diff --git a/fs/libfs.c b/fs/libfs.c
index cb1fb4b9b637..02813592e121 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1093,3 +1093,99 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
 	return -EINVAL;
 }
 EXPORT_SYMBOL(simple_nosetlease);
+
+
+/*
+ * Operations for a permanently empty directory.
+ */
+static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	return ERR_PTR(-ENOENT);
+}
+
+static int empty_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				 struct kstat *stat)
+{
+	struct inode *inode = d_inode(dentry);
+	generic_fillattr(inode, stat);
+	return 0;
+}
+
+static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	return -EPERM;
+}
+
+static int empty_dir_setxattr(struct dentry *dentry, const char *name,
+			      const void *value, size_t size, int flags)
+{
+	return -EOPNOTSUPP;
+}
+
+static ssize_t empty_dir_getxattr(struct dentry *dentry, const char *name,
+				  void *value, size_t size)
+{
+	return -EOPNOTSUPP;
+}
+
+static int empty_dir_removexattr(struct dentry *dentry, const char *name)
+{
+	return -EOPNOTSUPP;
+}
+
+static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+	return -EOPNOTSUPP;
+}
+
+static const struct inode_operations empty_dir_inode_operations = {
+	.lookup		= empty_dir_lookup,
+	.permission	= generic_permission,
+	.setattr	= empty_dir_setattr,
+	.getattr	= empty_dir_getattr,
+	.setxattr	= empty_dir_setxattr,
+	.getxattr	= empty_dir_getxattr,
+	.removexattr	= empty_dir_removexattr,
+	.listxattr	= empty_dir_listxattr,
+};
+
+static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+	/* An empty directory has two entries . and .. at offsets 0 and 1 */
+	return generic_file_llseek_size(file, offset, whence, 2, 2);
+}
+
+static int empty_dir_readdir(struct file *file, struct dir_context *ctx)
+{
+	dir_emit_dots(file, ctx);
+	return 0;
+}
+
+static const struct file_operations empty_dir_operations = {
+	.llseek		= empty_dir_llseek,
+	.read		= generic_read_dir,
+	.iterate	= empty_dir_readdir,
+	.fsync		= noop_fsync,
+};
+
+
+void make_empty_dir_inode(struct inode *inode)
+{
+	set_nlink(inode, 2);
+	inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
+	inode->i_uid = GLOBAL_ROOT_UID;
+	inode->i_gid = GLOBAL_ROOT_GID;
+	inode->i_rdev = 0;
+	inode->i_size = 2;
+	inode->i_blkbits = PAGE_SHIFT;
+	inode->i_blocks = 0;
+
+	inode->i_op = &empty_dir_inode_operations;
+	inode->i_fop = &empty_dir_operations;
+}
+
+bool is_empty_dir_inode(struct inode *inode)
+{
+	return (inode->i_fop == &empty_dir_operations) &&
+		(inode->i_op == &empty_dir_inode_operations);
+}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2d24eeb8e59c..571aab91bfc0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2780,6 +2780,8 @@ extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned in
 extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
 extern const struct file_operations simple_dir_operations;
 extern const struct inode_operations simple_dir_inode_operations;
+extern void make_empty_dir_inode(struct inode *inode);
+extern bool is_empty_dir_inode(struct inode *inode);
 struct tree_descr { char *name; const struct file_operations *ops; int mode; };
 struct dentry *d_alloc_name(struct dentry *, const char *);
 extern int simple_fill_super(struct super_block *, unsigned long, struct tree_descr *);
-- 
cgit v1.2.3


From f9bd6733d3f11e24f3949becf277507d422ee1eb Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 9 May 2015 22:09:14 -0500
Subject: sysctl: Allow creating permanently empty directories that serve as
 mountpoints.

Add a magic sysctl table sysctl_mount_point that when used to
create a directory forces that directory to be permanently empty.

Update the code to use make_empty_dir_inode when accessing permanently
empty directories.

Update the code to not allow adding to permanently empty directories.

Update /proc/sys/fs/binfmt_misc to be a permanently empty directory.

Cc: stable@vger.kernel.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/proc/proc_sysctl.c  | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/sysctl.h |  3 +++
 kernel/sysctl.c        |  8 +-------
 3 files changed, 41 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index fea2561d773b..fdda62e6115e 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -19,6 +19,28 @@ static const struct inode_operations proc_sys_inode_operations;
 static const struct file_operations proc_sys_dir_file_operations;
 static const struct inode_operations proc_sys_dir_operations;
 
+/* Support for permanently empty directories */
+
+struct ctl_table sysctl_mount_point[] = {
+	{ }
+};
+
+static bool is_empty_dir(struct ctl_table_header *head)
+{
+	return head->ctl_table[0].child == sysctl_mount_point;
+}
+
+static void set_empty_dir(struct ctl_dir *dir)
+{
+	dir->header.ctl_table[0].child = sysctl_mount_point;
+}
+
+static void clear_empty_dir(struct ctl_dir *dir)
+
+{
+	dir->header.ctl_table[0].child = NULL;
+}
+
 void proc_sys_poll_notify(struct ctl_table_poll *poll)
 {
 	if (!poll)
@@ -187,6 +209,17 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
 	struct ctl_table *entry;
 	int err;
 
+	/* Is this a permanently empty directory? */
+	if (is_empty_dir(&dir->header))
+		return -EROFS;
+
+	/* Am I creating a permanently empty directory? */
+	if (header->ctl_table == sysctl_mount_point) {
+		if (!RB_EMPTY_ROOT(&dir->root))
+			return -EINVAL;
+		set_empty_dir(dir);
+	}
+
 	dir->header.nreg++;
 	header->parent = dir;
 	err = insert_links(header);
@@ -202,6 +235,8 @@ fail:
 	erase_header(header);
 	put_links(header);
 fail_links:
+	if (header->ctl_table == sysctl_mount_point)
+		clear_empty_dir(dir);
 	header->parent = NULL;
 	drop_sysctl_table(&dir->header);
 	return err;
@@ -419,6 +454,8 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 		inode->i_mode |= S_IFDIR;
 		inode->i_op = &proc_sys_dir_operations;
 		inode->i_fop = &proc_sys_dir_file_operations;
+		if (is_empty_dir(head))
+			make_empty_dir_inode(inode);
 	}
 out:
 	return inode;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 795d5fea5697..fa7bc29925c9 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -188,6 +188,9 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
 void unregister_sysctl_table(struct ctl_table_header * table);
 
 extern int sysctl_init(void);
+
+extern struct ctl_table sysctl_mount_point[];
+
 #else /* CONFIG_SYSCTL */
 static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
 {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2082b1a88fb9..c3eee4c6d6c1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1531,12 +1531,6 @@ static struct ctl_table vm_table[] = {
 	{ }
 };
 
-#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
-static struct ctl_table binfmt_misc_table[] = {
-	{ }
-};
-#endif
-
 static struct ctl_table fs_table[] = {
 	{
 		.procname	= "inode-nr",
@@ -1690,7 +1684,7 @@ static struct ctl_table fs_table[] = {
 	{
 		.procname	= "binfmt_misc",
 		.mode		= 0555,
-		.child		= binfmt_misc_table,
+		.child		= sysctl_mount_point,
 	},
 #endif
 	{
-- 
cgit v1.2.3


From ea015218f2f7ace2dad9cedd21ed95bdba2886d7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 13 May 2015 16:09:29 -0500
Subject: kernfs: Add support for always empty directories.

Add a new function kernfs_create_empty_dir that can be used to create
directory that can not be modified.

Update the code to use make_empty_dir_inode when reporting a
permanently empty directory to the vfs.

Update the code to not allow adding to permanently empty directories.

Cc: stable@vger.kernel.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/kernfs/dir.c        | 38 +++++++++++++++++++++++++++++++++++++-
 fs/kernfs/inode.c      |  2 ++
 include/linux/kernfs.h |  3 +++
 3 files changed, 42 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index f131fc23ffc4..47dc636d80ed 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -585,6 +585,9 @@ int kernfs_add_one(struct kernfs_node *kn)
 		goto out_unlock;
 
 	ret = -ENOENT;
+	if (parent->flags & KERNFS_EMPTY_DIR)
+		goto out_unlock;
+
 	if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
 		goto out_unlock;
 
@@ -776,6 +779,38 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
 	return ERR_PTR(rc);
 }
 
+/**
+ * kernfs_create_empty_dir - create an always empty directory
+ * @parent: parent in which to create a new directory
+ * @name: name of the new directory
+ *
+ * Returns the created node on success, ERR_PTR() value on failure.
+ */
+struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
+					    const char *name)
+{
+	struct kernfs_node *kn;
+	int rc;
+
+	/* allocate */
+	kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, KERNFS_DIR);
+	if (!kn)
+		return ERR_PTR(-ENOMEM);
+
+	kn->flags |= KERNFS_EMPTY_DIR;
+	kn->dir.root = parent->dir.root;
+	kn->ns = NULL;
+	kn->priv = NULL;
+
+	/* link in */
+	rc = kernfs_add_one(kn);
+	if (!rc)
+		return kn;
+
+	kernfs_put(kn);
+	return ERR_PTR(rc);
+}
+
 static struct dentry *kernfs_iop_lookup(struct inode *dir,
 					struct dentry *dentry,
 					unsigned int flags)
@@ -1247,7 +1282,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
 	mutex_lock(&kernfs_mutex);
 
 	error = -ENOENT;
-	if (!kernfs_active(kn) || !kernfs_active(new_parent))
+	if (!kernfs_active(kn) || !kernfs_active(new_parent) ||
+	    (new_parent->flags & KERNFS_EMPTY_DIR))
 		goto out;
 
 	error = 0;
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 2da8493a380b..756dd56aaf60 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -296,6 +296,8 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
 	case KERNFS_DIR:
 		inode->i_op = &kernfs_dir_iops;
 		inode->i_fop = &kernfs_dir_fops;
+		if (kn->flags & KERNFS_EMPTY_DIR)
+			make_empty_dir_inode(inode);
 		break;
 	case KERNFS_FILE:
 		inode->i_size = kn->attr.size;
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 71ecdab1671b..29d1896c3ba5 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -45,6 +45,7 @@ enum kernfs_node_flag {
 	KERNFS_LOCKDEP		= 0x0100,
 	KERNFS_SUICIDAL		= 0x0400,
 	KERNFS_SUICIDED		= 0x0800,
+	KERNFS_EMPTY_DIR	= 0x1000,
 };
 
 /* @flags for kernfs_create_root() */
@@ -285,6 +286,8 @@ void kernfs_destroy_root(struct kernfs_root *root);
 struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
 					 const char *name, umode_t mode,
 					 void *priv, const void *ns);
+struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
+					    const char *name);
 struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
 					 const char *name,
 					 umode_t mode, loff_t size,
-- 
cgit v1.2.3


From 87d2846fcf88113fae2341da1ca9a71f0d916f2c Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Wed, 13 May 2015 16:31:40 -0500
Subject: sysfs: Add support for permanently empty directories to serve as
 mount points.

Add two functions sysfs_create_mount_point and
sysfs_remove_mount_point that hang a permanently empty directory off
of a kobject or remove a permanently emptpy directory hanging from a
kobject.  Export these new functions so modular filesystems can use
them.

Cc: stable@vger.kernel.org
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/sysfs/dir.c        | 34 ++++++++++++++++++++++++++++++++++
 include/linux/sysfs.h | 15 +++++++++++++++
 2 files changed, 49 insertions(+)

(limited to 'include/linux')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 0b45ff42f374..94374e435025 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -121,3 +121,37 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
 
 	return kernfs_rename_ns(kn, new_parent, kn->name, new_ns);
 }
+
+/**
+ * sysfs_create_mount_point - create an always empty directory
+ * @parent_kobj:  kobject that will contain this always empty directory
+ * @name: The name of the always empty directory to add
+ */
+int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name)
+{
+	struct kernfs_node *kn, *parent = parent_kobj->sd;
+
+	kn = kernfs_create_empty_dir(parent, name);
+	if (IS_ERR(kn)) {
+		if (PTR_ERR(kn) == -EEXIST)
+			sysfs_warn_dup(parent, name);
+		return PTR_ERR(kn);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(sysfs_create_mount_point);
+
+/**
+ *	sysfs_remove_mount_point - remove an always empty directory.
+ *	@parent_kobj: kobject that will contain this always empty directory
+ *	@name: The name of the always empty directory to remove
+ *
+ */
+void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name)
+{
+	struct kernfs_node *parent = parent_kobj->sd;
+
+	kernfs_remove_by_name_ns(parent, name, NULL);
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_mount_point);
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 99382c0df17e..9f65758311a4 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -210,6 +210,10 @@ int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
 int __must_check sysfs_move_dir_ns(struct kobject *kobj,
 				   struct kobject *new_parent_kobj,
 				   const void *new_ns);
+int __must_check sysfs_create_mount_point(struct kobject *parent_kobj,
+					  const char *name);
+void sysfs_remove_mount_point(struct kobject *parent_kobj,
+			      const char *name);
 
 int __must_check sysfs_create_file_ns(struct kobject *kobj,
 				      const struct attribute *attr,
@@ -298,6 +302,17 @@ static inline int sysfs_move_dir_ns(struct kobject *kobj,
 	return 0;
 }
 
+static inline int sysfs_create_mount_point(struct kobject *parent_kobj,
+					   const char *name)
+{
+	return 0;
+}
+
+static inline void sysfs_remove_mount_point(struct kobject *parent_kobj,
+					    const char *name)
+{
+}
+
 static inline int sysfs_create_file_ns(struct kobject *kobj,
 				       const struct attribute *attr,
 				       const void *ns)
-- 
cgit v1.2.3


From bd7ade3cd9b0850264306f5c2b79024a417b6396 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <kernel@kyup.com>
Date: Thu, 2 Jul 2015 01:32:44 -0400
Subject: bufferhead: Add _gfp version for sb_getblk()

sb_getblk() is used during ext4 (and possibly other FSes) writeback
paths. Sometimes such path require allocating memory and guaranteeing
that such allocation won't block. Currently, however, there is no way
to provide user flags for sb_getblk which could lead to deadlocks.

This patch implements a sb_getblk_gfp with the only difference it can
accept user-provided GFP flags.

Signed-off-by: Nikolay Borisov <kernel@kyup.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 include/linux/buffer_head.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 73b45225a7ca..e6797ded700e 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -317,6 +317,13 @@ sb_getblk(struct super_block *sb, sector_t block)
 	return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE);
 }
 
+
+static inline struct buffer_head *
+sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp)
+{
+	return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, gfp);
+}
+
 static inline struct buffer_head *
 sb_find_get_block(struct super_block *sb, sector_t block)
 {
-- 
cgit v1.2.3


From ec110bc7cc48d7806c9b65094e6afb19452d458f Mon Sep 17 00:00:00 2001
From: Allen Hubbe <Allen.Hubbe@emc.com>
Date: Thu, 7 May 2015 06:45:21 -0400
Subject: NTB: Move files in preparation for NTB abstraction

This patch only moves files to their new locations, before applying the
next two patches adding the NTB Abstraction layer.  Splitting this patch
from the next is intended make distinct which code is changed only due
to moving the files, versus which are substantial code changes in adding
the NTB Abstraction layer.

Signed-off-by: Allen Hubbe <Allen.Hubbe@emc.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 drivers/net/Kconfig                 |    4 +-
 drivers/net/ntb_netdev.c            |    6 +-
 drivers/ntb/Makefile                |    4 +-
 drivers/ntb/hw/intel/ntb_hw_intel.c | 1894 ++++++++++++++++++++++++++++++++++
 drivers/ntb/hw/intel/ntb_hw_intel.h |  385 +++++++
 drivers/ntb/ntb_hw.c                | 1895 -----------------------------------
 drivers/ntb/ntb_hw.h                |  256 -----
 drivers/ntb/ntb_regs.h              |  177 ----
 drivers/ntb/ntb_transport.c         |   14 +-
 include/linux/ntb.h                 |   88 --
 include/linux/ntb_transport.h       |   88 ++
 11 files changed, 2381 insertions(+), 2430 deletions(-)
 create mode 100644 drivers/ntb/hw/intel/ntb_hw_intel.c
 create mode 100644 drivers/ntb/hw/intel/ntb_hw_intel.h
 delete mode 100644 drivers/ntb/ntb_hw.c
 delete mode 100644 drivers/ntb/ntb_hw.h
 delete mode 100644 drivers/ntb/ntb_regs.h
 delete mode 100644 include/linux/ntb.h
 create mode 100644 include/linux/ntb_transport.h

(limited to 'include/linux')

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index df51d6025a90..bda3cde62bb3 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -203,8 +203,8 @@ config NET_POLL_CONTROLLER
 	def_bool NETPOLL
 
 config NTB_NETDEV
-	tristate "Virtual Ethernet over NTB"
-	depends on NTB
+	tristate "Virtual Ethernet over NTB Transport"
+	depends on NTB_TRANSPORT
 
 config RIONET
 	tristate "RapidIO Ethernet over messaging driver support"
diff --git a/drivers/net/ntb_netdev.c b/drivers/net/ntb_netdev.c
index 5a7e6397440a..6d3bfa62f5ec 100644
--- a/drivers/net/ntb_netdev.c
+++ b/drivers/net/ntb_netdev.c
@@ -49,7 +49,7 @@
 #include <linux/ethtool.h>
 #include <linux/module.h>
 #include <linux/pci.h>
-#include <linux/ntb.h>
+#include <linux/ntb_transport.h>
 
 #define NTB_NETDEV_VER	"0.7"
 
@@ -410,13 +410,13 @@ static int __init ntb_netdev_init_module(void)
 	rc = ntb_register_client_dev(KBUILD_MODNAME);
 	if (rc)
 		return rc;
-	return ntb_register_client(&ntb_netdev_client);
+	return ntb_transport_register_client(&ntb_netdev_client);
 }
 module_init(ntb_netdev_init_module);
 
 static void __exit ntb_netdev_exit_module(void)
 {
-	ntb_unregister_client(&ntb_netdev_client);
+	ntb_transport_unregister_client(&ntb_netdev_client);
 	ntb_unregister_client_dev(KBUILD_MODNAME);
 }
 module_exit(ntb_netdev_exit_module);
diff --git a/drivers/ntb/Makefile b/drivers/ntb/Makefile
index 15cb59fd354e..545b10a131af 100644
--- a/drivers/ntb/Makefile
+++ b/drivers/ntb/Makefile
@@ -1,3 +1,3 @@
-obj-$(CONFIG_NTB) += ntb.o
+obj-$(CONFIG_NTB) += ntb_hw_intel.o
 
-ntb-objs := ntb_hw.o ntb_transport.o
+ntb_hw_intel-objs := hw/intel/ntb_hw_intel.o ntb_transport.o
diff --git a/drivers/ntb/hw/intel/ntb_hw_intel.c b/drivers/ntb/hw/intel/ntb_hw_intel.c
new file mode 100644
index 000000000000..044534a995f1
--- /dev/null
+++ b/drivers/ntb/hw/intel/ntb_hw_intel.c
@@ -0,0 +1,1894 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copy
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Intel PCIe NTB Linux driver
+ *
+ * Contact Information:
+ * Jon Mason <jon.mason@intel.com>
+ */
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include "ntb_hw_intel.h"
+
+#define NTB_NAME	"Intel(R) PCI-E Non-Transparent Bridge Driver"
+#define NTB_VER		"1.0"
+
+MODULE_DESCRIPTION(NTB_NAME);
+MODULE_VERSION(NTB_VER);
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Intel Corporation");
+
+enum {
+	NTB_CONN_TRANSPARENT = 0,
+	NTB_CONN_B2B,
+	NTB_CONN_RP,
+};
+
+enum {
+	NTB_DEV_USD = 0,
+	NTB_DEV_DSD,
+};
+
+enum {
+	SNB_HW = 0,
+	BWD_HW,
+};
+
+static struct dentry *debugfs_dir;
+
+#define BWD_LINK_RECOVERY_TIME	500
+
+/* Translate memory window 0,1,2 to BAR 2,4,5 */
+#define MW_TO_BAR(mw)	(mw == 0 ? 2 : (mw == 1 ? 4 : 5))
+
+static const struct pci_device_id ntb_pci_tbl[] = {
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_BWD)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_JSF)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_SNB)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_IVT)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_HSX)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_JSF)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_SNB)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_IVT)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_HSX)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_JSF)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_SNB)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_IVT)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_HSX)},
+	{0}
+};
+MODULE_DEVICE_TABLE(pci, ntb_pci_tbl);
+
+static int is_ntb_xeon(struct ntb_device *ndev)
+{
+	switch (ndev->pdev->device) {
+	case PCI_DEVICE_ID_INTEL_NTB_SS_JSF:
+	case PCI_DEVICE_ID_INTEL_NTB_SS_SNB:
+	case PCI_DEVICE_ID_INTEL_NTB_SS_IVT:
+	case PCI_DEVICE_ID_INTEL_NTB_SS_HSX:
+	case PCI_DEVICE_ID_INTEL_NTB_PS_JSF:
+	case PCI_DEVICE_ID_INTEL_NTB_PS_SNB:
+	case PCI_DEVICE_ID_INTEL_NTB_PS_IVT:
+	case PCI_DEVICE_ID_INTEL_NTB_PS_HSX:
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_JSF:
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_SNB:
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_IVT:
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX:
+		return 1;
+	default:
+		return 0;
+	}
+
+	return 0;
+}
+
+static int is_ntb_atom(struct ntb_device *ndev)
+{
+	switch (ndev->pdev->device) {
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_BWD:
+		return 1;
+	default:
+		return 0;
+	}
+
+	return 0;
+}
+
+static void ntb_set_errata_flags(struct ntb_device *ndev)
+{
+	switch (ndev->pdev->device) {
+	/*
+	 * this workaround applies to all platform up to IvyBridge
+	 * Haswell has splitbar support and use a different workaround
+	 */
+	case PCI_DEVICE_ID_INTEL_NTB_SS_JSF:
+	case PCI_DEVICE_ID_INTEL_NTB_SS_SNB:
+	case PCI_DEVICE_ID_INTEL_NTB_SS_IVT:
+	case PCI_DEVICE_ID_INTEL_NTB_SS_HSX:
+	case PCI_DEVICE_ID_INTEL_NTB_PS_JSF:
+	case PCI_DEVICE_ID_INTEL_NTB_PS_SNB:
+	case PCI_DEVICE_ID_INTEL_NTB_PS_IVT:
+	case PCI_DEVICE_ID_INTEL_NTB_PS_HSX:
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_JSF:
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_SNB:
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_IVT:
+	case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX:
+		ndev->wa_flags |= WA_SNB_ERR;
+		break;
+	}
+}
+
+/**
+ * ntb_register_event_callback() - register event callback
+ * @ndev: pointer to ntb_device instance
+ * @func: callback function to register
+ *
+ * This function registers a callback for any HW driver events such as link
+ * up/down, power management notices and etc.
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int ntb_register_event_callback(struct ntb_device *ndev,
+				void (*func)(void *handle,
+					     enum ntb_hw_event event))
+{
+	if (ndev->event_cb)
+		return -EINVAL;
+
+	ndev->event_cb = func;
+
+	return 0;
+}
+
+/**
+ * ntb_unregister_event_callback() - unregisters the event callback
+ * @ndev: pointer to ntb_device instance
+ *
+ * This function unregisters the existing callback from transport
+ */
+void ntb_unregister_event_callback(struct ntb_device *ndev)
+{
+	ndev->event_cb = NULL;
+}
+
+static void ntb_irq_work(unsigned long data)
+{
+	struct ntb_db_cb *db_cb = (struct ntb_db_cb *)data;
+	int rc;
+
+	rc = db_cb->callback(db_cb->data, db_cb->db_num);
+	if (rc)
+		tasklet_schedule(&db_cb->irq_work);
+	else {
+		struct ntb_device *ndev = db_cb->ndev;
+		unsigned long mask;
+
+		mask = readw(ndev->reg_ofs.ldb_mask);
+		clear_bit(db_cb->db_num * ndev->bits_per_vector, &mask);
+		writew(mask, ndev->reg_ofs.ldb_mask);
+	}
+}
+
+/**
+ * ntb_register_db_callback() - register a callback for doorbell interrupt
+ * @ndev: pointer to ntb_device instance
+ * @idx: doorbell index to register callback, zero based
+ * @data: pointer to be returned to caller with every callback
+ * @func: callback function to register
+ *
+ * This function registers a callback function for the doorbell interrupt
+ * on the primary side. The function will unmask the doorbell as well to
+ * allow interrupt.
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int ntb_register_db_callback(struct ntb_device *ndev, unsigned int idx,
+			     void *data, int (*func)(void *data, int db_num))
+{
+	unsigned long mask;
+
+	if (idx >= ndev->max_cbs || ndev->db_cb[idx].callback) {
+		dev_warn(&ndev->pdev->dev, "Invalid Index.\n");
+		return -EINVAL;
+	}
+
+	ndev->db_cb[idx].callback = func;
+	ndev->db_cb[idx].data = data;
+	ndev->db_cb[idx].ndev = ndev;
+
+	tasklet_init(&ndev->db_cb[idx].irq_work, ntb_irq_work,
+		     (unsigned long) &ndev->db_cb[idx]);
+
+	/* unmask interrupt */
+	mask = readw(ndev->reg_ofs.ldb_mask);
+	clear_bit(idx * ndev->bits_per_vector, &mask);
+	writew(mask, ndev->reg_ofs.ldb_mask);
+
+	return 0;
+}
+
+/**
+ * ntb_unregister_db_callback() - unregister a callback for doorbell interrupt
+ * @ndev: pointer to ntb_device instance
+ * @idx: doorbell index to register callback, zero based
+ *
+ * This function unregisters a callback function for the doorbell interrupt
+ * on the primary side. The function will also mask the said doorbell.
+ */
+void ntb_unregister_db_callback(struct ntb_device *ndev, unsigned int idx)
+{
+	unsigned long mask;
+
+	if (idx >= ndev->max_cbs || !ndev->db_cb[idx].callback)
+		return;
+
+	mask = readw(ndev->reg_ofs.ldb_mask);
+	set_bit(idx * ndev->bits_per_vector, &mask);
+	writew(mask, ndev->reg_ofs.ldb_mask);
+
+	tasklet_disable(&ndev->db_cb[idx].irq_work);
+
+	ndev->db_cb[idx].callback = NULL;
+}
+
+/**
+ * ntb_find_transport() - find the transport pointer
+ * @transport: pointer to pci device
+ *
+ * Given the pci device pointer, return the transport pointer passed in when
+ * the transport attached when it was inited.
+ *
+ * RETURNS: pointer to transport.
+ */
+void *ntb_find_transport(struct pci_dev *pdev)
+{
+	struct ntb_device *ndev = pci_get_drvdata(pdev);
+	return ndev->ntb_transport;
+}
+
+/**
+ * ntb_register_transport() - Register NTB transport with NTB HW driver
+ * @transport: transport identifier
+ *
+ * This function allows a transport to reserve the hardware driver for
+ * NTB usage.
+ *
+ * RETURNS: pointer to ntb_device, NULL on error.
+ */
+struct ntb_device *ntb_register_transport(struct pci_dev *pdev, void *transport)
+{
+	struct ntb_device *ndev = pci_get_drvdata(pdev);
+
+	if (ndev->ntb_transport)
+		return NULL;
+
+	ndev->ntb_transport = transport;
+	return ndev;
+}
+
+/**
+ * ntb_unregister_transport() - Unregister the transport with the NTB HW driver
+ * @ndev - ntb_device of the transport to be freed
+ *
+ * This function unregisters the transport from the HW driver and performs any
+ * necessary cleanups.
+ */
+void ntb_unregister_transport(struct ntb_device *ndev)
+{
+	int i;
+
+	if (!ndev->ntb_transport)
+		return;
+
+	for (i = 0; i < ndev->max_cbs; i++)
+		ntb_unregister_db_callback(ndev, i);
+
+	ntb_unregister_event_callback(ndev);
+	ndev->ntb_transport = NULL;
+}
+
+/**
+ * ntb_write_local_spad() - write to the secondary scratchpad register
+ * @ndev: pointer to ntb_device instance
+ * @idx: index to the scratchpad register, 0 based
+ * @val: the data value to put into the register
+ *
+ * This function allows writing of a 32bit value to the indexed scratchpad
+ * register. This writes over the data mirrored to the local scratchpad register
+ * by the remote system.
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int ntb_write_local_spad(struct ntb_device *ndev, unsigned int idx, u32 val)
+{
+	if (idx >= ndev->limits.max_spads)
+		return -EINVAL;
+
+	dev_dbg(&ndev->pdev->dev, "Writing %x to local scratch pad index %d\n",
+		val, idx);
+	writel(val, ndev->reg_ofs.spad_read + idx * 4);
+
+	return 0;
+}
+
+/**
+ * ntb_read_local_spad() - read from the primary scratchpad register
+ * @ndev: pointer to ntb_device instance
+ * @idx: index to scratchpad register, 0 based
+ * @val: pointer to 32bit integer for storing the register value
+ *
+ * This function allows reading of the 32bit scratchpad register on
+ * the primary (internal) side.  This allows the local system to read data
+ * written and mirrored to the scratchpad register by the remote system.
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int ntb_read_local_spad(struct ntb_device *ndev, unsigned int idx, u32 *val)
+{
+	if (idx >= ndev->limits.max_spads)
+		return -EINVAL;
+
+	*val = readl(ndev->reg_ofs.spad_write + idx * 4);
+	dev_dbg(&ndev->pdev->dev,
+		"Reading %x from local scratch pad index %d\n", *val, idx);
+
+	return 0;
+}
+
+/**
+ * ntb_write_remote_spad() - write to the secondary scratchpad register
+ * @ndev: pointer to ntb_device instance
+ * @idx: index to the scratchpad register, 0 based
+ * @val: the data value to put into the register
+ *
+ * This function allows writing of a 32bit value to the indexed scratchpad
+ * register. The register resides on the secondary (external) side.  This allows
+ * the local system to write data to be mirrored to the remote systems
+ * scratchpad register.
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int ntb_write_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 val)
+{
+	if (idx >= ndev->limits.max_spads)
+		return -EINVAL;
+
+	dev_dbg(&ndev->pdev->dev, "Writing %x to remote scratch pad index %d\n",
+		val, idx);
+	writel(val, ndev->reg_ofs.spad_write + idx * 4);
+
+	return 0;
+}
+
+/**
+ * ntb_read_remote_spad() - read from the primary scratchpad register
+ * @ndev: pointer to ntb_device instance
+ * @idx: index to scratchpad register, 0 based
+ * @val: pointer to 32bit integer for storing the register value
+ *
+ * This function allows reading of the 32bit scratchpad register on
+ * the primary (internal) side.  This alloows the local system to read the data
+ * it wrote to be mirrored on the remote system.
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+int ntb_read_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 *val)
+{
+	if (idx >= ndev->limits.max_spads)
+		return -EINVAL;
+
+	*val = readl(ndev->reg_ofs.spad_read + idx * 4);
+	dev_dbg(&ndev->pdev->dev,
+		"Reading %x from remote scratch pad index %d\n", *val, idx);
+
+	return 0;
+}
+
+/**
+ * ntb_get_mw_base() - get addr for the NTB memory window
+ * @ndev: pointer to ntb_device instance
+ * @mw: memory window number
+ *
+ * This function provides the base address of the memory window specified.
+ *
+ * RETURNS: address, or NULL on error.
+ */
+resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned int mw)
+{
+	if (mw >= ntb_max_mw(ndev))
+		return 0;
+
+	return pci_resource_start(ndev->pdev, MW_TO_BAR(mw));
+}
+
+/**
+ * ntb_get_mw_vbase() - get virtual addr for the NTB memory window
+ * @ndev: pointer to ntb_device instance
+ * @mw: memory window number
+ *
+ * This function provides the base virtual address of the memory window
+ * specified.
+ *
+ * RETURNS: pointer to virtual address, or NULL on error.
+ */
+void __iomem *ntb_get_mw_vbase(struct ntb_device *ndev, unsigned int mw)
+{
+	if (mw >= ntb_max_mw(ndev))
+		return NULL;
+
+	return ndev->mw[mw].vbase;
+}
+
+/**
+ * ntb_get_mw_size() - return size of NTB memory window
+ * @ndev: pointer to ntb_device instance
+ * @mw: memory window number
+ *
+ * This function provides the physical size of the memory window specified
+ *
+ * RETURNS: the size of the memory window or zero on error
+ */
+u64 ntb_get_mw_size(struct ntb_device *ndev, unsigned int mw)
+{
+	if (mw >= ntb_max_mw(ndev))
+		return 0;
+
+	return ndev->mw[mw].bar_sz;
+}
+
+/**
+ * ntb_set_mw_addr - set the memory window address
+ * @ndev: pointer to ntb_device instance
+ * @mw: memory window number
+ * @addr: base address for data
+ *
+ * This function sets the base physical address of the memory window.  This
+ * memory address is where data from the remote system will be transfered into
+ * or out of depending on how the transport is configured.
+ */
+void ntb_set_mw_addr(struct ntb_device *ndev, unsigned int mw, u64 addr)
+{
+	if (mw >= ntb_max_mw(ndev))
+		return;
+
+	dev_dbg(&ndev->pdev->dev, "Writing addr %Lx to BAR %d\n", addr,
+		MW_TO_BAR(mw));
+
+	ndev->mw[mw].phys_addr = addr;
+
+	switch (MW_TO_BAR(mw)) {
+	case NTB_BAR_23:
+		writeq(addr, ndev->reg_ofs.bar2_xlat);
+		break;
+	case NTB_BAR_4:
+		if (ndev->split_bar)
+			writel(addr, ndev->reg_ofs.bar4_xlat);
+		else
+			writeq(addr, ndev->reg_ofs.bar4_xlat);
+		break;
+	case NTB_BAR_5:
+		writel(addr, ndev->reg_ofs.bar5_xlat);
+		break;
+	}
+}
+
+/**
+ * ntb_ring_doorbell() - Set the doorbell on the secondary/external side
+ * @ndev: pointer to ntb_device instance
+ * @db: doorbell to ring
+ *
+ * This function allows triggering of a doorbell on the secondary/external
+ * side that will initiate an interrupt on the remote host
+ *
+ * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
+ */
+void ntb_ring_doorbell(struct ntb_device *ndev, unsigned int db)
+{
+	dev_dbg(&ndev->pdev->dev, "%s: ringing doorbell %d\n", __func__, db);
+
+	if (ndev->hw_type == BWD_HW)
+		writeq((u64) 1 << db, ndev->reg_ofs.rdb);
+	else
+		writew(((1 << ndev->bits_per_vector) - 1) <<
+		       (db * ndev->bits_per_vector), ndev->reg_ofs.rdb);
+}
+
+static void bwd_recover_link(struct ntb_device *ndev)
+{
+	u32 status;
+
+	/* Driver resets the NTB ModPhy lanes - magic! */
+	writeb(0xe0, ndev->reg_base + BWD_MODPHY_PCSREG6);
+	writeb(0x40, ndev->reg_base + BWD_MODPHY_PCSREG4);
+	writeb(0x60, ndev->reg_base + BWD_MODPHY_PCSREG4);
+	writeb(0x60, ndev->reg_base + BWD_MODPHY_PCSREG6);
+
+	/* Driver waits 100ms to allow the NTB ModPhy to settle */
+	msleep(100);
+
+	/* Clear AER Errors, write to clear */
+	status = readl(ndev->reg_base + BWD_ERRCORSTS_OFFSET);
+	dev_dbg(&ndev->pdev->dev, "ERRCORSTS = %x\n", status);
+	status &= PCI_ERR_COR_REP_ROLL;
+	writel(status, ndev->reg_base + BWD_ERRCORSTS_OFFSET);
+
+	/* Clear unexpected electrical idle event in LTSSM, write to clear */
+	status = readl(ndev->reg_base + BWD_LTSSMERRSTS0_OFFSET);
+	dev_dbg(&ndev->pdev->dev, "LTSSMERRSTS0 = %x\n", status);
+	status |= BWD_LTSSMERRSTS0_UNEXPECTEDEI;
+	writel(status, ndev->reg_base + BWD_LTSSMERRSTS0_OFFSET);
+
+	/* Clear DeSkew Buffer error, write to clear */
+	status = readl(ndev->reg_base + BWD_DESKEWSTS_OFFSET);
+	dev_dbg(&ndev->pdev->dev, "DESKEWSTS = %x\n", status);
+	status |= BWD_DESKEWSTS_DBERR;
+	writel(status, ndev->reg_base + BWD_DESKEWSTS_OFFSET);
+
+	status = readl(ndev->reg_base + BWD_IBSTERRRCRVSTS0_OFFSET);
+	dev_dbg(&ndev->pdev->dev, "IBSTERRRCRVSTS0 = %x\n", status);
+	status &= BWD_IBIST_ERR_OFLOW;
+	writel(status, ndev->reg_base + BWD_IBSTERRRCRVSTS0_OFFSET);
+
+	/* Releases the NTB state machine to allow the link to retrain */
+	status = readl(ndev->reg_base + BWD_LTSSMSTATEJMP_OFFSET);
+	dev_dbg(&ndev->pdev->dev, "LTSSMSTATEJMP = %x\n", status);
+	status &= ~BWD_LTSSMSTATEJMP_FORCEDETECT;
+	writel(status, ndev->reg_base + BWD_LTSSMSTATEJMP_OFFSET);
+}
+
+static void ntb_link_event(struct ntb_device *ndev, int link_state)
+{
+	unsigned int event;
+
+	if (ndev->link_status == link_state)
+		return;
+
+	if (link_state == NTB_LINK_UP) {
+		u16 status;
+
+		dev_info(&ndev->pdev->dev, "Link Up\n");
+		ndev->link_status = NTB_LINK_UP;
+		event = NTB_EVENT_HW_LINK_UP;
+
+		if (is_ntb_atom(ndev) ||
+		    ndev->conn_type == NTB_CONN_TRANSPARENT)
+			status = readw(ndev->reg_ofs.lnk_stat);
+		else {
+			int rc = pci_read_config_word(ndev->pdev,
+						      SNB_LINK_STATUS_OFFSET,
+						      &status);
+			if (rc)
+				return;
+		}
+
+		ndev->link_width = (status & NTB_LINK_WIDTH_MASK) >> 4;
+		ndev->link_speed = (status & NTB_LINK_SPEED_MASK);
+		dev_info(&ndev->pdev->dev, "Link Width %d, Link Speed %d\n",
+			 ndev->link_width, ndev->link_speed);
+	} else {
+		dev_info(&ndev->pdev->dev, "Link Down\n");
+		ndev->link_status = NTB_LINK_DOWN;
+		event = NTB_EVENT_HW_LINK_DOWN;
+		/* Don't modify link width/speed, we need it in link recovery */
+	}
+
+	/* notify the upper layer if we have an event change */
+	if (ndev->event_cb)
+		ndev->event_cb(ndev->ntb_transport, event);
+}
+
+static int ntb_link_status(struct ntb_device *ndev)
+{
+	int link_state;
+
+	if (is_ntb_atom(ndev)) {
+		u32 ntb_cntl;
+
+		ntb_cntl = readl(ndev->reg_ofs.lnk_cntl);
+		if (ntb_cntl & BWD_CNTL_LINK_DOWN)
+			link_state = NTB_LINK_DOWN;
+		else
+			link_state = NTB_LINK_UP;
+	} else {
+		u16 status;
+		int rc;
+
+		rc = pci_read_config_word(ndev->pdev, SNB_LINK_STATUS_OFFSET,
+					  &status);
+		if (rc)
+			return rc;
+
+		if (status & NTB_LINK_STATUS_ACTIVE)
+			link_state = NTB_LINK_UP;
+		else
+			link_state = NTB_LINK_DOWN;
+	}
+
+	ntb_link_event(ndev, link_state);
+
+	return 0;
+}
+
+static void bwd_link_recovery(struct work_struct *work)
+{
+	struct ntb_device *ndev = container_of(work, struct ntb_device,
+					       lr_timer.work);
+	u32 status32;
+
+	bwd_recover_link(ndev);
+	/* There is a potential race between the 2 NTB devices recovering at the
+	 * same time.  If the times are the same, the link will not recover and
+	 * the driver will be stuck in this loop forever.  Add a random interval
+	 * to the recovery time to prevent this race.
+	 */
+	msleep(BWD_LINK_RECOVERY_TIME + prandom_u32() % BWD_LINK_RECOVERY_TIME);
+
+	status32 = readl(ndev->reg_base + BWD_LTSSMSTATEJMP_OFFSET);
+	if (status32 & BWD_LTSSMSTATEJMP_FORCEDETECT)
+		goto retry;
+
+	status32 = readl(ndev->reg_base + BWD_IBSTERRRCRVSTS0_OFFSET);
+	if (status32 & BWD_IBIST_ERR_OFLOW)
+		goto retry;
+
+	status32 = readl(ndev->reg_ofs.lnk_cntl);
+	if (!(status32 & BWD_CNTL_LINK_DOWN)) {
+		unsigned char speed, width;
+		u16 status16;
+
+		status16 = readw(ndev->reg_ofs.lnk_stat);
+		width = (status16 & NTB_LINK_WIDTH_MASK) >> 4;
+		speed = (status16 & NTB_LINK_SPEED_MASK);
+		if (ndev->link_width != width || ndev->link_speed != speed)
+			goto retry;
+	}
+
+	schedule_delayed_work(&ndev->hb_timer, NTB_HB_TIMEOUT);
+	return;
+
+retry:
+	schedule_delayed_work(&ndev->lr_timer, NTB_HB_TIMEOUT);
+}
+
+/* BWD doesn't have link status interrupt, poll on that platform */
+static void bwd_link_poll(struct work_struct *work)
+{
+	struct ntb_device *ndev = container_of(work, struct ntb_device,
+					       hb_timer.work);
+	unsigned long ts = jiffies;
+
+	/* If we haven't gotten an interrupt in a while, check the BWD link
+	 * status bit
+	 */
+	if (ts > ndev->last_ts + NTB_HB_TIMEOUT) {
+		int rc = ntb_link_status(ndev);
+		if (rc)
+			dev_err(&ndev->pdev->dev,
+				"Error determining link status\n");
+
+		/* Check to see if a link error is the cause of the link down */
+		if (ndev->link_status == NTB_LINK_DOWN) {
+			u32 status32 = readl(ndev->reg_base +
+					     BWD_LTSSMSTATEJMP_OFFSET);
+			if (status32 & BWD_LTSSMSTATEJMP_FORCEDETECT) {
+				schedule_delayed_work(&ndev->lr_timer, 0);
+				return;
+			}
+		}
+	}
+
+	schedule_delayed_work(&ndev->hb_timer, NTB_HB_TIMEOUT);
+}
+
+static int ntb_xeon_setup(struct ntb_device *ndev)
+{
+	switch (ndev->conn_type) {
+	case NTB_CONN_B2B:
+		ndev->reg_ofs.ldb = ndev->reg_base + SNB_PDOORBELL_OFFSET;
+		ndev->reg_ofs.ldb_mask = ndev->reg_base + SNB_PDBMSK_OFFSET;
+		ndev->reg_ofs.spad_read = ndev->reg_base + SNB_SPAD_OFFSET;
+		ndev->reg_ofs.bar2_xlat = ndev->reg_base + SNB_SBAR2XLAT_OFFSET;
+		ndev->reg_ofs.bar4_xlat = ndev->reg_base + SNB_SBAR4XLAT_OFFSET;
+		if (ndev->split_bar)
+			ndev->reg_ofs.bar5_xlat =
+				ndev->reg_base + SNB_SBAR5XLAT_OFFSET;
+		ndev->limits.max_spads = SNB_MAX_B2B_SPADS;
+
+		/* There is a Xeon hardware errata related to writes to
+		 * SDOORBELL or B2BDOORBELL in conjunction with inbound access
+		 * to NTB MMIO Space, which may hang the system.  To workaround
+		 * this use the second memory window to access the interrupt and
+		 * scratch pad registers on the remote system.
+		 */
+		if (ndev->wa_flags & WA_SNB_ERR) {
+			if (!ndev->mw[ndev->limits.max_mw - 1].bar_sz)
+				return -EINVAL;
+
+			ndev->limits.max_db_bits = SNB_MAX_DB_BITS;
+			ndev->reg_ofs.spad_write =
+				ndev->mw[ndev->limits.max_mw - 1].vbase +
+				SNB_SPAD_OFFSET;
+			ndev->reg_ofs.rdb =
+				ndev->mw[ndev->limits.max_mw - 1].vbase +
+				SNB_PDOORBELL_OFFSET;
+
+			/* Set the Limit register to 4k, the minimum size, to
+			 * prevent an illegal access
+			 */
+			writeq(ndev->mw[1].bar_sz + 0x1000, ndev->reg_base +
+			       SNB_PBAR4LMT_OFFSET);
+			/* HW errata on the Limit registers.  They can only be
+			 * written when the base register is 4GB aligned and
+			 * < 32bit.  This should already be the case based on
+			 * the driver defaults, but write the Limit registers
+			 * first just in case.
+			 */
+
+			ndev->limits.max_mw = SNB_ERRATA_MAX_MW;
+		} else {
+			/* HW Errata on bit 14 of b2bdoorbell register.  Writes
+			 * will not be mirrored to the remote system.  Shrink
+			 * the number of bits by one, since bit 14 is the last
+			 * bit.
+			 */
+			ndev->limits.max_db_bits = SNB_MAX_DB_BITS - 1;
+			ndev->reg_ofs.spad_write = ndev->reg_base +
+						   SNB_B2B_SPAD_OFFSET;
+			ndev->reg_ofs.rdb = ndev->reg_base +
+					    SNB_B2B_DOORBELL_OFFSET;
+
+			/* Disable the Limit register, just incase it is set to
+			 * something silly. A 64bit write should handle it
+			 * regardless of whether it has a split BAR or not.
+			 */
+			writeq(0, ndev->reg_base + SNB_PBAR4LMT_OFFSET);
+			/* HW errata on the Limit registers.  They can only be
+			 * written when the base register is 4GB aligned and
+			 * < 32bit.  This should already be the case based on
+			 * the driver defaults, but write the Limit registers
+			 * first just in case.
+			 */
+			if (ndev->split_bar)
+				ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
+			else
+				ndev->limits.max_mw = SNB_MAX_MW;
+		}
+
+		/* The Xeon errata workaround requires setting SBAR Base
+		 * addresses to known values, so that the PBAR XLAT can be
+		 * pointed at SBAR0 of the remote system.
+		 */
+		if (ndev->dev_type == NTB_DEV_USD) {
+			writeq(SNB_MBAR23_DSD_ADDR, ndev->reg_base +
+			       SNB_PBAR2XLAT_OFFSET);
+			if (ndev->wa_flags & WA_SNB_ERR)
+				writeq(SNB_MBAR01_DSD_ADDR, ndev->reg_base +
+				       SNB_PBAR4XLAT_OFFSET);
+			else {
+				if (ndev->split_bar) {
+					writel(SNB_MBAR4_DSD_ADDR,
+					       ndev->reg_base +
+					       SNB_PBAR4XLAT_OFFSET);
+					writel(SNB_MBAR5_DSD_ADDR,
+					       ndev->reg_base +
+					       SNB_PBAR5XLAT_OFFSET);
+				} else
+					writeq(SNB_MBAR4_DSD_ADDR,
+					       ndev->reg_base +
+					       SNB_PBAR4XLAT_OFFSET);
+
+				/* B2B_XLAT_OFFSET is a 64bit register, but can
+				 * only take 32bit writes
+				 */
+				writel(SNB_MBAR01_DSD_ADDR & 0xffffffff,
+				       ndev->reg_base + SNB_B2B_XLAT_OFFSETL);
+				writel(SNB_MBAR01_DSD_ADDR >> 32,
+				       ndev->reg_base + SNB_B2B_XLAT_OFFSETU);
+			}
+
+			writeq(SNB_MBAR01_USD_ADDR, ndev->reg_base +
+			       SNB_SBAR0BASE_OFFSET);
+			writeq(SNB_MBAR23_USD_ADDR, ndev->reg_base +
+			       SNB_SBAR2BASE_OFFSET);
+			if (ndev->split_bar) {
+				writel(SNB_MBAR4_USD_ADDR, ndev->reg_base +
+				       SNB_SBAR4BASE_OFFSET);
+				writel(SNB_MBAR5_USD_ADDR, ndev->reg_base +
+				       SNB_SBAR5BASE_OFFSET);
+			} else
+				writeq(SNB_MBAR4_USD_ADDR, ndev->reg_base +
+				       SNB_SBAR4BASE_OFFSET);
+		} else {
+			writeq(SNB_MBAR23_USD_ADDR, ndev->reg_base +
+			       SNB_PBAR2XLAT_OFFSET);
+			if (ndev->wa_flags & WA_SNB_ERR)
+				writeq(SNB_MBAR01_USD_ADDR, ndev->reg_base +
+				       SNB_PBAR4XLAT_OFFSET);
+			else {
+				if (ndev->split_bar) {
+					writel(SNB_MBAR4_USD_ADDR,
+					       ndev->reg_base +
+					       SNB_PBAR4XLAT_OFFSET);
+					writel(SNB_MBAR5_USD_ADDR,
+					       ndev->reg_base +
+					       SNB_PBAR5XLAT_OFFSET);
+				} else
+					writeq(SNB_MBAR4_USD_ADDR,
+					       ndev->reg_base +
+					       SNB_PBAR4XLAT_OFFSET);
+
+				/*
+				 * B2B_XLAT_OFFSET is a 64bit register, but can
+				 * only take 32bit writes
+				 */
+				writel(SNB_MBAR01_USD_ADDR & 0xffffffff,
+				       ndev->reg_base + SNB_B2B_XLAT_OFFSETL);
+				writel(SNB_MBAR01_USD_ADDR >> 32,
+				       ndev->reg_base + SNB_B2B_XLAT_OFFSETU);
+			}
+			writeq(SNB_MBAR01_DSD_ADDR, ndev->reg_base +
+			       SNB_SBAR0BASE_OFFSET);
+			writeq(SNB_MBAR23_DSD_ADDR, ndev->reg_base +
+			       SNB_SBAR2BASE_OFFSET);
+			if (ndev->split_bar) {
+				writel(SNB_MBAR4_DSD_ADDR, ndev->reg_base +
+				       SNB_SBAR4BASE_OFFSET);
+				writel(SNB_MBAR5_DSD_ADDR, ndev->reg_base +
+				       SNB_SBAR5BASE_OFFSET);
+			} else
+				writeq(SNB_MBAR4_DSD_ADDR, ndev->reg_base +
+				       SNB_SBAR4BASE_OFFSET);
+
+		}
+		break;
+	case NTB_CONN_RP:
+		if (ndev->wa_flags & WA_SNB_ERR) {
+			dev_err(&ndev->pdev->dev,
+				"NTB-RP disabled due to hardware errata.\n");
+			return -EINVAL;
+		}
+
+		/* Scratch pads need to have exclusive access from the primary
+		 * or secondary side.  Halve the num spads so that each side can
+		 * have an equal amount.
+		 */
+		ndev->limits.max_spads = SNB_MAX_COMPAT_SPADS / 2;
+		ndev->limits.max_db_bits = SNB_MAX_DB_BITS;
+		/* Note: The SDOORBELL is the cause of the errata.  You REALLY
+		 * don't want to touch it.
+		 */
+		ndev->reg_ofs.rdb = ndev->reg_base + SNB_SDOORBELL_OFFSET;
+		ndev->reg_ofs.ldb = ndev->reg_base + SNB_PDOORBELL_OFFSET;
+		ndev->reg_ofs.ldb_mask = ndev->reg_base + SNB_PDBMSK_OFFSET;
+		/* Offset the start of the spads to correspond to whether it is
+		 * primary or secondary
+		 */
+		ndev->reg_ofs.spad_write = ndev->reg_base + SNB_SPAD_OFFSET +
+					   ndev->limits.max_spads * 4;
+		ndev->reg_ofs.spad_read = ndev->reg_base + SNB_SPAD_OFFSET;
+		ndev->reg_ofs.bar2_xlat = ndev->reg_base + SNB_SBAR2XLAT_OFFSET;
+		ndev->reg_ofs.bar4_xlat = ndev->reg_base + SNB_SBAR4XLAT_OFFSET;
+		if (ndev->split_bar) {
+			ndev->reg_ofs.bar5_xlat =
+				ndev->reg_base + SNB_SBAR5XLAT_OFFSET;
+			ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
+		} else
+			ndev->limits.max_mw = SNB_MAX_MW;
+		break;
+	case NTB_CONN_TRANSPARENT:
+		if (ndev->wa_flags & WA_SNB_ERR) {
+			dev_err(&ndev->pdev->dev,
+				"NTB-TRANSPARENT disabled due to hardware errata.\n");
+			return -EINVAL;
+		}
+
+		/* Scratch pads need to have exclusive access from the primary
+		 * or secondary side.  Halve the num spads so that each side can
+		 * have an equal amount.
+		 */
+		ndev->limits.max_spads = SNB_MAX_COMPAT_SPADS / 2;
+		ndev->limits.max_db_bits = SNB_MAX_DB_BITS;
+		ndev->reg_ofs.rdb = ndev->reg_base + SNB_PDOORBELL_OFFSET;
+		ndev->reg_ofs.ldb = ndev->reg_base + SNB_SDOORBELL_OFFSET;
+		ndev->reg_ofs.ldb_mask = ndev->reg_base + SNB_SDBMSK_OFFSET;
+		ndev->reg_ofs.spad_write = ndev->reg_base + SNB_SPAD_OFFSET;
+		/* Offset the start of the spads to correspond to whether it is
+		 * primary or secondary
+		 */
+		ndev->reg_ofs.spad_read = ndev->reg_base + SNB_SPAD_OFFSET +
+					  ndev->limits.max_spads * 4;
+		ndev->reg_ofs.bar2_xlat = ndev->reg_base + SNB_PBAR2XLAT_OFFSET;
+		ndev->reg_ofs.bar4_xlat = ndev->reg_base + SNB_PBAR4XLAT_OFFSET;
+
+		if (ndev->split_bar) {
+			ndev->reg_ofs.bar5_xlat =
+				ndev->reg_base + SNB_PBAR5XLAT_OFFSET;
+			ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
+		} else
+			ndev->limits.max_mw = SNB_MAX_MW;
+		break;
+	default:
+		/*
+		 * we should never hit this. the detect function should've
+		 * take cared of everything.
+		 */
+		return -EINVAL;
+	}
+
+	ndev->reg_ofs.lnk_cntl = ndev->reg_base + SNB_NTBCNTL_OFFSET;
+	ndev->reg_ofs.lnk_stat = ndev->reg_base + SNB_SLINK_STATUS_OFFSET;
+	ndev->reg_ofs.spci_cmd = ndev->reg_base + SNB_PCICMD_OFFSET;
+
+	ndev->limits.msix_cnt = SNB_MSIX_CNT;
+	ndev->bits_per_vector = SNB_DB_BITS_PER_VEC;
+
+	return 0;
+}
+
+static int ntb_bwd_setup(struct ntb_device *ndev)
+{
+	int rc;
+	u32 val;
+
+	ndev->hw_type = BWD_HW;
+
+	rc = pci_read_config_dword(ndev->pdev, NTB_PPD_OFFSET, &val);
+	if (rc)
+		return rc;
+
+	switch ((val & BWD_PPD_CONN_TYPE) >> 8) {
+	case NTB_CONN_B2B:
+		ndev->conn_type = NTB_CONN_B2B;
+		break;
+	case NTB_CONN_RP:
+	default:
+		dev_err(&ndev->pdev->dev, "Unsupported NTB configuration\n");
+		return -EINVAL;
+	}
+
+	if (val & BWD_PPD_DEV_TYPE)
+		ndev->dev_type = NTB_DEV_DSD;
+	else
+		ndev->dev_type = NTB_DEV_USD;
+
+	/* Initiate PCI-E link training */
+	rc = pci_write_config_dword(ndev->pdev, NTB_PPD_OFFSET,
+				    val | BWD_PPD_INIT_LINK);
+	if (rc)
+		return rc;
+
+	ndev->reg_ofs.ldb = ndev->reg_base + BWD_PDOORBELL_OFFSET;
+	ndev->reg_ofs.ldb_mask = ndev->reg_base + BWD_PDBMSK_OFFSET;
+	ndev->reg_ofs.rdb = ndev->reg_base + BWD_B2B_DOORBELL_OFFSET;
+	ndev->reg_ofs.bar2_xlat = ndev->reg_base + BWD_SBAR2XLAT_OFFSET;
+	ndev->reg_ofs.bar4_xlat = ndev->reg_base + BWD_SBAR4XLAT_OFFSET;
+	ndev->reg_ofs.lnk_cntl = ndev->reg_base + BWD_NTBCNTL_OFFSET;
+	ndev->reg_ofs.lnk_stat = ndev->reg_base + BWD_LINK_STATUS_OFFSET;
+	ndev->reg_ofs.spad_read = ndev->reg_base + BWD_SPAD_OFFSET;
+	ndev->reg_ofs.spad_write = ndev->reg_base + BWD_B2B_SPAD_OFFSET;
+	ndev->reg_ofs.spci_cmd = ndev->reg_base + BWD_PCICMD_OFFSET;
+	ndev->limits.max_mw = BWD_MAX_MW;
+	ndev->limits.max_spads = BWD_MAX_SPADS;
+	ndev->limits.max_db_bits = BWD_MAX_DB_BITS;
+	ndev->limits.msix_cnt = BWD_MSIX_CNT;
+	ndev->bits_per_vector = BWD_DB_BITS_PER_VEC;
+
+	/* Since bwd doesn't have a link interrupt, setup a poll timer */
+	INIT_DELAYED_WORK(&ndev->hb_timer, bwd_link_poll);
+	INIT_DELAYED_WORK(&ndev->lr_timer, bwd_link_recovery);
+	schedule_delayed_work(&ndev->hb_timer, NTB_HB_TIMEOUT);
+
+	return 0;
+}
+
+static int ntb_device_setup(struct ntb_device *ndev)
+{
+	int rc;
+
+	if (is_ntb_xeon(ndev))
+		rc = ntb_xeon_setup(ndev);
+	else if (is_ntb_atom(ndev))
+		rc = ntb_bwd_setup(ndev);
+	else
+		rc = -ENODEV;
+
+	if (rc)
+		return rc;
+
+	if (ndev->conn_type == NTB_CONN_B2B)
+		/* Enable Bus Master and Memory Space on the secondary side */
+		writew(PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER,
+		       ndev->reg_ofs.spci_cmd);
+
+	return 0;
+}
+
+static void ntb_device_free(struct ntb_device *ndev)
+{
+	if (is_ntb_atom(ndev)) {
+		cancel_delayed_work_sync(&ndev->hb_timer);
+		cancel_delayed_work_sync(&ndev->lr_timer);
+	}
+}
+
+static irqreturn_t bwd_callback_msix_irq(int irq, void *data)
+{
+	struct ntb_db_cb *db_cb = data;
+	struct ntb_device *ndev = db_cb->ndev;
+	unsigned long mask;
+
+	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for DB %d\n", irq,
+		db_cb->db_num);
+
+	mask = readw(ndev->reg_ofs.ldb_mask);
+	set_bit(db_cb->db_num * ndev->bits_per_vector, &mask);
+	writew(mask, ndev->reg_ofs.ldb_mask);
+
+	tasklet_schedule(&db_cb->irq_work);
+
+	/* No need to check for the specific HB irq, any interrupt means
+	 * we're connected.
+	 */
+	ndev->last_ts = jiffies;
+
+	writeq((u64) 1 << db_cb->db_num, ndev->reg_ofs.ldb);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t xeon_callback_msix_irq(int irq, void *data)
+{
+	struct ntb_db_cb *db_cb = data;
+	struct ntb_device *ndev = db_cb->ndev;
+	unsigned long mask;
+
+	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for DB %d\n", irq,
+		db_cb->db_num);
+
+	mask = readw(ndev->reg_ofs.ldb_mask);
+	set_bit(db_cb->db_num * ndev->bits_per_vector, &mask);
+	writew(mask, ndev->reg_ofs.ldb_mask);
+
+	tasklet_schedule(&db_cb->irq_work);
+
+	/* On Sandybridge, there are 16 bits in the interrupt register
+	 * but only 4 vectors.  So, 5 bits are assigned to the first 3
+	 * vectors, with the 4th having a single bit for link
+	 * interrupts.
+	 */
+	writew(((1 << ndev->bits_per_vector) - 1) <<
+	       (db_cb->db_num * ndev->bits_per_vector), ndev->reg_ofs.ldb);
+
+	return IRQ_HANDLED;
+}
+
+/* Since we do not have a HW doorbell in BWD, this is only used in JF/JT */
+static irqreturn_t xeon_event_msix_irq(int irq, void *dev)
+{
+	struct ntb_device *ndev = dev;
+	int rc;
+
+	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for Events\n", irq);
+
+	rc = ntb_link_status(ndev);
+	if (rc)
+		dev_err(&ndev->pdev->dev, "Error determining link status\n");
+
+	/* bit 15 is always the link bit */
+	writew(1 << SNB_LINK_DB, ndev->reg_ofs.ldb);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t ntb_interrupt(int irq, void *dev)
+{
+	struct ntb_device *ndev = dev;
+	unsigned int i = 0;
+
+	if (is_ntb_atom(ndev)) {
+		u64 ldb = readq(ndev->reg_ofs.ldb);
+
+		dev_dbg(&ndev->pdev->dev, "irq %d - ldb = %Lx\n", irq, ldb);
+
+		while (ldb) {
+			i = __ffs(ldb);
+			ldb &= ldb - 1;
+			bwd_callback_msix_irq(irq, &ndev->db_cb[i]);
+		}
+	} else {
+		u16 ldb = readw(ndev->reg_ofs.ldb);
+
+		dev_dbg(&ndev->pdev->dev, "irq %d - ldb = %x\n", irq, ldb);
+
+		if (ldb & SNB_DB_HW_LINK) {
+			xeon_event_msix_irq(irq, dev);
+			ldb &= ~SNB_DB_HW_LINK;
+		}
+
+		while (ldb) {
+			i = __ffs(ldb);
+			ldb &= ldb - 1;
+			xeon_callback_msix_irq(irq, &ndev->db_cb[i]);
+		}
+	}
+
+	return IRQ_HANDLED;
+}
+
+static int ntb_setup_snb_msix(struct ntb_device *ndev, int msix_entries)
+{
+	struct pci_dev *pdev = ndev->pdev;
+	struct msix_entry *msix;
+	int rc, i;
+
+	if (msix_entries < ndev->limits.msix_cnt)
+		return -ENOSPC;
+
+	rc = pci_enable_msix_exact(pdev, ndev->msix_entries, msix_entries);
+	if (rc < 0)
+		return rc;
+
+	for (i = 0; i < msix_entries; i++) {
+		msix = &ndev->msix_entries[i];
+		WARN_ON(!msix->vector);
+
+		if (i == msix_entries - 1) {
+			rc = request_irq(msix->vector,
+					 xeon_event_msix_irq, 0,
+					 "ntb-event-msix", ndev);
+			if (rc)
+				goto err;
+		} else {
+			rc = request_irq(msix->vector,
+					 xeon_callback_msix_irq, 0,
+					 "ntb-callback-msix",
+					 &ndev->db_cb[i]);
+			if (rc)
+				goto err;
+		}
+	}
+
+	ndev->num_msix = msix_entries;
+	ndev->max_cbs = msix_entries - 1;
+
+	return 0;
+
+err:
+	while (--i >= 0) {
+		/* Code never reaches here for entry nr 'ndev->num_msix - 1' */
+		msix = &ndev->msix_entries[i];
+		free_irq(msix->vector, &ndev->db_cb[i]);
+	}
+
+	pci_disable_msix(pdev);
+	ndev->num_msix = 0;
+
+	return rc;
+}
+
+static int ntb_setup_bwd_msix(struct ntb_device *ndev, int msix_entries)
+{
+	struct pci_dev *pdev = ndev->pdev;
+	struct msix_entry *msix;
+	int rc, i;
+
+	msix_entries = pci_enable_msix_range(pdev, ndev->msix_entries,
+					     1, msix_entries);
+	if (msix_entries < 0)
+		return msix_entries;
+
+	for (i = 0; i < msix_entries; i++) {
+		msix = &ndev->msix_entries[i];
+		WARN_ON(!msix->vector);
+
+		rc = request_irq(msix->vector, bwd_callback_msix_irq, 0,
+				 "ntb-callback-msix", &ndev->db_cb[i]);
+		if (rc)
+			goto err;
+	}
+
+	ndev->num_msix = msix_entries;
+	ndev->max_cbs = msix_entries;
+
+	return 0;
+
+err:
+	while (--i >= 0)
+		free_irq(msix->vector, &ndev->db_cb[i]);
+
+	pci_disable_msix(pdev);
+	ndev->num_msix = 0;
+
+	return rc;
+}
+
+static int ntb_setup_msix(struct ntb_device *ndev)
+{
+	struct pci_dev *pdev = ndev->pdev;
+	int msix_entries;
+	int rc, i;
+
+	msix_entries = pci_msix_vec_count(pdev);
+	if (msix_entries < 0) {
+		rc = msix_entries;
+		goto err;
+	} else if (msix_entries > ndev->limits.msix_cnt) {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	ndev->msix_entries = kmalloc(sizeof(struct msix_entry) * msix_entries,
+				     GFP_KERNEL);
+	if (!ndev->msix_entries) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	for (i = 0; i < msix_entries; i++)
+		ndev->msix_entries[i].entry = i;
+
+	if (is_ntb_atom(ndev))
+		rc = ntb_setup_bwd_msix(ndev, msix_entries);
+	else
+		rc = ntb_setup_snb_msix(ndev, msix_entries);
+	if (rc)
+		goto err1;
+
+	return 0;
+
+err1:
+	kfree(ndev->msix_entries);
+err:
+	dev_err(&pdev->dev, "Error allocating MSI-X interrupt\n");
+	return rc;
+}
+
+static int ntb_setup_msi(struct ntb_device *ndev)
+{
+	struct pci_dev *pdev = ndev->pdev;
+	int rc;
+
+	rc = pci_enable_msi(pdev);
+	if (rc)
+		return rc;
+
+	rc = request_irq(pdev->irq, ntb_interrupt, 0, "ntb-msi", ndev);
+	if (rc) {
+		pci_disable_msi(pdev);
+		dev_err(&pdev->dev, "Error allocating MSI interrupt\n");
+		return rc;
+	}
+
+	return 0;
+}
+
+static int ntb_setup_intx(struct ntb_device *ndev)
+{
+	struct pci_dev *pdev = ndev->pdev;
+	int rc;
+
+	/* Verify intx is enabled */
+	pci_intx(pdev, 1);
+
+	rc = request_irq(pdev->irq, ntb_interrupt, IRQF_SHARED, "ntb-intx",
+			 ndev);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
+static int ntb_setup_interrupts(struct ntb_device *ndev)
+{
+	int rc;
+
+	/* On BWD, disable all interrupts.  On SNB, disable all but Link
+	 * Interrupt.  The rest will be unmasked as callbacks are registered.
+	 */
+	if (is_ntb_atom(ndev))
+		writeq(~0, ndev->reg_ofs.ldb_mask);
+	else {
+		u16 var = 1 << SNB_LINK_DB;
+		writew(~var, ndev->reg_ofs.ldb_mask);
+	}
+
+	rc = ntb_setup_msix(ndev);
+	if (!rc)
+		goto done;
+
+	ndev->bits_per_vector = 1;
+	ndev->max_cbs = ndev->limits.max_db_bits;
+
+	rc = ntb_setup_msi(ndev);
+	if (!rc)
+		goto done;
+
+	rc = ntb_setup_intx(ndev);
+	if (rc) {
+		dev_err(&ndev->pdev->dev, "no usable interrupts\n");
+		return rc;
+	}
+
+done:
+	return 0;
+}
+
+static void ntb_free_interrupts(struct ntb_device *ndev)
+{
+	struct pci_dev *pdev = ndev->pdev;
+
+	/* mask interrupts */
+	if (is_ntb_atom(ndev))
+		writeq(~0, ndev->reg_ofs.ldb_mask);
+	else
+		writew(~0, ndev->reg_ofs.ldb_mask);
+
+	if (ndev->num_msix) {
+		struct msix_entry *msix;
+		u32 i;
+
+		for (i = 0; i < ndev->num_msix; i++) {
+			msix = &ndev->msix_entries[i];
+			if (is_ntb_xeon(ndev) && i == ndev->num_msix - 1)
+				free_irq(msix->vector, ndev);
+			else
+				free_irq(msix->vector, &ndev->db_cb[i]);
+		}
+		pci_disable_msix(pdev);
+		kfree(ndev->msix_entries);
+	} else {
+		free_irq(pdev->irq, ndev);
+
+		if (pci_dev_msi_enabled(pdev))
+			pci_disable_msi(pdev);
+	}
+}
+
+static int ntb_create_callbacks(struct ntb_device *ndev)
+{
+	int i;
+
+	/* Chicken-egg issue.  We won't know how many callbacks are necessary
+	 * until we see how many MSI-X vectors we get, but these pointers need
+	 * to be passed into the MSI-X register function.  So, we allocate the
+	 * max, knowing that they might not all be used, to work around this.
+	 */
+	ndev->db_cb = kcalloc(ndev->limits.max_db_bits,
+			      sizeof(struct ntb_db_cb),
+			      GFP_KERNEL);
+	if (!ndev->db_cb)
+		return -ENOMEM;
+
+	for (i = 0; i < ndev->limits.max_db_bits; i++) {
+		ndev->db_cb[i].db_num = i;
+		ndev->db_cb[i].ndev = ndev;
+	}
+
+	return 0;
+}
+
+static void ntb_free_callbacks(struct ntb_device *ndev)
+{
+	int i;
+
+	for (i = 0; i < ndev->limits.max_db_bits; i++)
+		ntb_unregister_db_callback(ndev, i);
+
+	kfree(ndev->db_cb);
+}
+
+static ssize_t ntb_debugfs_read(struct file *filp, char __user *ubuf,
+				size_t count, loff_t *offp)
+{
+	struct ntb_device *ndev;
+	char *buf;
+	ssize_t ret, offset, out_count;
+
+	out_count = 500;
+
+	buf = kmalloc(out_count, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	ndev = filp->private_data;
+	offset = 0;
+	offset += snprintf(buf + offset, out_count - offset,
+			   "NTB Device Information:\n");
+	offset += snprintf(buf + offset, out_count - offset,
+			   "Connection Type - \t\t%s\n",
+			   ndev->conn_type == NTB_CONN_TRANSPARENT ?
+			   "Transparent" : (ndev->conn_type == NTB_CONN_B2B) ?
+			   "Back to back" : "Root Port");
+	offset += snprintf(buf + offset, out_count - offset,
+			   "Device Type - \t\t\t%s\n",
+			   ndev->dev_type == NTB_DEV_USD ?
+			   "DSD/USP" : "USD/DSP");
+	offset += snprintf(buf + offset, out_count - offset,
+			   "Max Number of Callbacks - \t%u\n",
+			   ntb_max_cbs(ndev));
+	offset += snprintf(buf + offset, out_count - offset,
+			   "Link Status - \t\t\t%s\n",
+			   ntb_hw_link_status(ndev) ? "Up" : "Down");
+	if (ntb_hw_link_status(ndev)) {
+		offset += snprintf(buf + offset, out_count - offset,
+				   "Link Speed - \t\t\tPCI-E Gen %u\n",
+				   ndev->link_speed);
+		offset += snprintf(buf + offset, out_count - offset,
+				   "Link Width - \t\t\tx%u\n",
+				   ndev->link_width);
+	}
+
+	if (is_ntb_xeon(ndev)) {
+		u32 status32;
+		u16 status16;
+		int rc;
+
+		offset += snprintf(buf + offset, out_count - offset,
+				   "\nNTB Device Statistics:\n");
+		offset += snprintf(buf + offset, out_count - offset,
+				   "Upstream Memory Miss - \t%u\n",
+				   readw(ndev->reg_base +
+					 SNB_USMEMMISS_OFFSET));
+
+		offset += snprintf(buf + offset, out_count - offset,
+				   "\nNTB Hardware Errors:\n");
+
+		rc = pci_read_config_word(ndev->pdev, SNB_DEVSTS_OFFSET,
+					  &status16);
+		if (!rc)
+			offset += snprintf(buf + offset, out_count - offset,
+					   "DEVSTS - \t%#06x\n", status16);
+
+		rc = pci_read_config_word(ndev->pdev, SNB_LINK_STATUS_OFFSET,
+					  &status16);
+		if (!rc)
+			offset += snprintf(buf + offset, out_count - offset,
+					   "LNKSTS - \t%#06x\n", status16);
+
+		rc = pci_read_config_dword(ndev->pdev, SNB_UNCERRSTS_OFFSET,
+					   &status32);
+		if (!rc)
+			offset += snprintf(buf + offset, out_count - offset,
+					   "UNCERRSTS - \t%#010x\n", status32);
+
+		rc = pci_read_config_dword(ndev->pdev, SNB_CORERRSTS_OFFSET,
+					   &status32);
+		if (!rc)
+			offset += snprintf(buf + offset, out_count - offset,
+					   "CORERRSTS - \t%#010x\n", status32);
+	}
+
+	if (offset > out_count)
+		offset = out_count;
+
+	ret = simple_read_from_buffer(ubuf, count, offp, buf, offset);
+	kfree(buf);
+	return ret;
+}
+
+static const struct file_operations ntb_debugfs_info = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.read = ntb_debugfs_read,
+};
+
+static void ntb_setup_debugfs(struct ntb_device *ndev)
+{
+	if (!debugfs_initialized())
+		return;
+
+	if (!debugfs_dir)
+		debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
+
+	ndev->debugfs_dir = debugfs_create_dir(pci_name(ndev->pdev),
+					       debugfs_dir);
+	if (ndev->debugfs_dir)
+		ndev->debugfs_info = debugfs_create_file("info", S_IRUSR,
+							 ndev->debugfs_dir,
+							 ndev,
+							 &ntb_debugfs_info);
+}
+
+static void ntb_free_debugfs(struct ntb_device *ndev)
+{
+	debugfs_remove_recursive(ndev->debugfs_dir);
+
+	if (debugfs_dir && simple_empty(debugfs_dir)) {
+		debugfs_remove_recursive(debugfs_dir);
+		debugfs_dir = NULL;
+	}
+}
+
+static void ntb_hw_link_up(struct ntb_device *ndev)
+{
+	if (ndev->conn_type == NTB_CONN_TRANSPARENT)
+		ntb_link_event(ndev, NTB_LINK_UP);
+	else {
+		u32 ntb_cntl;
+
+		/* Let's bring the NTB link up */
+		ntb_cntl = readl(ndev->reg_ofs.lnk_cntl);
+		ntb_cntl &= ~(NTB_CNTL_LINK_DISABLE | NTB_CNTL_CFG_LOCK);
+		ntb_cntl |= NTB_CNTL_P2S_BAR23_SNOOP | NTB_CNTL_S2P_BAR23_SNOOP;
+		ntb_cntl |= NTB_CNTL_P2S_BAR4_SNOOP | NTB_CNTL_S2P_BAR4_SNOOP;
+		if (ndev->split_bar)
+			ntb_cntl |= NTB_CNTL_P2S_BAR5_SNOOP |
+				    NTB_CNTL_S2P_BAR5_SNOOP;
+
+		writel(ntb_cntl, ndev->reg_ofs.lnk_cntl);
+	}
+}
+
+static void ntb_hw_link_down(struct ntb_device *ndev)
+{
+	u32 ntb_cntl;
+
+	if (ndev->conn_type == NTB_CONN_TRANSPARENT) {
+		ntb_link_event(ndev, NTB_LINK_DOWN);
+		return;
+	}
+
+	/* Bring NTB link down */
+	ntb_cntl = readl(ndev->reg_ofs.lnk_cntl);
+	ntb_cntl &= ~(NTB_CNTL_P2S_BAR23_SNOOP | NTB_CNTL_S2P_BAR23_SNOOP);
+	ntb_cntl &= ~(NTB_CNTL_P2S_BAR4_SNOOP | NTB_CNTL_S2P_BAR4_SNOOP);
+	if (ndev->split_bar)
+		ntb_cntl &= ~(NTB_CNTL_P2S_BAR5_SNOOP |
+			      NTB_CNTL_S2P_BAR5_SNOOP);
+	ntb_cntl |= NTB_CNTL_LINK_DISABLE | NTB_CNTL_CFG_LOCK;
+	writel(ntb_cntl, ndev->reg_ofs.lnk_cntl);
+}
+
+static void ntb_max_mw_detect(struct ntb_device *ndev)
+{
+	if (ndev->split_bar)
+		ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
+	else
+		ndev->limits.max_mw = SNB_MAX_MW;
+}
+
+static int ntb_xeon_detect(struct ntb_device *ndev)
+{
+	int rc, bars_mask;
+	u32 bars;
+	u8 ppd;
+
+	ndev->hw_type = SNB_HW;
+
+	rc = pci_read_config_byte(ndev->pdev, NTB_PPD_OFFSET, &ppd);
+	if (rc)
+		return -EIO;
+
+	if (ppd & SNB_PPD_DEV_TYPE)
+		ndev->dev_type = NTB_DEV_USD;
+	else
+		ndev->dev_type = NTB_DEV_DSD;
+
+	ndev->split_bar = (ppd & SNB_PPD_SPLIT_BAR) ? 1 : 0;
+
+	switch (ppd & SNB_PPD_CONN_TYPE) {
+	case NTB_CONN_B2B:
+		dev_info(&ndev->pdev->dev, "Conn Type = B2B\n");
+		ndev->conn_type = NTB_CONN_B2B;
+		break;
+	case NTB_CONN_RP:
+		dev_info(&ndev->pdev->dev, "Conn Type = RP\n");
+		ndev->conn_type = NTB_CONN_RP;
+		break;
+	case NTB_CONN_TRANSPARENT:
+		dev_info(&ndev->pdev->dev, "Conn Type = TRANSPARENT\n");
+		ndev->conn_type = NTB_CONN_TRANSPARENT;
+		/*
+		 * This mode is default to USD/DSP. HW does not report
+		 * properly in transparent mode as it has no knowledge of
+		 * NTB. We will just force correct here.
+		 */
+		ndev->dev_type = NTB_DEV_USD;
+
+		/*
+		 * This is a way for transparent BAR to figure out if we
+		 * are doing split BAR or not. There is no way for the hw
+		 * on the transparent side to know and set the PPD.
+		 */
+		bars_mask = pci_select_bars(ndev->pdev, IORESOURCE_MEM);
+		bars = hweight32(bars_mask);
+		if (bars == (HSX_SPLITBAR_MAX_MW + 1))
+			ndev->split_bar = 1;
+
+		break;
+	default:
+		dev_err(&ndev->pdev->dev, "Unknown PPD %x\n", ppd);
+		return -ENODEV;
+	}
+
+	ntb_max_mw_detect(ndev);
+
+	return 0;
+}
+
+static int ntb_atom_detect(struct ntb_device *ndev)
+{
+	int rc;
+	u32 ppd;
+
+	ndev->hw_type = BWD_HW;
+	ndev->limits.max_mw = BWD_MAX_MW;
+
+	rc = pci_read_config_dword(ndev->pdev, NTB_PPD_OFFSET, &ppd);
+	if (rc)
+		return rc;
+
+	switch ((ppd & BWD_PPD_CONN_TYPE) >> 8) {
+	case NTB_CONN_B2B:
+		dev_info(&ndev->pdev->dev, "Conn Type = B2B\n");
+		ndev->conn_type = NTB_CONN_B2B;
+		break;
+	case NTB_CONN_RP:
+	default:
+		dev_err(&ndev->pdev->dev, "Unsupported NTB configuration\n");
+		return -EINVAL;
+	}
+
+	if (ppd & BWD_PPD_DEV_TYPE)
+		ndev->dev_type = NTB_DEV_DSD;
+	else
+		ndev->dev_type = NTB_DEV_USD;
+
+	return 0;
+}
+
+static int ntb_device_detect(struct ntb_device *ndev)
+{
+	int rc;
+
+	if (is_ntb_xeon(ndev))
+		rc = ntb_xeon_detect(ndev);
+	else if (is_ntb_atom(ndev))
+		rc = ntb_atom_detect(ndev);
+	else
+		rc = -ENODEV;
+
+	dev_info(&ndev->pdev->dev, "Device Type = %s\n",
+		 ndev->dev_type == NTB_DEV_USD ? "USD/DSP" : "DSD/USP");
+
+	return 0;
+}
+
+static int ntb_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct ntb_device *ndev;
+	int rc, i;
+
+	ndev = kzalloc(sizeof(struct ntb_device), GFP_KERNEL);
+	if (!ndev)
+		return -ENOMEM;
+
+	ndev->pdev = pdev;
+
+	ntb_set_errata_flags(ndev);
+
+	ndev->link_status = NTB_LINK_DOWN;
+	pci_set_drvdata(pdev, ndev);
+	ntb_setup_debugfs(ndev);
+
+	rc = pci_enable_device(pdev);
+	if (rc)
+		goto err;
+
+	pci_set_master(ndev->pdev);
+
+	rc = ntb_device_detect(ndev);
+	if (rc)
+		goto err;
+
+	ndev->mw = kcalloc(ndev->limits.max_mw, sizeof(struct ntb_mw),
+			   GFP_KERNEL);
+	if (!ndev->mw) {
+		rc = -ENOMEM;
+		goto err1;
+	}
+
+	if (ndev->split_bar)
+		rc = pci_request_selected_regions(pdev, NTB_SPLITBAR_MASK,
+						  KBUILD_MODNAME);
+	else
+		rc = pci_request_selected_regions(pdev, NTB_BAR_MASK,
+						  KBUILD_MODNAME);
+
+	if (rc)
+		goto err2;
+
+	ndev->reg_base = pci_ioremap_bar(pdev, NTB_BAR_MMIO);
+	if (!ndev->reg_base) {
+		dev_warn(&pdev->dev, "Cannot remap BAR 0\n");
+		rc = -EIO;
+		goto err3;
+	}
+
+	for (i = 0; i < ndev->limits.max_mw; i++) {
+		ndev->mw[i].bar_sz = pci_resource_len(pdev, MW_TO_BAR(i));
+
+		/*
+		 * with the errata we need to steal last of the memory
+		 * windows for workarounds and they point to MMIO registers.
+		 */
+		if ((ndev->wa_flags & WA_SNB_ERR) &&
+		    (i == (ndev->limits.max_mw - 1))) {
+			ndev->mw[i].vbase =
+				ioremap_nocache(pci_resource_start(pdev,
+							MW_TO_BAR(i)),
+						ndev->mw[i].bar_sz);
+		} else {
+			ndev->mw[i].vbase =
+				ioremap_wc(pci_resource_start(pdev,
+							MW_TO_BAR(i)),
+					   ndev->mw[i].bar_sz);
+		}
+
+		dev_info(&pdev->dev, "MW %d size %llu\n", i,
+			 (unsigned long long) ndev->mw[i].bar_sz);
+		if (!ndev->mw[i].vbase) {
+			dev_warn(&pdev->dev, "Cannot remap BAR %d\n",
+				 MW_TO_BAR(i));
+			rc = -EIO;
+			goto err4;
+		}
+	}
+
+	rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (rc) {
+		rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (rc)
+			goto err4;
+
+		dev_warn(&pdev->dev, "Cannot DMA highmem\n");
+	}
+
+	rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (rc) {
+		rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (rc)
+			goto err4;
+
+		dev_warn(&pdev->dev, "Cannot DMA consistent highmem\n");
+	}
+
+	rc = ntb_device_setup(ndev);
+	if (rc)
+		goto err4;
+
+	rc = ntb_create_callbacks(ndev);
+	if (rc)
+		goto err5;
+
+	rc = ntb_setup_interrupts(ndev);
+	if (rc)
+		goto err6;
+
+	/* The scratchpad registers keep the values between rmmod/insmod,
+	 * blast them now
+	 */
+	for (i = 0; i < ndev->limits.max_spads; i++) {
+		ntb_write_local_spad(ndev, i, 0);
+		ntb_write_remote_spad(ndev, i, 0);
+	}
+
+	rc = ntb_transport_init(pdev);
+	if (rc)
+		goto err7;
+
+	ntb_hw_link_up(ndev);
+
+	return 0;
+
+err7:
+	ntb_free_interrupts(ndev);
+err6:
+	ntb_free_callbacks(ndev);
+err5:
+	ntb_device_free(ndev);
+err4:
+	for (i--; i >= 0; i--)
+		iounmap(ndev->mw[i].vbase);
+	iounmap(ndev->reg_base);
+err3:
+	if (ndev->split_bar)
+		pci_release_selected_regions(pdev, NTB_SPLITBAR_MASK);
+	else
+		pci_release_selected_regions(pdev, NTB_BAR_MASK);
+err2:
+	kfree(ndev->mw);
+err1:
+	pci_disable_device(pdev);
+err:
+	ntb_free_debugfs(ndev);
+	kfree(ndev);
+
+	dev_err(&pdev->dev, "Error loading %s module\n", KBUILD_MODNAME);
+	return rc;
+}
+
+static void ntb_pci_remove(struct pci_dev *pdev)
+{
+	struct ntb_device *ndev = pci_get_drvdata(pdev);
+	int i;
+
+	ntb_hw_link_down(ndev);
+
+	ntb_transport_free(ndev->ntb_transport);
+
+	ntb_free_interrupts(ndev);
+	ntb_free_callbacks(ndev);
+	ntb_device_free(ndev);
+
+	/* need to reset max_mw limits so we can unmap properly */
+	if (ndev->hw_type == SNB_HW)
+		ntb_max_mw_detect(ndev);
+
+	for (i = 0; i < ndev->limits.max_mw; i++)
+		iounmap(ndev->mw[i].vbase);
+
+	kfree(ndev->mw);
+	iounmap(ndev->reg_base);
+	if (ndev->split_bar)
+		pci_release_selected_regions(pdev, NTB_SPLITBAR_MASK);
+	else
+		pci_release_selected_regions(pdev, NTB_BAR_MASK);
+	pci_disable_device(pdev);
+	ntb_free_debugfs(ndev);
+	kfree(ndev);
+}
+
+static struct pci_driver ntb_pci_driver = {
+	.name = KBUILD_MODNAME,
+	.id_table = ntb_pci_tbl,
+	.probe = ntb_pci_probe,
+	.remove = ntb_pci_remove,
+};
+
+module_pci_driver(ntb_pci_driver);
diff --git a/drivers/ntb/hw/intel/ntb_hw_intel.h b/drivers/ntb/hw/intel/ntb_hw_intel.h
new file mode 100644
index 000000000000..935610454f70
--- /dev/null
+++ b/drivers/ntb/hw/intel/ntb_hw_intel.h
@@ -0,0 +1,385 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copy
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Intel PCIe NTB Linux driver
+ *
+ * Contact Information:
+ * Jon Mason <jon.mason@intel.com>
+ */
+#include <linux/ntb_transport.h>
+
+#define NTB_LINK_STATUS_ACTIVE	0x2000
+#define NTB_LINK_SPEED_MASK	0x000f
+#define NTB_LINK_WIDTH_MASK	0x03f0
+
+#define SNB_MSIX_CNT		4
+#define SNB_MAX_B2B_SPADS	16
+#define SNB_MAX_COMPAT_SPADS	16
+/* Reserve the uppermost bit for link interrupt */
+#define SNB_MAX_DB_BITS		15
+#define SNB_LINK_DB		15
+#define SNB_DB_BITS_PER_VEC	5
+#define HSX_SPLITBAR_MAX_MW	3
+#define SNB_MAX_MW		2
+#define SNB_ERRATA_MAX_MW	1
+
+#define SNB_DB_HW_LINK		0x8000
+
+#define SNB_UNCERRSTS_OFFSET	0x014C
+#define SNB_CORERRSTS_OFFSET	0x0158
+#define SNB_LINK_STATUS_OFFSET	0x01A2
+#define SNB_PCICMD_OFFSET	0x0504
+#define SNB_DEVCTRL_OFFSET	0x0598
+#define SNB_DEVSTS_OFFSET	0x059A
+#define SNB_SLINK_STATUS_OFFSET	0x05A2
+
+#define SNB_PBAR2LMT_OFFSET	0x0000
+#define SNB_PBAR4LMT_OFFSET	0x0008
+#define SNB_PBAR5LMT_OFFSET	0x000C
+#define SNB_PBAR2XLAT_OFFSET	0x0010
+#define SNB_PBAR4XLAT_OFFSET	0x0018
+#define SNB_PBAR5XLAT_OFFSET	0x001C
+#define SNB_SBAR2LMT_OFFSET	0x0020
+#define SNB_SBAR4LMT_OFFSET	0x0028
+#define SNB_SBAR5LMT_OFFSET	0x002C
+#define SNB_SBAR2XLAT_OFFSET	0x0030
+#define SNB_SBAR4XLAT_OFFSET	0x0038
+#define SNB_SBAR5XLAT_OFFSET	0x003C
+#define SNB_SBAR0BASE_OFFSET	0x0040
+#define SNB_SBAR2BASE_OFFSET	0x0048
+#define SNB_SBAR4BASE_OFFSET	0x0050
+#define SNB_SBAR5BASE_OFFSET	0x0054
+#define SNB_NTBCNTL_OFFSET	0x0058
+#define SNB_SBDF_OFFSET		0x005C
+#define SNB_PDOORBELL_OFFSET	0x0060
+#define SNB_PDBMSK_OFFSET	0x0062
+#define SNB_SDOORBELL_OFFSET	0x0064
+#define SNB_SDBMSK_OFFSET	0x0066
+#define SNB_USMEMMISS_OFFSET	0x0070
+#define SNB_SPAD_OFFSET		0x0080
+#define SNB_SPADSEMA4_OFFSET	0x00c0
+#define SNB_WCCNTRL_OFFSET	0x00e0
+#define SNB_B2B_SPAD_OFFSET	0x0100
+#define SNB_B2B_DOORBELL_OFFSET	0x0140
+#define SNB_B2B_XLAT_OFFSETL	0x0144
+#define SNB_B2B_XLAT_OFFSETU	0x0148
+
+/*
+ * The addresses are setup so the 32bit BARs can function. Thus
+ * the addresses are all in 32bit space
+ */
+#define SNB_MBAR01_USD_ADDR	0x000000002100000CULL
+#define SNB_MBAR23_USD_ADDR	0x000000004100000CULL
+#define SNB_MBAR4_USD_ADDR	0x000000008100000CULL
+#define SNB_MBAR5_USD_ADDR	0x00000000A100000CULL
+#define SNB_MBAR01_DSD_ADDR	0x000000002000000CULL
+#define SNB_MBAR23_DSD_ADDR	0x000000004000000CULL
+#define SNB_MBAR4_DSD_ADDR	0x000000008000000CULL
+#define SNB_MBAR5_DSD_ADDR	0x00000000A000000CULL
+
+#define BWD_MSIX_CNT		34
+#define BWD_MAX_SPADS		16
+#define BWD_MAX_DB_BITS		34
+#define BWD_DB_BITS_PER_VEC	1
+#define BWD_MAX_MW		2
+
+#define BWD_PCICMD_OFFSET	0xb004
+#define BWD_MBAR23_OFFSET	0xb018
+#define BWD_MBAR45_OFFSET	0xb020
+#define BWD_DEVCTRL_OFFSET	0xb048
+#define BWD_LINK_STATUS_OFFSET	0xb052
+#define BWD_ERRCORSTS_OFFSET	0xb110
+
+#define BWD_SBAR2XLAT_OFFSET	0x0008
+#define BWD_SBAR4XLAT_OFFSET	0x0010
+#define BWD_PDOORBELL_OFFSET	0x0020
+#define BWD_PDBMSK_OFFSET	0x0028
+#define BWD_NTBCNTL_OFFSET	0x0060
+#define BWD_EBDF_OFFSET		0x0064
+#define BWD_SPAD_OFFSET		0x0080
+#define BWD_SPADSEMA_OFFSET	0x00c0
+#define BWD_STKYSPAD_OFFSET	0x00c4
+#define BWD_PBAR2XLAT_OFFSET	0x8008
+#define BWD_PBAR4XLAT_OFFSET	0x8010
+#define BWD_B2B_DOORBELL_OFFSET	0x8020
+#define BWD_B2B_SPAD_OFFSET	0x8080
+#define BWD_B2B_SPADSEMA_OFFSET	0x80c0
+#define BWD_B2B_STKYSPAD_OFFSET	0x80c4
+
+#define BWD_MODPHY_PCSREG4	0x1c004
+#define BWD_MODPHY_PCSREG6	0x1c006
+
+#define BWD_IP_BASE		0xC000
+#define BWD_DESKEWSTS_OFFSET	(BWD_IP_BASE + 0x3024)
+#define BWD_LTSSMERRSTS0_OFFSET (BWD_IP_BASE + 0x3180)
+#define BWD_LTSSMSTATEJMP_OFFSET	(BWD_IP_BASE + 0x3040)
+#define BWD_IBSTERRRCRVSTS0_OFFSET	(BWD_IP_BASE + 0x3324)
+
+#define BWD_DESKEWSTS_DBERR	(1 << 15)
+#define BWD_LTSSMERRSTS0_UNEXPECTEDEI	(1 << 20)
+#define BWD_LTSSMSTATEJMP_FORCEDETECT	(1 << 2)
+#define BWD_IBIST_ERR_OFLOW	0x7FFF7FFF
+
+#define NTB_CNTL_CFG_LOCK		(1 << 0)
+#define NTB_CNTL_LINK_DISABLE		(1 << 1)
+#define NTB_CNTL_S2P_BAR23_SNOOP	(1 << 2)
+#define NTB_CNTL_P2S_BAR23_SNOOP	(1 << 4)
+#define NTB_CNTL_S2P_BAR4_SNOOP	(1 << 6)
+#define NTB_CNTL_P2S_BAR4_SNOOP	(1 << 8)
+#define NTB_CNTL_S2P_BAR5_SNOOP	(1 << 12)
+#define NTB_CNTL_P2S_BAR5_SNOOP	(1 << 14)
+#define BWD_CNTL_LINK_DOWN		(1 << 16)
+
+#define NTB_PPD_OFFSET		0x00D4
+#define SNB_PPD_CONN_TYPE	0x0003
+#define SNB_PPD_DEV_TYPE	0x0010
+#define SNB_PPD_SPLIT_BAR	(1 << 6)
+#define BWD_PPD_INIT_LINK	0x0008
+#define BWD_PPD_CONN_TYPE	0x0300
+#define BWD_PPD_DEV_TYPE	0x1000
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_JSF		0x3725
+#define PCI_DEVICE_ID_INTEL_NTB_PS_JSF		0x3726
+#define PCI_DEVICE_ID_INTEL_NTB_SS_JSF		0x3727
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_SNB		0x3C0D
+#define PCI_DEVICE_ID_INTEL_NTB_PS_SNB		0x3C0E
+#define PCI_DEVICE_ID_INTEL_NTB_SS_SNB		0x3C0F
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_IVT		0x0E0D
+#define PCI_DEVICE_ID_INTEL_NTB_PS_IVT		0x0E0E
+#define PCI_DEVICE_ID_INTEL_NTB_SS_IVT		0x0E0F
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_HSX		0x2F0D
+#define PCI_DEVICE_ID_INTEL_NTB_PS_HSX		0x2F0E
+#define PCI_DEVICE_ID_INTEL_NTB_SS_HSX		0x2F0F
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_BWD		0x0C4E
+
+#ifndef readq
+static inline u64 readq(void __iomem *addr)
+{
+	return readl(addr) | (((u64) readl(addr + 4)) << 32LL);
+}
+#endif
+
+#ifndef writeq
+static inline void writeq(u64 val, void __iomem *addr)
+{
+	writel(val & 0xffffffff, addr);
+	writel(val >> 32, addr + 4);
+}
+#endif
+
+#define NTB_BAR_MMIO		0
+#define NTB_BAR_23		2
+#define NTB_BAR_4		4
+#define NTB_BAR_5		5
+
+#define NTB_BAR_MASK		((1 << NTB_BAR_MMIO) | (1 << NTB_BAR_23) |\
+				 (1 << NTB_BAR_4))
+#define NTB_SPLITBAR_MASK	((1 << NTB_BAR_MMIO) | (1 << NTB_BAR_23) |\
+				 (1 << NTB_BAR_4) | (1 << NTB_BAR_5))
+
+#define NTB_HB_TIMEOUT		msecs_to_jiffies(1000)
+
+enum ntb_hw_event {
+	NTB_EVENT_SW_EVENT0 = 0,
+	NTB_EVENT_SW_EVENT1,
+	NTB_EVENT_SW_EVENT2,
+	NTB_EVENT_HW_ERROR,
+	NTB_EVENT_HW_LINK_UP,
+	NTB_EVENT_HW_LINK_DOWN,
+};
+
+struct ntb_mw {
+	dma_addr_t phys_addr;
+	void __iomem *vbase;
+	resource_size_t bar_sz;
+};
+
+struct ntb_db_cb {
+	int (*callback)(void *data, int db_num);
+	unsigned int db_num;
+	void *data;
+	struct ntb_device *ndev;
+	struct tasklet_struct irq_work;
+};
+
+#define WA_SNB_ERR	0x00000001
+
+struct ntb_device {
+	struct pci_dev *pdev;
+	struct msix_entry *msix_entries;
+	void __iomem *reg_base;
+	struct ntb_mw *mw;
+	struct {
+		unsigned char max_mw;
+		unsigned char max_spads;
+		unsigned char max_db_bits;
+		unsigned char msix_cnt;
+	} limits;
+	struct {
+		void __iomem *ldb;
+		void __iomem *ldb_mask;
+		void __iomem *rdb;
+		void __iomem *bar2_xlat;
+		void __iomem *bar4_xlat;
+		void __iomem *bar5_xlat;
+		void __iomem *spad_write;
+		void __iomem *spad_read;
+		void __iomem *lnk_cntl;
+		void __iomem *lnk_stat;
+		void __iomem *spci_cmd;
+	} reg_ofs;
+	struct ntb_transport *ntb_transport;
+	void (*event_cb)(void *handle, enum ntb_hw_event event);
+
+	struct ntb_db_cb *db_cb;
+	unsigned char hw_type;
+	unsigned char conn_type;
+	unsigned char dev_type;
+	unsigned char num_msix;
+	unsigned char bits_per_vector;
+	unsigned char max_cbs;
+	unsigned char link_width;
+	unsigned char link_speed;
+	unsigned char link_status;
+	unsigned char split_bar;
+
+	struct delayed_work hb_timer;
+	unsigned long last_ts;
+
+	struct delayed_work lr_timer;
+
+	struct dentry *debugfs_dir;
+	struct dentry *debugfs_info;
+
+	unsigned int wa_flags;
+};
+
+/**
+ * ntb_max_cbs() - return the max callbacks
+ * @ndev: pointer to ntb_device instance
+ *
+ * Given the ntb pointer, return the maximum number of callbacks
+ *
+ * RETURNS: the maximum number of callbacks
+ */
+static inline unsigned char ntb_max_cbs(struct ntb_device *ndev)
+{
+	return ndev->max_cbs;
+}
+
+/**
+ * ntb_max_mw() - return the max number of memory windows
+ * @ndev: pointer to ntb_device instance
+ *
+ * Given the ntb pointer, return the maximum number of memory windows
+ *
+ * RETURNS: the maximum number of memory windows
+ */
+static inline unsigned char ntb_max_mw(struct ntb_device *ndev)
+{
+	return ndev->limits.max_mw;
+}
+
+/**
+ * ntb_hw_link_status() - return the hardware link status
+ * @ndev: pointer to ntb_device instance
+ *
+ * Returns true if the hardware is connected to the remote system
+ *
+ * RETURNS: true or false based on the hardware link state
+ */
+static inline bool ntb_hw_link_status(struct ntb_device *ndev)
+{
+	return ndev->link_status == NTB_LINK_UP;
+}
+
+/**
+ * ntb_query_pdev() - return the pci_dev pointer
+ * @ndev: pointer to ntb_device instance
+ *
+ * Given the ntb pointer, return the pci_dev pointer for the NTB hardware device
+ *
+ * RETURNS: a pointer to the ntb pci_dev
+ */
+static inline struct pci_dev *ntb_query_pdev(struct ntb_device *ndev)
+{
+	return ndev->pdev;
+}
+
+/**
+ * ntb_query_debugfs() - return the debugfs pointer
+ * @ndev: pointer to ntb_device instance
+ *
+ * Given the ntb pointer, return the debugfs directory pointer for the NTB
+ * hardware device
+ *
+ * RETURNS: a pointer to the debugfs directory
+ */
+static inline struct dentry *ntb_query_debugfs(struct ntb_device *ndev)
+{
+	return ndev->debugfs_dir;
+}
+
+struct ntb_device *ntb_register_transport(struct pci_dev *pdev,
+					  void *transport);
+void ntb_unregister_transport(struct ntb_device *ndev);
+void ntb_set_mw_addr(struct ntb_device *ndev, unsigned int mw, u64 addr);
+int ntb_register_db_callback(struct ntb_device *ndev, unsigned int idx,
+			     void *data, int (*db_cb_func)(void *data,
+							   int db_num));
+void ntb_unregister_db_callback(struct ntb_device *ndev, unsigned int idx);
+int ntb_register_event_callback(struct ntb_device *ndev,
+				void (*event_cb_func)(void *handle,
+						      enum ntb_hw_event event));
+void ntb_unregister_event_callback(struct ntb_device *ndev);
+int ntb_get_max_spads(struct ntb_device *ndev);
+int ntb_write_local_spad(struct ntb_device *ndev, unsigned int idx, u32 val);
+int ntb_read_local_spad(struct ntb_device *ndev, unsigned int idx, u32 *val);
+int ntb_write_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 val);
+int ntb_read_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 *val);
+resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned int mw);
+void __iomem *ntb_get_mw_vbase(struct ntb_device *ndev, unsigned int mw);
+u64 ntb_get_mw_size(struct ntb_device *ndev, unsigned int mw);
+void ntb_ring_doorbell(struct ntb_device *ndev, unsigned int idx);
+void *ntb_find_transport(struct pci_dev *pdev);
+
+int ntb_transport_init(struct pci_dev *pdev);
+void ntb_transport_free(void *transport);
diff --git a/drivers/ntb/ntb_hw.c b/drivers/ntb/ntb_hw.c
deleted file mode 100644
index 3f6738612f45..000000000000
--- a/drivers/ntb/ntb_hw.c
+++ /dev/null
@@ -1,1895 +0,0 @@
-/*
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- *   redistributing this file, you may do so under either license.
- *
- *   GPL LICENSE SUMMARY
- *
- *   Copyright(c) 2012 Intel Corporation. All rights reserved.
- *
- *   This program is free software; you can redistribute it and/or modify
- *   it under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
- *
- *   BSD LICENSE
- *
- *   Copyright(c) 2012 Intel Corporation. All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copy
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * Intel PCIe NTB Linux driver
- *
- * Contact Information:
- * Jon Mason <jon.mason@intel.com>
- */
-#include <linux/debugfs.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-#include "ntb_hw.h"
-#include "ntb_regs.h"
-
-#define NTB_NAME	"Intel(R) PCI-E Non-Transparent Bridge Driver"
-#define NTB_VER		"1.0"
-
-MODULE_DESCRIPTION(NTB_NAME);
-MODULE_VERSION(NTB_VER);
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_AUTHOR("Intel Corporation");
-
-enum {
-	NTB_CONN_TRANSPARENT = 0,
-	NTB_CONN_B2B,
-	NTB_CONN_RP,
-};
-
-enum {
-	NTB_DEV_USD = 0,
-	NTB_DEV_DSD,
-};
-
-enum {
-	SNB_HW = 0,
-	BWD_HW,
-};
-
-static struct dentry *debugfs_dir;
-
-#define BWD_LINK_RECOVERY_TIME	500
-
-/* Translate memory window 0,1,2 to BAR 2,4,5 */
-#define MW_TO_BAR(mw)	(mw == 0 ? 2 : (mw == 1 ? 4 : 5))
-
-static const struct pci_device_id ntb_pci_tbl[] = {
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_BWD)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_JSF)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_SNB)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_IVT)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_HSX)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_JSF)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_SNB)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_IVT)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_HSX)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_JSF)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_SNB)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_IVT)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_HSX)},
-	{0}
-};
-MODULE_DEVICE_TABLE(pci, ntb_pci_tbl);
-
-static int is_ntb_xeon(struct ntb_device *ndev)
-{
-	switch (ndev->pdev->device) {
-	case PCI_DEVICE_ID_INTEL_NTB_SS_JSF:
-	case PCI_DEVICE_ID_INTEL_NTB_SS_SNB:
-	case PCI_DEVICE_ID_INTEL_NTB_SS_IVT:
-	case PCI_DEVICE_ID_INTEL_NTB_SS_HSX:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_JSF:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_SNB:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_IVT:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_HSX:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_JSF:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_SNB:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_IVT:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX:
-		return 1;
-	default:
-		return 0;
-	}
-
-	return 0;
-}
-
-static int is_ntb_atom(struct ntb_device *ndev)
-{
-	switch (ndev->pdev->device) {
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_BWD:
-		return 1;
-	default:
-		return 0;
-	}
-
-	return 0;
-}
-
-static void ntb_set_errata_flags(struct ntb_device *ndev)
-{
-	switch (ndev->pdev->device) {
-	/*
-	 * this workaround applies to all platform up to IvyBridge
-	 * Haswell has splitbar support and use a different workaround
-	 */
-	case PCI_DEVICE_ID_INTEL_NTB_SS_JSF:
-	case PCI_DEVICE_ID_INTEL_NTB_SS_SNB:
-	case PCI_DEVICE_ID_INTEL_NTB_SS_IVT:
-	case PCI_DEVICE_ID_INTEL_NTB_SS_HSX:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_JSF:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_SNB:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_IVT:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_HSX:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_JSF:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_SNB:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_IVT:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX:
-		ndev->wa_flags |= WA_SNB_ERR;
-		break;
-	}
-}
-
-/**
- * ntb_register_event_callback() - register event callback
- * @ndev: pointer to ntb_device instance
- * @func: callback function to register
- *
- * This function registers a callback for any HW driver events such as link
- * up/down, power management notices and etc.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_register_event_callback(struct ntb_device *ndev,
-				void (*func)(void *handle,
-					     enum ntb_hw_event event))
-{
-	if (ndev->event_cb)
-		return -EINVAL;
-
-	ndev->event_cb = func;
-
-	return 0;
-}
-
-/**
- * ntb_unregister_event_callback() - unregisters the event callback
- * @ndev: pointer to ntb_device instance
- *
- * This function unregisters the existing callback from transport
- */
-void ntb_unregister_event_callback(struct ntb_device *ndev)
-{
-	ndev->event_cb = NULL;
-}
-
-static void ntb_irq_work(unsigned long data)
-{
-	struct ntb_db_cb *db_cb = (struct ntb_db_cb *)data;
-	int rc;
-
-	rc = db_cb->callback(db_cb->data, db_cb->db_num);
-	if (rc)
-		tasklet_schedule(&db_cb->irq_work);
-	else {
-		struct ntb_device *ndev = db_cb->ndev;
-		unsigned long mask;
-
-		mask = readw(ndev->reg_ofs.ldb_mask);
-		clear_bit(db_cb->db_num * ndev->bits_per_vector, &mask);
-		writew(mask, ndev->reg_ofs.ldb_mask);
-	}
-}
-
-/**
- * ntb_register_db_callback() - register a callback for doorbell interrupt
- * @ndev: pointer to ntb_device instance
- * @idx: doorbell index to register callback, zero based
- * @data: pointer to be returned to caller with every callback
- * @func: callback function to register
- *
- * This function registers a callback function for the doorbell interrupt
- * on the primary side. The function will unmask the doorbell as well to
- * allow interrupt.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_register_db_callback(struct ntb_device *ndev, unsigned int idx,
-			     void *data, int (*func)(void *data, int db_num))
-{
-	unsigned long mask;
-
-	if (idx >= ndev->max_cbs || ndev->db_cb[idx].callback) {
-		dev_warn(&ndev->pdev->dev, "Invalid Index.\n");
-		return -EINVAL;
-	}
-
-	ndev->db_cb[idx].callback = func;
-	ndev->db_cb[idx].data = data;
-	ndev->db_cb[idx].ndev = ndev;
-
-	tasklet_init(&ndev->db_cb[idx].irq_work, ntb_irq_work,
-		     (unsigned long) &ndev->db_cb[idx]);
-
-	/* unmask interrupt */
-	mask = readw(ndev->reg_ofs.ldb_mask);
-	clear_bit(idx * ndev->bits_per_vector, &mask);
-	writew(mask, ndev->reg_ofs.ldb_mask);
-
-	return 0;
-}
-
-/**
- * ntb_unregister_db_callback() - unregister a callback for doorbell interrupt
- * @ndev: pointer to ntb_device instance
- * @idx: doorbell index to register callback, zero based
- *
- * This function unregisters a callback function for the doorbell interrupt
- * on the primary side. The function will also mask the said doorbell.
- */
-void ntb_unregister_db_callback(struct ntb_device *ndev, unsigned int idx)
-{
-	unsigned long mask;
-
-	if (idx >= ndev->max_cbs || !ndev->db_cb[idx].callback)
-		return;
-
-	mask = readw(ndev->reg_ofs.ldb_mask);
-	set_bit(idx * ndev->bits_per_vector, &mask);
-	writew(mask, ndev->reg_ofs.ldb_mask);
-
-	tasklet_disable(&ndev->db_cb[idx].irq_work);
-
-	ndev->db_cb[idx].callback = NULL;
-}
-
-/**
- * ntb_find_transport() - find the transport pointer
- * @transport: pointer to pci device
- *
- * Given the pci device pointer, return the transport pointer passed in when
- * the transport attached when it was inited.
- *
- * RETURNS: pointer to transport.
- */
-void *ntb_find_transport(struct pci_dev *pdev)
-{
-	struct ntb_device *ndev = pci_get_drvdata(pdev);
-	return ndev->ntb_transport;
-}
-
-/**
- * ntb_register_transport() - Register NTB transport with NTB HW driver
- * @transport: transport identifier
- *
- * This function allows a transport to reserve the hardware driver for
- * NTB usage.
- *
- * RETURNS: pointer to ntb_device, NULL on error.
- */
-struct ntb_device *ntb_register_transport(struct pci_dev *pdev, void *transport)
-{
-	struct ntb_device *ndev = pci_get_drvdata(pdev);
-
-	if (ndev->ntb_transport)
-		return NULL;
-
-	ndev->ntb_transport = transport;
-	return ndev;
-}
-
-/**
- * ntb_unregister_transport() - Unregister the transport with the NTB HW driver
- * @ndev - ntb_device of the transport to be freed
- *
- * This function unregisters the transport from the HW driver and performs any
- * necessary cleanups.
- */
-void ntb_unregister_transport(struct ntb_device *ndev)
-{
-	int i;
-
-	if (!ndev->ntb_transport)
-		return;
-
-	for (i = 0; i < ndev->max_cbs; i++)
-		ntb_unregister_db_callback(ndev, i);
-
-	ntb_unregister_event_callback(ndev);
-	ndev->ntb_transport = NULL;
-}
-
-/**
- * ntb_write_local_spad() - write to the secondary scratchpad register
- * @ndev: pointer to ntb_device instance
- * @idx: index to the scratchpad register, 0 based
- * @val: the data value to put into the register
- *
- * This function allows writing of a 32bit value to the indexed scratchpad
- * register. This writes over the data mirrored to the local scratchpad register
- * by the remote system.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_write_local_spad(struct ntb_device *ndev, unsigned int idx, u32 val)
-{
-	if (idx >= ndev->limits.max_spads)
-		return -EINVAL;
-
-	dev_dbg(&ndev->pdev->dev, "Writing %x to local scratch pad index %d\n",
-		val, idx);
-	writel(val, ndev->reg_ofs.spad_read + idx * 4);
-
-	return 0;
-}
-
-/**
- * ntb_read_local_spad() - read from the primary scratchpad register
- * @ndev: pointer to ntb_device instance
- * @idx: index to scratchpad register, 0 based
- * @val: pointer to 32bit integer for storing the register value
- *
- * This function allows reading of the 32bit scratchpad register on
- * the primary (internal) side.  This allows the local system to read data
- * written and mirrored to the scratchpad register by the remote system.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_read_local_spad(struct ntb_device *ndev, unsigned int idx, u32 *val)
-{
-	if (idx >= ndev->limits.max_spads)
-		return -EINVAL;
-
-	*val = readl(ndev->reg_ofs.spad_write + idx * 4);
-	dev_dbg(&ndev->pdev->dev,
-		"Reading %x from local scratch pad index %d\n", *val, idx);
-
-	return 0;
-}
-
-/**
- * ntb_write_remote_spad() - write to the secondary scratchpad register
- * @ndev: pointer to ntb_device instance
- * @idx: index to the scratchpad register, 0 based
- * @val: the data value to put into the register
- *
- * This function allows writing of a 32bit value to the indexed scratchpad
- * register. The register resides on the secondary (external) side.  This allows
- * the local system to write data to be mirrored to the remote systems
- * scratchpad register.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_write_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 val)
-{
-	if (idx >= ndev->limits.max_spads)
-		return -EINVAL;
-
-	dev_dbg(&ndev->pdev->dev, "Writing %x to remote scratch pad index %d\n",
-		val, idx);
-	writel(val, ndev->reg_ofs.spad_write + idx * 4);
-
-	return 0;
-}
-
-/**
- * ntb_read_remote_spad() - read from the primary scratchpad register
- * @ndev: pointer to ntb_device instance
- * @idx: index to scratchpad register, 0 based
- * @val: pointer to 32bit integer for storing the register value
- *
- * This function allows reading of the 32bit scratchpad register on
- * the primary (internal) side.  This alloows the local system to read the data
- * it wrote to be mirrored on the remote system.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_read_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 *val)
-{
-	if (idx >= ndev->limits.max_spads)
-		return -EINVAL;
-
-	*val = readl(ndev->reg_ofs.spad_read + idx * 4);
-	dev_dbg(&ndev->pdev->dev,
-		"Reading %x from remote scratch pad index %d\n", *val, idx);
-
-	return 0;
-}
-
-/**
- * ntb_get_mw_base() - get addr for the NTB memory window
- * @ndev: pointer to ntb_device instance
- * @mw: memory window number
- *
- * This function provides the base address of the memory window specified.
- *
- * RETURNS: address, or NULL on error.
- */
-resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned int mw)
-{
-	if (mw >= ntb_max_mw(ndev))
-		return 0;
-
-	return pci_resource_start(ndev->pdev, MW_TO_BAR(mw));
-}
-
-/**
- * ntb_get_mw_vbase() - get virtual addr for the NTB memory window
- * @ndev: pointer to ntb_device instance
- * @mw: memory window number
- *
- * This function provides the base virtual address of the memory window
- * specified.
- *
- * RETURNS: pointer to virtual address, or NULL on error.
- */
-void __iomem *ntb_get_mw_vbase(struct ntb_device *ndev, unsigned int mw)
-{
-	if (mw >= ntb_max_mw(ndev))
-		return NULL;
-
-	return ndev->mw[mw].vbase;
-}
-
-/**
- * ntb_get_mw_size() - return size of NTB memory window
- * @ndev: pointer to ntb_device instance
- * @mw: memory window number
- *
- * This function provides the physical size of the memory window specified
- *
- * RETURNS: the size of the memory window or zero on error
- */
-u64 ntb_get_mw_size(struct ntb_device *ndev, unsigned int mw)
-{
-	if (mw >= ntb_max_mw(ndev))
-		return 0;
-
-	return ndev->mw[mw].bar_sz;
-}
-
-/**
- * ntb_set_mw_addr - set the memory window address
- * @ndev: pointer to ntb_device instance
- * @mw: memory window number
- * @addr: base address for data
- *
- * This function sets the base physical address of the memory window.  This
- * memory address is where data from the remote system will be transfered into
- * or out of depending on how the transport is configured.
- */
-void ntb_set_mw_addr(struct ntb_device *ndev, unsigned int mw, u64 addr)
-{
-	if (mw >= ntb_max_mw(ndev))
-		return;
-
-	dev_dbg(&ndev->pdev->dev, "Writing addr %Lx to BAR %d\n", addr,
-		MW_TO_BAR(mw));
-
-	ndev->mw[mw].phys_addr = addr;
-
-	switch (MW_TO_BAR(mw)) {
-	case NTB_BAR_23:
-		writeq(addr, ndev->reg_ofs.bar2_xlat);
-		break;
-	case NTB_BAR_4:
-		if (ndev->split_bar)
-			writel(addr, ndev->reg_ofs.bar4_xlat);
-		else
-			writeq(addr, ndev->reg_ofs.bar4_xlat);
-		break;
-	case NTB_BAR_5:
-		writel(addr, ndev->reg_ofs.bar5_xlat);
-		break;
-	}
-}
-
-/**
- * ntb_ring_doorbell() - Set the doorbell on the secondary/external side
- * @ndev: pointer to ntb_device instance
- * @db: doorbell to ring
- *
- * This function allows triggering of a doorbell on the secondary/external
- * side that will initiate an interrupt on the remote host
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-void ntb_ring_doorbell(struct ntb_device *ndev, unsigned int db)
-{
-	dev_dbg(&ndev->pdev->dev, "%s: ringing doorbell %d\n", __func__, db);
-
-	if (ndev->hw_type == BWD_HW)
-		writeq((u64) 1 << db, ndev->reg_ofs.rdb);
-	else
-		writew(((1 << ndev->bits_per_vector) - 1) <<
-		       (db * ndev->bits_per_vector), ndev->reg_ofs.rdb);
-}
-
-static void bwd_recover_link(struct ntb_device *ndev)
-{
-	u32 status;
-
-	/* Driver resets the NTB ModPhy lanes - magic! */
-	writeb(0xe0, ndev->reg_base + BWD_MODPHY_PCSREG6);
-	writeb(0x40, ndev->reg_base + BWD_MODPHY_PCSREG4);
-	writeb(0x60, ndev->reg_base + BWD_MODPHY_PCSREG4);
-	writeb(0x60, ndev->reg_base + BWD_MODPHY_PCSREG6);
-
-	/* Driver waits 100ms to allow the NTB ModPhy to settle */
-	msleep(100);
-
-	/* Clear AER Errors, write to clear */
-	status = readl(ndev->reg_base + BWD_ERRCORSTS_OFFSET);
-	dev_dbg(&ndev->pdev->dev, "ERRCORSTS = %x\n", status);
-	status &= PCI_ERR_COR_REP_ROLL;
-	writel(status, ndev->reg_base + BWD_ERRCORSTS_OFFSET);
-
-	/* Clear unexpected electrical idle event in LTSSM, write to clear */
-	status = readl(ndev->reg_base + BWD_LTSSMERRSTS0_OFFSET);
-	dev_dbg(&ndev->pdev->dev, "LTSSMERRSTS0 = %x\n", status);
-	status |= BWD_LTSSMERRSTS0_UNEXPECTEDEI;
-	writel(status, ndev->reg_base + BWD_LTSSMERRSTS0_OFFSET);
-
-	/* Clear DeSkew Buffer error, write to clear */
-	status = readl(ndev->reg_base + BWD_DESKEWSTS_OFFSET);
-	dev_dbg(&ndev->pdev->dev, "DESKEWSTS = %x\n", status);
-	status |= BWD_DESKEWSTS_DBERR;
-	writel(status, ndev->reg_base + BWD_DESKEWSTS_OFFSET);
-
-	status = readl(ndev->reg_base + BWD_IBSTERRRCRVSTS0_OFFSET);
-	dev_dbg(&ndev->pdev->dev, "IBSTERRRCRVSTS0 = %x\n", status);
-	status &= BWD_IBIST_ERR_OFLOW;
-	writel(status, ndev->reg_base + BWD_IBSTERRRCRVSTS0_OFFSET);
-
-	/* Releases the NTB state machine to allow the link to retrain */
-	status = readl(ndev->reg_base + BWD_LTSSMSTATEJMP_OFFSET);
-	dev_dbg(&ndev->pdev->dev, "LTSSMSTATEJMP = %x\n", status);
-	status &= ~BWD_LTSSMSTATEJMP_FORCEDETECT;
-	writel(status, ndev->reg_base + BWD_LTSSMSTATEJMP_OFFSET);
-}
-
-static void ntb_link_event(struct ntb_device *ndev, int link_state)
-{
-	unsigned int event;
-
-	if (ndev->link_status == link_state)
-		return;
-
-	if (link_state == NTB_LINK_UP) {
-		u16 status;
-
-		dev_info(&ndev->pdev->dev, "Link Up\n");
-		ndev->link_status = NTB_LINK_UP;
-		event = NTB_EVENT_HW_LINK_UP;
-
-		if (is_ntb_atom(ndev) ||
-		    ndev->conn_type == NTB_CONN_TRANSPARENT)
-			status = readw(ndev->reg_ofs.lnk_stat);
-		else {
-			int rc = pci_read_config_word(ndev->pdev,
-						      SNB_LINK_STATUS_OFFSET,
-						      &status);
-			if (rc)
-				return;
-		}
-
-		ndev->link_width = (status & NTB_LINK_WIDTH_MASK) >> 4;
-		ndev->link_speed = (status & NTB_LINK_SPEED_MASK);
-		dev_info(&ndev->pdev->dev, "Link Width %d, Link Speed %d\n",
-			 ndev->link_width, ndev->link_speed);
-	} else {
-		dev_info(&ndev->pdev->dev, "Link Down\n");
-		ndev->link_status = NTB_LINK_DOWN;
-		event = NTB_EVENT_HW_LINK_DOWN;
-		/* Don't modify link width/speed, we need it in link recovery */
-	}
-
-	/* notify the upper layer if we have an event change */
-	if (ndev->event_cb)
-		ndev->event_cb(ndev->ntb_transport, event);
-}
-
-static int ntb_link_status(struct ntb_device *ndev)
-{
-	int link_state;
-
-	if (is_ntb_atom(ndev)) {
-		u32 ntb_cntl;
-
-		ntb_cntl = readl(ndev->reg_ofs.lnk_cntl);
-		if (ntb_cntl & BWD_CNTL_LINK_DOWN)
-			link_state = NTB_LINK_DOWN;
-		else
-			link_state = NTB_LINK_UP;
-	} else {
-		u16 status;
-		int rc;
-
-		rc = pci_read_config_word(ndev->pdev, SNB_LINK_STATUS_OFFSET,
-					  &status);
-		if (rc)
-			return rc;
-
-		if (status & NTB_LINK_STATUS_ACTIVE)
-			link_state = NTB_LINK_UP;
-		else
-			link_state = NTB_LINK_DOWN;
-	}
-
-	ntb_link_event(ndev, link_state);
-
-	return 0;
-}
-
-static void bwd_link_recovery(struct work_struct *work)
-{
-	struct ntb_device *ndev = container_of(work, struct ntb_device,
-					       lr_timer.work);
-	u32 status32;
-
-	bwd_recover_link(ndev);
-	/* There is a potential race between the 2 NTB devices recovering at the
-	 * same time.  If the times are the same, the link will not recover and
-	 * the driver will be stuck in this loop forever.  Add a random interval
-	 * to the recovery time to prevent this race.
-	 */
-	msleep(BWD_LINK_RECOVERY_TIME + prandom_u32() % BWD_LINK_RECOVERY_TIME);
-
-	status32 = readl(ndev->reg_base + BWD_LTSSMSTATEJMP_OFFSET);
-	if (status32 & BWD_LTSSMSTATEJMP_FORCEDETECT)
-		goto retry;
-
-	status32 = readl(ndev->reg_base + BWD_IBSTERRRCRVSTS0_OFFSET);
-	if (status32 & BWD_IBIST_ERR_OFLOW)
-		goto retry;
-
-	status32 = readl(ndev->reg_ofs.lnk_cntl);
-	if (!(status32 & BWD_CNTL_LINK_DOWN)) {
-		unsigned char speed, width;
-		u16 status16;
-
-		status16 = readw(ndev->reg_ofs.lnk_stat);
-		width = (status16 & NTB_LINK_WIDTH_MASK) >> 4;
-		speed = (status16 & NTB_LINK_SPEED_MASK);
-		if (ndev->link_width != width || ndev->link_speed != speed)
-			goto retry;
-	}
-
-	schedule_delayed_work(&ndev->hb_timer, NTB_HB_TIMEOUT);
-	return;
-
-retry:
-	schedule_delayed_work(&ndev->lr_timer, NTB_HB_TIMEOUT);
-}
-
-/* BWD doesn't have link status interrupt, poll on that platform */
-static void bwd_link_poll(struct work_struct *work)
-{
-	struct ntb_device *ndev = container_of(work, struct ntb_device,
-					       hb_timer.work);
-	unsigned long ts = jiffies;
-
-	/* If we haven't gotten an interrupt in a while, check the BWD link
-	 * status bit
-	 */
-	if (ts > ndev->last_ts + NTB_HB_TIMEOUT) {
-		int rc = ntb_link_status(ndev);
-		if (rc)
-			dev_err(&ndev->pdev->dev,
-				"Error determining link status\n");
-
-		/* Check to see if a link error is the cause of the link down */
-		if (ndev->link_status == NTB_LINK_DOWN) {
-			u32 status32 = readl(ndev->reg_base +
-					     BWD_LTSSMSTATEJMP_OFFSET);
-			if (status32 & BWD_LTSSMSTATEJMP_FORCEDETECT) {
-				schedule_delayed_work(&ndev->lr_timer, 0);
-				return;
-			}
-		}
-	}
-
-	schedule_delayed_work(&ndev->hb_timer, NTB_HB_TIMEOUT);
-}
-
-static int ntb_xeon_setup(struct ntb_device *ndev)
-{
-	switch (ndev->conn_type) {
-	case NTB_CONN_B2B:
-		ndev->reg_ofs.ldb = ndev->reg_base + SNB_PDOORBELL_OFFSET;
-		ndev->reg_ofs.ldb_mask = ndev->reg_base + SNB_PDBMSK_OFFSET;
-		ndev->reg_ofs.spad_read = ndev->reg_base + SNB_SPAD_OFFSET;
-		ndev->reg_ofs.bar2_xlat = ndev->reg_base + SNB_SBAR2XLAT_OFFSET;
-		ndev->reg_ofs.bar4_xlat = ndev->reg_base + SNB_SBAR4XLAT_OFFSET;
-		if (ndev->split_bar)
-			ndev->reg_ofs.bar5_xlat =
-				ndev->reg_base + SNB_SBAR5XLAT_OFFSET;
-		ndev->limits.max_spads = SNB_MAX_B2B_SPADS;
-
-		/* There is a Xeon hardware errata related to writes to
-		 * SDOORBELL or B2BDOORBELL in conjunction with inbound access
-		 * to NTB MMIO Space, which may hang the system.  To workaround
-		 * this use the second memory window to access the interrupt and
-		 * scratch pad registers on the remote system.
-		 */
-		if (ndev->wa_flags & WA_SNB_ERR) {
-			if (!ndev->mw[ndev->limits.max_mw - 1].bar_sz)
-				return -EINVAL;
-
-			ndev->limits.max_db_bits = SNB_MAX_DB_BITS;
-			ndev->reg_ofs.spad_write =
-				ndev->mw[ndev->limits.max_mw - 1].vbase +
-				SNB_SPAD_OFFSET;
-			ndev->reg_ofs.rdb =
-				ndev->mw[ndev->limits.max_mw - 1].vbase +
-				SNB_PDOORBELL_OFFSET;
-
-			/* Set the Limit register to 4k, the minimum size, to
-			 * prevent an illegal access
-			 */
-			writeq(ndev->mw[1].bar_sz + 0x1000, ndev->reg_base +
-			       SNB_PBAR4LMT_OFFSET);
-			/* HW errata on the Limit registers.  They can only be
-			 * written when the base register is 4GB aligned and
-			 * < 32bit.  This should already be the case based on
-			 * the driver defaults, but write the Limit registers
-			 * first just in case.
-			 */
-
-			ndev->limits.max_mw = SNB_ERRATA_MAX_MW;
-		} else {
-			/* HW Errata on bit 14 of b2bdoorbell register.  Writes
-			 * will not be mirrored to the remote system.  Shrink
-			 * the number of bits by one, since bit 14 is the last
-			 * bit.
-			 */
-			ndev->limits.max_db_bits = SNB_MAX_DB_BITS - 1;
-			ndev->reg_ofs.spad_write = ndev->reg_base +
-						   SNB_B2B_SPAD_OFFSET;
-			ndev->reg_ofs.rdb = ndev->reg_base +
-					    SNB_B2B_DOORBELL_OFFSET;
-
-			/* Disable the Limit register, just incase it is set to
-			 * something silly. A 64bit write should handle it
-			 * regardless of whether it has a split BAR or not.
-			 */
-			writeq(0, ndev->reg_base + SNB_PBAR4LMT_OFFSET);
-			/* HW errata on the Limit registers.  They can only be
-			 * written when the base register is 4GB aligned and
-			 * < 32bit.  This should already be the case based on
-			 * the driver defaults, but write the Limit registers
-			 * first just in case.
-			 */
-			if (ndev->split_bar)
-				ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
-			else
-				ndev->limits.max_mw = SNB_MAX_MW;
-		}
-
-		/* The Xeon errata workaround requires setting SBAR Base
-		 * addresses to known values, so that the PBAR XLAT can be
-		 * pointed at SBAR0 of the remote system.
-		 */
-		if (ndev->dev_type == NTB_DEV_USD) {
-			writeq(SNB_MBAR23_DSD_ADDR, ndev->reg_base +
-			       SNB_PBAR2XLAT_OFFSET);
-			if (ndev->wa_flags & WA_SNB_ERR)
-				writeq(SNB_MBAR01_DSD_ADDR, ndev->reg_base +
-				       SNB_PBAR4XLAT_OFFSET);
-			else {
-				if (ndev->split_bar) {
-					writel(SNB_MBAR4_DSD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR4XLAT_OFFSET);
-					writel(SNB_MBAR5_DSD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR5XLAT_OFFSET);
-				} else
-					writeq(SNB_MBAR4_DSD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR4XLAT_OFFSET);
-
-				/* B2B_XLAT_OFFSET is a 64bit register, but can
-				 * only take 32bit writes
-				 */
-				writel(SNB_MBAR01_DSD_ADDR & 0xffffffff,
-				       ndev->reg_base + SNB_B2B_XLAT_OFFSETL);
-				writel(SNB_MBAR01_DSD_ADDR >> 32,
-				       ndev->reg_base + SNB_B2B_XLAT_OFFSETU);
-			}
-
-			writeq(SNB_MBAR01_USD_ADDR, ndev->reg_base +
-			       SNB_SBAR0BASE_OFFSET);
-			writeq(SNB_MBAR23_USD_ADDR, ndev->reg_base +
-			       SNB_SBAR2BASE_OFFSET);
-			if (ndev->split_bar) {
-				writel(SNB_MBAR4_USD_ADDR, ndev->reg_base +
-				       SNB_SBAR4BASE_OFFSET);
-				writel(SNB_MBAR5_USD_ADDR, ndev->reg_base +
-				       SNB_SBAR5BASE_OFFSET);
-			} else
-				writeq(SNB_MBAR4_USD_ADDR, ndev->reg_base +
-				       SNB_SBAR4BASE_OFFSET);
-		} else {
-			writeq(SNB_MBAR23_USD_ADDR, ndev->reg_base +
-			       SNB_PBAR2XLAT_OFFSET);
-			if (ndev->wa_flags & WA_SNB_ERR)
-				writeq(SNB_MBAR01_USD_ADDR, ndev->reg_base +
-				       SNB_PBAR4XLAT_OFFSET);
-			else {
-				if (ndev->split_bar) {
-					writel(SNB_MBAR4_USD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR4XLAT_OFFSET);
-					writel(SNB_MBAR5_USD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR5XLAT_OFFSET);
-				} else
-					writeq(SNB_MBAR4_USD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR4XLAT_OFFSET);
-
-				/*
-				 * B2B_XLAT_OFFSET is a 64bit register, but can
-				 * only take 32bit writes
-				 */
-				writel(SNB_MBAR01_USD_ADDR & 0xffffffff,
-				       ndev->reg_base + SNB_B2B_XLAT_OFFSETL);
-				writel(SNB_MBAR01_USD_ADDR >> 32,
-				       ndev->reg_base + SNB_B2B_XLAT_OFFSETU);
-			}
-			writeq(SNB_MBAR01_DSD_ADDR, ndev->reg_base +
-			       SNB_SBAR0BASE_OFFSET);
-			writeq(SNB_MBAR23_DSD_ADDR, ndev->reg_base +
-			       SNB_SBAR2BASE_OFFSET);
-			if (ndev->split_bar) {
-				writel(SNB_MBAR4_DSD_ADDR, ndev->reg_base +
-				       SNB_SBAR4BASE_OFFSET);
-				writel(SNB_MBAR5_DSD_ADDR, ndev->reg_base +
-				       SNB_SBAR5BASE_OFFSET);
-			} else
-				writeq(SNB_MBAR4_DSD_ADDR, ndev->reg_base +
-				       SNB_SBAR4BASE_OFFSET);
-
-		}
-		break;
-	case NTB_CONN_RP:
-		if (ndev->wa_flags & WA_SNB_ERR) {
-			dev_err(&ndev->pdev->dev,
-				"NTB-RP disabled due to hardware errata.\n");
-			return -EINVAL;
-		}
-
-		/* Scratch pads need to have exclusive access from the primary
-		 * or secondary side.  Halve the num spads so that each side can
-		 * have an equal amount.
-		 */
-		ndev->limits.max_spads = SNB_MAX_COMPAT_SPADS / 2;
-		ndev->limits.max_db_bits = SNB_MAX_DB_BITS;
-		/* Note: The SDOORBELL is the cause of the errata.  You REALLY
-		 * don't want to touch it.
-		 */
-		ndev->reg_ofs.rdb = ndev->reg_base + SNB_SDOORBELL_OFFSET;
-		ndev->reg_ofs.ldb = ndev->reg_base + SNB_PDOORBELL_OFFSET;
-		ndev->reg_ofs.ldb_mask = ndev->reg_base + SNB_PDBMSK_OFFSET;
-		/* Offset the start of the spads to correspond to whether it is
-		 * primary or secondary
-		 */
-		ndev->reg_ofs.spad_write = ndev->reg_base + SNB_SPAD_OFFSET +
-					   ndev->limits.max_spads * 4;
-		ndev->reg_ofs.spad_read = ndev->reg_base + SNB_SPAD_OFFSET;
-		ndev->reg_ofs.bar2_xlat = ndev->reg_base + SNB_SBAR2XLAT_OFFSET;
-		ndev->reg_ofs.bar4_xlat = ndev->reg_base + SNB_SBAR4XLAT_OFFSET;
-		if (ndev->split_bar) {
-			ndev->reg_ofs.bar5_xlat =
-				ndev->reg_base + SNB_SBAR5XLAT_OFFSET;
-			ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
-		} else
-			ndev->limits.max_mw = SNB_MAX_MW;
-		break;
-	case NTB_CONN_TRANSPARENT:
-		if (ndev->wa_flags & WA_SNB_ERR) {
-			dev_err(&ndev->pdev->dev,
-				"NTB-TRANSPARENT disabled due to hardware errata.\n");
-			return -EINVAL;
-		}
-
-		/* Scratch pads need to have exclusive access from the primary
-		 * or secondary side.  Halve the num spads so that each side can
-		 * have an equal amount.
-		 */
-		ndev->limits.max_spads = SNB_MAX_COMPAT_SPADS / 2;
-		ndev->limits.max_db_bits = SNB_MAX_DB_BITS;
-		ndev->reg_ofs.rdb = ndev->reg_base + SNB_PDOORBELL_OFFSET;
-		ndev->reg_ofs.ldb = ndev->reg_base + SNB_SDOORBELL_OFFSET;
-		ndev->reg_ofs.ldb_mask = ndev->reg_base + SNB_SDBMSK_OFFSET;
-		ndev->reg_ofs.spad_write = ndev->reg_base + SNB_SPAD_OFFSET;
-		/* Offset the start of the spads to correspond to whether it is
-		 * primary or secondary
-		 */
-		ndev->reg_ofs.spad_read = ndev->reg_base + SNB_SPAD_OFFSET +
-					  ndev->limits.max_spads * 4;
-		ndev->reg_ofs.bar2_xlat = ndev->reg_base + SNB_PBAR2XLAT_OFFSET;
-		ndev->reg_ofs.bar4_xlat = ndev->reg_base + SNB_PBAR4XLAT_OFFSET;
-
-		if (ndev->split_bar) {
-			ndev->reg_ofs.bar5_xlat =
-				ndev->reg_base + SNB_PBAR5XLAT_OFFSET;
-			ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
-		} else
-			ndev->limits.max_mw = SNB_MAX_MW;
-		break;
-	default:
-		/*
-		 * we should never hit this. the detect function should've
-		 * take cared of everything.
-		 */
-		return -EINVAL;
-	}
-
-	ndev->reg_ofs.lnk_cntl = ndev->reg_base + SNB_NTBCNTL_OFFSET;
-	ndev->reg_ofs.lnk_stat = ndev->reg_base + SNB_SLINK_STATUS_OFFSET;
-	ndev->reg_ofs.spci_cmd = ndev->reg_base + SNB_PCICMD_OFFSET;
-
-	ndev->limits.msix_cnt = SNB_MSIX_CNT;
-	ndev->bits_per_vector = SNB_DB_BITS_PER_VEC;
-
-	return 0;
-}
-
-static int ntb_bwd_setup(struct ntb_device *ndev)
-{
-	int rc;
-	u32 val;
-
-	ndev->hw_type = BWD_HW;
-
-	rc = pci_read_config_dword(ndev->pdev, NTB_PPD_OFFSET, &val);
-	if (rc)
-		return rc;
-
-	switch ((val & BWD_PPD_CONN_TYPE) >> 8) {
-	case NTB_CONN_B2B:
-		ndev->conn_type = NTB_CONN_B2B;
-		break;
-	case NTB_CONN_RP:
-	default:
-		dev_err(&ndev->pdev->dev, "Unsupported NTB configuration\n");
-		return -EINVAL;
-	}
-
-	if (val & BWD_PPD_DEV_TYPE)
-		ndev->dev_type = NTB_DEV_DSD;
-	else
-		ndev->dev_type = NTB_DEV_USD;
-
-	/* Initiate PCI-E link training */
-	rc = pci_write_config_dword(ndev->pdev, NTB_PPD_OFFSET,
-				    val | BWD_PPD_INIT_LINK);
-	if (rc)
-		return rc;
-
-	ndev->reg_ofs.ldb = ndev->reg_base + BWD_PDOORBELL_OFFSET;
-	ndev->reg_ofs.ldb_mask = ndev->reg_base + BWD_PDBMSK_OFFSET;
-	ndev->reg_ofs.rdb = ndev->reg_base + BWD_B2B_DOORBELL_OFFSET;
-	ndev->reg_ofs.bar2_xlat = ndev->reg_base + BWD_SBAR2XLAT_OFFSET;
-	ndev->reg_ofs.bar4_xlat = ndev->reg_base + BWD_SBAR4XLAT_OFFSET;
-	ndev->reg_ofs.lnk_cntl = ndev->reg_base + BWD_NTBCNTL_OFFSET;
-	ndev->reg_ofs.lnk_stat = ndev->reg_base + BWD_LINK_STATUS_OFFSET;
-	ndev->reg_ofs.spad_read = ndev->reg_base + BWD_SPAD_OFFSET;
-	ndev->reg_ofs.spad_write = ndev->reg_base + BWD_B2B_SPAD_OFFSET;
-	ndev->reg_ofs.spci_cmd = ndev->reg_base + BWD_PCICMD_OFFSET;
-	ndev->limits.max_mw = BWD_MAX_MW;
-	ndev->limits.max_spads = BWD_MAX_SPADS;
-	ndev->limits.max_db_bits = BWD_MAX_DB_BITS;
-	ndev->limits.msix_cnt = BWD_MSIX_CNT;
-	ndev->bits_per_vector = BWD_DB_BITS_PER_VEC;
-
-	/* Since bwd doesn't have a link interrupt, setup a poll timer */
-	INIT_DELAYED_WORK(&ndev->hb_timer, bwd_link_poll);
-	INIT_DELAYED_WORK(&ndev->lr_timer, bwd_link_recovery);
-	schedule_delayed_work(&ndev->hb_timer, NTB_HB_TIMEOUT);
-
-	return 0;
-}
-
-static int ntb_device_setup(struct ntb_device *ndev)
-{
-	int rc;
-
-	if (is_ntb_xeon(ndev))
-		rc = ntb_xeon_setup(ndev);
-	else if (is_ntb_atom(ndev))
-		rc = ntb_bwd_setup(ndev);
-	else
-		rc = -ENODEV;
-
-	if (rc)
-		return rc;
-
-	if (ndev->conn_type == NTB_CONN_B2B)
-		/* Enable Bus Master and Memory Space on the secondary side */
-		writew(PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER,
-		       ndev->reg_ofs.spci_cmd);
-
-	return 0;
-}
-
-static void ntb_device_free(struct ntb_device *ndev)
-{
-	if (is_ntb_atom(ndev)) {
-		cancel_delayed_work_sync(&ndev->hb_timer);
-		cancel_delayed_work_sync(&ndev->lr_timer);
-	}
-}
-
-static irqreturn_t bwd_callback_msix_irq(int irq, void *data)
-{
-	struct ntb_db_cb *db_cb = data;
-	struct ntb_device *ndev = db_cb->ndev;
-	unsigned long mask;
-
-	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for DB %d\n", irq,
-		db_cb->db_num);
-
-	mask = readw(ndev->reg_ofs.ldb_mask);
-	set_bit(db_cb->db_num * ndev->bits_per_vector, &mask);
-	writew(mask, ndev->reg_ofs.ldb_mask);
-
-	tasklet_schedule(&db_cb->irq_work);
-
-	/* No need to check for the specific HB irq, any interrupt means
-	 * we're connected.
-	 */
-	ndev->last_ts = jiffies;
-
-	writeq((u64) 1 << db_cb->db_num, ndev->reg_ofs.ldb);
-
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t xeon_callback_msix_irq(int irq, void *data)
-{
-	struct ntb_db_cb *db_cb = data;
-	struct ntb_device *ndev = db_cb->ndev;
-	unsigned long mask;
-
-	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for DB %d\n", irq,
-		db_cb->db_num);
-
-	mask = readw(ndev->reg_ofs.ldb_mask);
-	set_bit(db_cb->db_num * ndev->bits_per_vector, &mask);
-	writew(mask, ndev->reg_ofs.ldb_mask);
-
-	tasklet_schedule(&db_cb->irq_work);
-
-	/* On Sandybridge, there are 16 bits in the interrupt register
-	 * but only 4 vectors.  So, 5 bits are assigned to the first 3
-	 * vectors, with the 4th having a single bit for link
-	 * interrupts.
-	 */
-	writew(((1 << ndev->bits_per_vector) - 1) <<
-	       (db_cb->db_num * ndev->bits_per_vector), ndev->reg_ofs.ldb);
-
-	return IRQ_HANDLED;
-}
-
-/* Since we do not have a HW doorbell in BWD, this is only used in JF/JT */
-static irqreturn_t xeon_event_msix_irq(int irq, void *dev)
-{
-	struct ntb_device *ndev = dev;
-	int rc;
-
-	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for Events\n", irq);
-
-	rc = ntb_link_status(ndev);
-	if (rc)
-		dev_err(&ndev->pdev->dev, "Error determining link status\n");
-
-	/* bit 15 is always the link bit */
-	writew(1 << SNB_LINK_DB, ndev->reg_ofs.ldb);
-
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t ntb_interrupt(int irq, void *dev)
-{
-	struct ntb_device *ndev = dev;
-	unsigned int i = 0;
-
-	if (is_ntb_atom(ndev)) {
-		u64 ldb = readq(ndev->reg_ofs.ldb);
-
-		dev_dbg(&ndev->pdev->dev, "irq %d - ldb = %Lx\n", irq, ldb);
-
-		while (ldb) {
-			i = __ffs(ldb);
-			ldb &= ldb - 1;
-			bwd_callback_msix_irq(irq, &ndev->db_cb[i]);
-		}
-	} else {
-		u16 ldb = readw(ndev->reg_ofs.ldb);
-
-		dev_dbg(&ndev->pdev->dev, "irq %d - ldb = %x\n", irq, ldb);
-
-		if (ldb & SNB_DB_HW_LINK) {
-			xeon_event_msix_irq(irq, dev);
-			ldb &= ~SNB_DB_HW_LINK;
-		}
-
-		while (ldb) {
-			i = __ffs(ldb);
-			ldb &= ldb - 1;
-			xeon_callback_msix_irq(irq, &ndev->db_cb[i]);
-		}
-	}
-
-	return IRQ_HANDLED;
-}
-
-static int ntb_setup_snb_msix(struct ntb_device *ndev, int msix_entries)
-{
-	struct pci_dev *pdev = ndev->pdev;
-	struct msix_entry *msix;
-	int rc, i;
-
-	if (msix_entries < ndev->limits.msix_cnt)
-		return -ENOSPC;
-
-	rc = pci_enable_msix_exact(pdev, ndev->msix_entries, msix_entries);
-	if (rc < 0)
-		return rc;
-
-	for (i = 0; i < msix_entries; i++) {
-		msix = &ndev->msix_entries[i];
-		WARN_ON(!msix->vector);
-
-		if (i == msix_entries - 1) {
-			rc = request_irq(msix->vector,
-					 xeon_event_msix_irq, 0,
-					 "ntb-event-msix", ndev);
-			if (rc)
-				goto err;
-		} else {
-			rc = request_irq(msix->vector,
-					 xeon_callback_msix_irq, 0,
-					 "ntb-callback-msix",
-					 &ndev->db_cb[i]);
-			if (rc)
-				goto err;
-		}
-	}
-
-	ndev->num_msix = msix_entries;
-	ndev->max_cbs = msix_entries - 1;
-
-	return 0;
-
-err:
-	while (--i >= 0) {
-		/* Code never reaches here for entry nr 'ndev->num_msix - 1' */
-		msix = &ndev->msix_entries[i];
-		free_irq(msix->vector, &ndev->db_cb[i]);
-	}
-
-	pci_disable_msix(pdev);
-	ndev->num_msix = 0;
-
-	return rc;
-}
-
-static int ntb_setup_bwd_msix(struct ntb_device *ndev, int msix_entries)
-{
-	struct pci_dev *pdev = ndev->pdev;
-	struct msix_entry *msix;
-	int rc, i;
-
-	msix_entries = pci_enable_msix_range(pdev, ndev->msix_entries,
-					     1, msix_entries);
-	if (msix_entries < 0)
-		return msix_entries;
-
-	for (i = 0; i < msix_entries; i++) {
-		msix = &ndev->msix_entries[i];
-		WARN_ON(!msix->vector);
-
-		rc = request_irq(msix->vector, bwd_callback_msix_irq, 0,
-				 "ntb-callback-msix", &ndev->db_cb[i]);
-		if (rc)
-			goto err;
-	}
-
-	ndev->num_msix = msix_entries;
-	ndev->max_cbs = msix_entries;
-
-	return 0;
-
-err:
-	while (--i >= 0)
-		free_irq(msix->vector, &ndev->db_cb[i]);
-
-	pci_disable_msix(pdev);
-	ndev->num_msix = 0;
-
-	return rc;
-}
-
-static int ntb_setup_msix(struct ntb_device *ndev)
-{
-	struct pci_dev *pdev = ndev->pdev;
-	int msix_entries;
-	int rc, i;
-
-	msix_entries = pci_msix_vec_count(pdev);
-	if (msix_entries < 0) {
-		rc = msix_entries;
-		goto err;
-	} else if (msix_entries > ndev->limits.msix_cnt) {
-		rc = -EINVAL;
-		goto err;
-	}
-
-	ndev->msix_entries = kmalloc(sizeof(struct msix_entry) * msix_entries,
-				     GFP_KERNEL);
-	if (!ndev->msix_entries) {
-		rc = -ENOMEM;
-		goto err;
-	}
-
-	for (i = 0; i < msix_entries; i++)
-		ndev->msix_entries[i].entry = i;
-
-	if (is_ntb_atom(ndev))
-		rc = ntb_setup_bwd_msix(ndev, msix_entries);
-	else
-		rc = ntb_setup_snb_msix(ndev, msix_entries);
-	if (rc)
-		goto err1;
-
-	return 0;
-
-err1:
-	kfree(ndev->msix_entries);
-err:
-	dev_err(&pdev->dev, "Error allocating MSI-X interrupt\n");
-	return rc;
-}
-
-static int ntb_setup_msi(struct ntb_device *ndev)
-{
-	struct pci_dev *pdev = ndev->pdev;
-	int rc;
-
-	rc = pci_enable_msi(pdev);
-	if (rc)
-		return rc;
-
-	rc = request_irq(pdev->irq, ntb_interrupt, 0, "ntb-msi", ndev);
-	if (rc) {
-		pci_disable_msi(pdev);
-		dev_err(&pdev->dev, "Error allocating MSI interrupt\n");
-		return rc;
-	}
-
-	return 0;
-}
-
-static int ntb_setup_intx(struct ntb_device *ndev)
-{
-	struct pci_dev *pdev = ndev->pdev;
-	int rc;
-
-	/* Verify intx is enabled */
-	pci_intx(pdev, 1);
-
-	rc = request_irq(pdev->irq, ntb_interrupt, IRQF_SHARED, "ntb-intx",
-			 ndev);
-	if (rc)
-		return rc;
-
-	return 0;
-}
-
-static int ntb_setup_interrupts(struct ntb_device *ndev)
-{
-	int rc;
-
-	/* On BWD, disable all interrupts.  On SNB, disable all but Link
-	 * Interrupt.  The rest will be unmasked as callbacks are registered.
-	 */
-	if (is_ntb_atom(ndev))
-		writeq(~0, ndev->reg_ofs.ldb_mask);
-	else {
-		u16 var = 1 << SNB_LINK_DB;
-		writew(~var, ndev->reg_ofs.ldb_mask);
-	}
-
-	rc = ntb_setup_msix(ndev);
-	if (!rc)
-		goto done;
-
-	ndev->bits_per_vector = 1;
-	ndev->max_cbs = ndev->limits.max_db_bits;
-
-	rc = ntb_setup_msi(ndev);
-	if (!rc)
-		goto done;
-
-	rc = ntb_setup_intx(ndev);
-	if (rc) {
-		dev_err(&ndev->pdev->dev, "no usable interrupts\n");
-		return rc;
-	}
-
-done:
-	return 0;
-}
-
-static void ntb_free_interrupts(struct ntb_device *ndev)
-{
-	struct pci_dev *pdev = ndev->pdev;
-
-	/* mask interrupts */
-	if (is_ntb_atom(ndev))
-		writeq(~0, ndev->reg_ofs.ldb_mask);
-	else
-		writew(~0, ndev->reg_ofs.ldb_mask);
-
-	if (ndev->num_msix) {
-		struct msix_entry *msix;
-		u32 i;
-
-		for (i = 0; i < ndev->num_msix; i++) {
-			msix = &ndev->msix_entries[i];
-			if (is_ntb_xeon(ndev) && i == ndev->num_msix - 1)
-				free_irq(msix->vector, ndev);
-			else
-				free_irq(msix->vector, &ndev->db_cb[i]);
-		}
-		pci_disable_msix(pdev);
-		kfree(ndev->msix_entries);
-	} else {
-		free_irq(pdev->irq, ndev);
-
-		if (pci_dev_msi_enabled(pdev))
-			pci_disable_msi(pdev);
-	}
-}
-
-static int ntb_create_callbacks(struct ntb_device *ndev)
-{
-	int i;
-
-	/* Chicken-egg issue.  We won't know how many callbacks are necessary
-	 * until we see how many MSI-X vectors we get, but these pointers need
-	 * to be passed into the MSI-X register function.  So, we allocate the
-	 * max, knowing that they might not all be used, to work around this.
-	 */
-	ndev->db_cb = kcalloc(ndev->limits.max_db_bits,
-			      sizeof(struct ntb_db_cb),
-			      GFP_KERNEL);
-	if (!ndev->db_cb)
-		return -ENOMEM;
-
-	for (i = 0; i < ndev->limits.max_db_bits; i++) {
-		ndev->db_cb[i].db_num = i;
-		ndev->db_cb[i].ndev = ndev;
-	}
-
-	return 0;
-}
-
-static void ntb_free_callbacks(struct ntb_device *ndev)
-{
-	int i;
-
-	for (i = 0; i < ndev->limits.max_db_bits; i++)
-		ntb_unregister_db_callback(ndev, i);
-
-	kfree(ndev->db_cb);
-}
-
-static ssize_t ntb_debugfs_read(struct file *filp, char __user *ubuf,
-				size_t count, loff_t *offp)
-{
-	struct ntb_device *ndev;
-	char *buf;
-	ssize_t ret, offset, out_count;
-
-	out_count = 500;
-
-	buf = kmalloc(out_count, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	ndev = filp->private_data;
-	offset = 0;
-	offset += snprintf(buf + offset, out_count - offset,
-			   "NTB Device Information:\n");
-	offset += snprintf(buf + offset, out_count - offset,
-			   "Connection Type - \t\t%s\n",
-			   ndev->conn_type == NTB_CONN_TRANSPARENT ?
-			   "Transparent" : (ndev->conn_type == NTB_CONN_B2B) ?
-			   "Back to back" : "Root Port");
-	offset += snprintf(buf + offset, out_count - offset,
-			   "Device Type - \t\t\t%s\n",
-			   ndev->dev_type == NTB_DEV_USD ?
-			   "DSD/USP" : "USD/DSP");
-	offset += snprintf(buf + offset, out_count - offset,
-			   "Max Number of Callbacks - \t%u\n",
-			   ntb_max_cbs(ndev));
-	offset += snprintf(buf + offset, out_count - offset,
-			   "Link Status - \t\t\t%s\n",
-			   ntb_hw_link_status(ndev) ? "Up" : "Down");
-	if (ntb_hw_link_status(ndev)) {
-		offset += snprintf(buf + offset, out_count - offset,
-				   "Link Speed - \t\t\tPCI-E Gen %u\n",
-				   ndev->link_speed);
-		offset += snprintf(buf + offset, out_count - offset,
-				   "Link Width - \t\t\tx%u\n",
-				   ndev->link_width);
-	}
-
-	if (is_ntb_xeon(ndev)) {
-		u32 status32;
-		u16 status16;
-		int rc;
-
-		offset += snprintf(buf + offset, out_count - offset,
-				   "\nNTB Device Statistics:\n");
-		offset += snprintf(buf + offset, out_count - offset,
-				   "Upstream Memory Miss - \t%u\n",
-				   readw(ndev->reg_base +
-					 SNB_USMEMMISS_OFFSET));
-
-		offset += snprintf(buf + offset, out_count - offset,
-				   "\nNTB Hardware Errors:\n");
-
-		rc = pci_read_config_word(ndev->pdev, SNB_DEVSTS_OFFSET,
-					  &status16);
-		if (!rc)
-			offset += snprintf(buf + offset, out_count - offset,
-					   "DEVSTS - \t%#06x\n", status16);
-
-		rc = pci_read_config_word(ndev->pdev, SNB_LINK_STATUS_OFFSET,
-					  &status16);
-		if (!rc)
-			offset += snprintf(buf + offset, out_count - offset,
-					   "LNKSTS - \t%#06x\n", status16);
-
-		rc = pci_read_config_dword(ndev->pdev, SNB_UNCERRSTS_OFFSET,
-					   &status32);
-		if (!rc)
-			offset += snprintf(buf + offset, out_count - offset,
-					   "UNCERRSTS - \t%#010x\n", status32);
-
-		rc = pci_read_config_dword(ndev->pdev, SNB_CORERRSTS_OFFSET,
-					   &status32);
-		if (!rc)
-			offset += snprintf(buf + offset, out_count - offset,
-					   "CORERRSTS - \t%#010x\n", status32);
-	}
-
-	if (offset > out_count)
-		offset = out_count;
-
-	ret = simple_read_from_buffer(ubuf, count, offp, buf, offset);
-	kfree(buf);
-	return ret;
-}
-
-static const struct file_operations ntb_debugfs_info = {
-	.owner = THIS_MODULE,
-	.open = simple_open,
-	.read = ntb_debugfs_read,
-};
-
-static void ntb_setup_debugfs(struct ntb_device *ndev)
-{
-	if (!debugfs_initialized())
-		return;
-
-	if (!debugfs_dir)
-		debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
-
-	ndev->debugfs_dir = debugfs_create_dir(pci_name(ndev->pdev),
-					       debugfs_dir);
-	if (ndev->debugfs_dir)
-		ndev->debugfs_info = debugfs_create_file("info", S_IRUSR,
-							 ndev->debugfs_dir,
-							 ndev,
-							 &ntb_debugfs_info);
-}
-
-static void ntb_free_debugfs(struct ntb_device *ndev)
-{
-	debugfs_remove_recursive(ndev->debugfs_dir);
-
-	if (debugfs_dir && simple_empty(debugfs_dir)) {
-		debugfs_remove_recursive(debugfs_dir);
-		debugfs_dir = NULL;
-	}
-}
-
-static void ntb_hw_link_up(struct ntb_device *ndev)
-{
-	if (ndev->conn_type == NTB_CONN_TRANSPARENT)
-		ntb_link_event(ndev, NTB_LINK_UP);
-	else {
-		u32 ntb_cntl;
-
-		/* Let's bring the NTB link up */
-		ntb_cntl = readl(ndev->reg_ofs.lnk_cntl);
-		ntb_cntl &= ~(NTB_CNTL_LINK_DISABLE | NTB_CNTL_CFG_LOCK);
-		ntb_cntl |= NTB_CNTL_P2S_BAR23_SNOOP | NTB_CNTL_S2P_BAR23_SNOOP;
-		ntb_cntl |= NTB_CNTL_P2S_BAR4_SNOOP | NTB_CNTL_S2P_BAR4_SNOOP;
-		if (ndev->split_bar)
-			ntb_cntl |= NTB_CNTL_P2S_BAR5_SNOOP |
-				    NTB_CNTL_S2P_BAR5_SNOOP;
-
-		writel(ntb_cntl, ndev->reg_ofs.lnk_cntl);
-	}
-}
-
-static void ntb_hw_link_down(struct ntb_device *ndev)
-{
-	u32 ntb_cntl;
-
-	if (ndev->conn_type == NTB_CONN_TRANSPARENT) {
-		ntb_link_event(ndev, NTB_LINK_DOWN);
-		return;
-	}
-
-	/* Bring NTB link down */
-	ntb_cntl = readl(ndev->reg_ofs.lnk_cntl);
-	ntb_cntl &= ~(NTB_CNTL_P2S_BAR23_SNOOP | NTB_CNTL_S2P_BAR23_SNOOP);
-	ntb_cntl &= ~(NTB_CNTL_P2S_BAR4_SNOOP | NTB_CNTL_S2P_BAR4_SNOOP);
-	if (ndev->split_bar)
-		ntb_cntl &= ~(NTB_CNTL_P2S_BAR5_SNOOP |
-			      NTB_CNTL_S2P_BAR5_SNOOP);
-	ntb_cntl |= NTB_CNTL_LINK_DISABLE | NTB_CNTL_CFG_LOCK;
-	writel(ntb_cntl, ndev->reg_ofs.lnk_cntl);
-}
-
-static void ntb_max_mw_detect(struct ntb_device *ndev)
-{
-	if (ndev->split_bar)
-		ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
-	else
-		ndev->limits.max_mw = SNB_MAX_MW;
-}
-
-static int ntb_xeon_detect(struct ntb_device *ndev)
-{
-	int rc, bars_mask;
-	u32 bars;
-	u8 ppd;
-
-	ndev->hw_type = SNB_HW;
-
-	rc = pci_read_config_byte(ndev->pdev, NTB_PPD_OFFSET, &ppd);
-	if (rc)
-		return -EIO;
-
-	if (ppd & SNB_PPD_DEV_TYPE)
-		ndev->dev_type = NTB_DEV_USD;
-	else
-		ndev->dev_type = NTB_DEV_DSD;
-
-	ndev->split_bar = (ppd & SNB_PPD_SPLIT_BAR) ? 1 : 0;
-
-	switch (ppd & SNB_PPD_CONN_TYPE) {
-	case NTB_CONN_B2B:
-		dev_info(&ndev->pdev->dev, "Conn Type = B2B\n");
-		ndev->conn_type = NTB_CONN_B2B;
-		break;
-	case NTB_CONN_RP:
-		dev_info(&ndev->pdev->dev, "Conn Type = RP\n");
-		ndev->conn_type = NTB_CONN_RP;
-		break;
-	case NTB_CONN_TRANSPARENT:
-		dev_info(&ndev->pdev->dev, "Conn Type = TRANSPARENT\n");
-		ndev->conn_type = NTB_CONN_TRANSPARENT;
-		/*
-		 * This mode is default to USD/DSP. HW does not report
-		 * properly in transparent mode as it has no knowledge of
-		 * NTB. We will just force correct here.
-		 */
-		ndev->dev_type = NTB_DEV_USD;
-
-		/*
-		 * This is a way for transparent BAR to figure out if we
-		 * are doing split BAR or not. There is no way for the hw
-		 * on the transparent side to know and set the PPD.
-		 */
-		bars_mask = pci_select_bars(ndev->pdev, IORESOURCE_MEM);
-		bars = hweight32(bars_mask);
-		if (bars == (HSX_SPLITBAR_MAX_MW + 1))
-			ndev->split_bar = 1;
-
-		break;
-	default:
-		dev_err(&ndev->pdev->dev, "Unknown PPD %x\n", ppd);
-		return -ENODEV;
-	}
-
-	ntb_max_mw_detect(ndev);
-
-	return 0;
-}
-
-static int ntb_atom_detect(struct ntb_device *ndev)
-{
-	int rc;
-	u32 ppd;
-
-	ndev->hw_type = BWD_HW;
-	ndev->limits.max_mw = BWD_MAX_MW;
-
-	rc = pci_read_config_dword(ndev->pdev, NTB_PPD_OFFSET, &ppd);
-	if (rc)
-		return rc;
-
-	switch ((ppd & BWD_PPD_CONN_TYPE) >> 8) {
-	case NTB_CONN_B2B:
-		dev_info(&ndev->pdev->dev, "Conn Type = B2B\n");
-		ndev->conn_type = NTB_CONN_B2B;
-		break;
-	case NTB_CONN_RP:
-	default:
-		dev_err(&ndev->pdev->dev, "Unsupported NTB configuration\n");
-		return -EINVAL;
-	}
-
-	if (ppd & BWD_PPD_DEV_TYPE)
-		ndev->dev_type = NTB_DEV_DSD;
-	else
-		ndev->dev_type = NTB_DEV_USD;
-
-	return 0;
-}
-
-static int ntb_device_detect(struct ntb_device *ndev)
-{
-	int rc;
-
-	if (is_ntb_xeon(ndev))
-		rc = ntb_xeon_detect(ndev);
-	else if (is_ntb_atom(ndev))
-		rc = ntb_atom_detect(ndev);
-	else
-		rc = -ENODEV;
-
-	dev_info(&ndev->pdev->dev, "Device Type = %s\n",
-		 ndev->dev_type == NTB_DEV_USD ? "USD/DSP" : "DSD/USP");
-
-	return 0;
-}
-
-static int ntb_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
-{
-	struct ntb_device *ndev;
-	int rc, i;
-
-	ndev = kzalloc(sizeof(struct ntb_device), GFP_KERNEL);
-	if (!ndev)
-		return -ENOMEM;
-
-	ndev->pdev = pdev;
-
-	ntb_set_errata_flags(ndev);
-
-	ndev->link_status = NTB_LINK_DOWN;
-	pci_set_drvdata(pdev, ndev);
-	ntb_setup_debugfs(ndev);
-
-	rc = pci_enable_device(pdev);
-	if (rc)
-		goto err;
-
-	pci_set_master(ndev->pdev);
-
-	rc = ntb_device_detect(ndev);
-	if (rc)
-		goto err;
-
-	ndev->mw = kcalloc(ndev->limits.max_mw, sizeof(struct ntb_mw),
-			   GFP_KERNEL);
-	if (!ndev->mw) {
-		rc = -ENOMEM;
-		goto err1;
-	}
-
-	if (ndev->split_bar)
-		rc = pci_request_selected_regions(pdev, NTB_SPLITBAR_MASK,
-						  KBUILD_MODNAME);
-	else
-		rc = pci_request_selected_regions(pdev, NTB_BAR_MASK,
-						  KBUILD_MODNAME);
-
-	if (rc)
-		goto err2;
-
-	ndev->reg_base = pci_ioremap_bar(pdev, NTB_BAR_MMIO);
-	if (!ndev->reg_base) {
-		dev_warn(&pdev->dev, "Cannot remap BAR 0\n");
-		rc = -EIO;
-		goto err3;
-	}
-
-	for (i = 0; i < ndev->limits.max_mw; i++) {
-		ndev->mw[i].bar_sz = pci_resource_len(pdev, MW_TO_BAR(i));
-
-		/*
-		 * with the errata we need to steal last of the memory
-		 * windows for workarounds and they point to MMIO registers.
-		 */
-		if ((ndev->wa_flags & WA_SNB_ERR) &&
-		    (i == (ndev->limits.max_mw - 1))) {
-			ndev->mw[i].vbase =
-				ioremap_nocache(pci_resource_start(pdev,
-							MW_TO_BAR(i)),
-						ndev->mw[i].bar_sz);
-		} else {
-			ndev->mw[i].vbase =
-				ioremap_wc(pci_resource_start(pdev,
-							MW_TO_BAR(i)),
-					   ndev->mw[i].bar_sz);
-		}
-
-		dev_info(&pdev->dev, "MW %d size %llu\n", i,
-			 (unsigned long long) ndev->mw[i].bar_sz);
-		if (!ndev->mw[i].vbase) {
-			dev_warn(&pdev->dev, "Cannot remap BAR %d\n",
-				 MW_TO_BAR(i));
-			rc = -EIO;
-			goto err4;
-		}
-	}
-
-	rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
-	if (rc) {
-		rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
-		if (rc)
-			goto err4;
-
-		dev_warn(&pdev->dev, "Cannot DMA highmem\n");
-	}
-
-	rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
-	if (rc) {
-		rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
-		if (rc)
-			goto err4;
-
-		dev_warn(&pdev->dev, "Cannot DMA consistent highmem\n");
-	}
-
-	rc = ntb_device_setup(ndev);
-	if (rc)
-		goto err4;
-
-	rc = ntb_create_callbacks(ndev);
-	if (rc)
-		goto err5;
-
-	rc = ntb_setup_interrupts(ndev);
-	if (rc)
-		goto err6;
-
-	/* The scratchpad registers keep the values between rmmod/insmod,
-	 * blast them now
-	 */
-	for (i = 0; i < ndev->limits.max_spads; i++) {
-		ntb_write_local_spad(ndev, i, 0);
-		ntb_write_remote_spad(ndev, i, 0);
-	}
-
-	rc = ntb_transport_init(pdev);
-	if (rc)
-		goto err7;
-
-	ntb_hw_link_up(ndev);
-
-	return 0;
-
-err7:
-	ntb_free_interrupts(ndev);
-err6:
-	ntb_free_callbacks(ndev);
-err5:
-	ntb_device_free(ndev);
-err4:
-	for (i--; i >= 0; i--)
-		iounmap(ndev->mw[i].vbase);
-	iounmap(ndev->reg_base);
-err3:
-	if (ndev->split_bar)
-		pci_release_selected_regions(pdev, NTB_SPLITBAR_MASK);
-	else
-		pci_release_selected_regions(pdev, NTB_BAR_MASK);
-err2:
-	kfree(ndev->mw);
-err1:
-	pci_disable_device(pdev);
-err:
-	ntb_free_debugfs(ndev);
-	kfree(ndev);
-
-	dev_err(&pdev->dev, "Error loading %s module\n", KBUILD_MODNAME);
-	return rc;
-}
-
-static void ntb_pci_remove(struct pci_dev *pdev)
-{
-	struct ntb_device *ndev = pci_get_drvdata(pdev);
-	int i;
-
-	ntb_hw_link_down(ndev);
-
-	ntb_transport_free(ndev->ntb_transport);
-
-	ntb_free_interrupts(ndev);
-	ntb_free_callbacks(ndev);
-	ntb_device_free(ndev);
-
-	/* need to reset max_mw limits so we can unmap properly */
-	if (ndev->hw_type == SNB_HW)
-		ntb_max_mw_detect(ndev);
-
-	for (i = 0; i < ndev->limits.max_mw; i++)
-		iounmap(ndev->mw[i].vbase);
-
-	kfree(ndev->mw);
-	iounmap(ndev->reg_base);
-	if (ndev->split_bar)
-		pci_release_selected_regions(pdev, NTB_SPLITBAR_MASK);
-	else
-		pci_release_selected_regions(pdev, NTB_BAR_MASK);
-	pci_disable_device(pdev);
-	ntb_free_debugfs(ndev);
-	kfree(ndev);
-}
-
-static struct pci_driver ntb_pci_driver = {
-	.name = KBUILD_MODNAME,
-	.id_table = ntb_pci_tbl,
-	.probe = ntb_pci_probe,
-	.remove = ntb_pci_remove,
-};
-
-module_pci_driver(ntb_pci_driver);
diff --git a/drivers/ntb/ntb_hw.h b/drivers/ntb/ntb_hw.h
deleted file mode 100644
index 96de5fc95f90..000000000000
--- a/drivers/ntb/ntb_hw.h
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- *   redistributing this file, you may do so under either license.
- *
- *   GPL LICENSE SUMMARY
- *
- *   Copyright(c) 2012 Intel Corporation. All rights reserved.
- *
- *   This program is free software; you can redistribute it and/or modify
- *   it under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
- *
- *   BSD LICENSE
- *
- *   Copyright(c) 2012 Intel Corporation. All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copy
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * Intel PCIe NTB Linux driver
- *
- * Contact Information:
- * Jon Mason <jon.mason@intel.com>
- */
-#include <linux/ntb.h>
-
-#define PCI_DEVICE_ID_INTEL_NTB_B2B_JSF		0x3725
-#define PCI_DEVICE_ID_INTEL_NTB_PS_JSF		0x3726
-#define PCI_DEVICE_ID_INTEL_NTB_SS_JSF		0x3727
-#define PCI_DEVICE_ID_INTEL_NTB_B2B_SNB		0x3C0D
-#define PCI_DEVICE_ID_INTEL_NTB_PS_SNB		0x3C0E
-#define PCI_DEVICE_ID_INTEL_NTB_SS_SNB		0x3C0F
-#define PCI_DEVICE_ID_INTEL_NTB_B2B_IVT		0x0E0D
-#define PCI_DEVICE_ID_INTEL_NTB_PS_IVT		0x0E0E
-#define PCI_DEVICE_ID_INTEL_NTB_SS_IVT		0x0E0F
-#define PCI_DEVICE_ID_INTEL_NTB_B2B_HSX		0x2F0D
-#define PCI_DEVICE_ID_INTEL_NTB_PS_HSX		0x2F0E
-#define PCI_DEVICE_ID_INTEL_NTB_SS_HSX		0x2F0F
-#define PCI_DEVICE_ID_INTEL_NTB_B2B_BWD		0x0C4E
-
-#ifndef readq
-static inline u64 readq(void __iomem *addr)
-{
-	return readl(addr) | (((u64) readl(addr + 4)) << 32LL);
-}
-#endif
-
-#ifndef writeq
-static inline void writeq(u64 val, void __iomem *addr)
-{
-	writel(val & 0xffffffff, addr);
-	writel(val >> 32, addr + 4);
-}
-#endif
-
-#define NTB_BAR_MMIO		0
-#define NTB_BAR_23		2
-#define NTB_BAR_4		4
-#define NTB_BAR_5		5
-
-#define NTB_BAR_MASK		((1 << NTB_BAR_MMIO) | (1 << NTB_BAR_23) |\
-				 (1 << NTB_BAR_4))
-#define NTB_SPLITBAR_MASK	((1 << NTB_BAR_MMIO) | (1 << NTB_BAR_23) |\
-				 (1 << NTB_BAR_4) | (1 << NTB_BAR_5))
-
-#define NTB_HB_TIMEOUT		msecs_to_jiffies(1000)
-
-enum ntb_hw_event {
-	NTB_EVENT_SW_EVENT0 = 0,
-	NTB_EVENT_SW_EVENT1,
-	NTB_EVENT_SW_EVENT2,
-	NTB_EVENT_HW_ERROR,
-	NTB_EVENT_HW_LINK_UP,
-	NTB_EVENT_HW_LINK_DOWN,
-};
-
-struct ntb_mw {
-	dma_addr_t phys_addr;
-	void __iomem *vbase;
-	resource_size_t bar_sz;
-};
-
-struct ntb_db_cb {
-	int (*callback)(void *data, int db_num);
-	unsigned int db_num;
-	void *data;
-	struct ntb_device *ndev;
-	struct tasklet_struct irq_work;
-};
-
-#define WA_SNB_ERR	0x00000001
-
-struct ntb_device {
-	struct pci_dev *pdev;
-	struct msix_entry *msix_entries;
-	void __iomem *reg_base;
-	struct ntb_mw *mw;
-	struct {
-		unsigned char max_mw;
-		unsigned char max_spads;
-		unsigned char max_db_bits;
-		unsigned char msix_cnt;
-	} limits;
-	struct {
-		void __iomem *ldb;
-		void __iomem *ldb_mask;
-		void __iomem *rdb;
-		void __iomem *bar2_xlat;
-		void __iomem *bar4_xlat;
-		void __iomem *bar5_xlat;
-		void __iomem *spad_write;
-		void __iomem *spad_read;
-		void __iomem *lnk_cntl;
-		void __iomem *lnk_stat;
-		void __iomem *spci_cmd;
-	} reg_ofs;
-	struct ntb_transport *ntb_transport;
-	void (*event_cb)(void *handle, enum ntb_hw_event event);
-
-	struct ntb_db_cb *db_cb;
-	unsigned char hw_type;
-	unsigned char conn_type;
-	unsigned char dev_type;
-	unsigned char num_msix;
-	unsigned char bits_per_vector;
-	unsigned char max_cbs;
-	unsigned char link_width;
-	unsigned char link_speed;
-	unsigned char link_status;
-	unsigned char split_bar;
-
-	struct delayed_work hb_timer;
-	unsigned long last_ts;
-
-	struct delayed_work lr_timer;
-
-	struct dentry *debugfs_dir;
-	struct dentry *debugfs_info;
-
-	unsigned int wa_flags;
-};
-
-/**
- * ntb_max_cbs() - return the max callbacks
- * @ndev: pointer to ntb_device instance
- *
- * Given the ntb pointer, return the maximum number of callbacks
- *
- * RETURNS: the maximum number of callbacks
- */
-static inline unsigned char ntb_max_cbs(struct ntb_device *ndev)
-{
-	return ndev->max_cbs;
-}
-
-/**
- * ntb_max_mw() - return the max number of memory windows
- * @ndev: pointer to ntb_device instance
- *
- * Given the ntb pointer, return the maximum number of memory windows
- *
- * RETURNS: the maximum number of memory windows
- */
-static inline unsigned char ntb_max_mw(struct ntb_device *ndev)
-{
-	return ndev->limits.max_mw;
-}
-
-/**
- * ntb_hw_link_status() - return the hardware link status
- * @ndev: pointer to ntb_device instance
- *
- * Returns true if the hardware is connected to the remote system
- *
- * RETURNS: true or false based on the hardware link state
- */
-static inline bool ntb_hw_link_status(struct ntb_device *ndev)
-{
-	return ndev->link_status == NTB_LINK_UP;
-}
-
-/**
- * ntb_query_pdev() - return the pci_dev pointer
- * @ndev: pointer to ntb_device instance
- *
- * Given the ntb pointer, return the pci_dev pointer for the NTB hardware device
- *
- * RETURNS: a pointer to the ntb pci_dev
- */
-static inline struct pci_dev *ntb_query_pdev(struct ntb_device *ndev)
-{
-	return ndev->pdev;
-}
-
-/**
- * ntb_query_debugfs() - return the debugfs pointer
- * @ndev: pointer to ntb_device instance
- *
- * Given the ntb pointer, return the debugfs directory pointer for the NTB
- * hardware device
- *
- * RETURNS: a pointer to the debugfs directory
- */
-static inline struct dentry *ntb_query_debugfs(struct ntb_device *ndev)
-{
-	return ndev->debugfs_dir;
-}
-
-struct ntb_device *ntb_register_transport(struct pci_dev *pdev,
-					  void *transport);
-void ntb_unregister_transport(struct ntb_device *ndev);
-void ntb_set_mw_addr(struct ntb_device *ndev, unsigned int mw, u64 addr);
-int ntb_register_db_callback(struct ntb_device *ndev, unsigned int idx,
-			     void *data, int (*db_cb_func)(void *data,
-							   int db_num));
-void ntb_unregister_db_callback(struct ntb_device *ndev, unsigned int idx);
-int ntb_register_event_callback(struct ntb_device *ndev,
-				void (*event_cb_func)(void *handle,
-						      enum ntb_hw_event event));
-void ntb_unregister_event_callback(struct ntb_device *ndev);
-int ntb_get_max_spads(struct ntb_device *ndev);
-int ntb_write_local_spad(struct ntb_device *ndev, unsigned int idx, u32 val);
-int ntb_read_local_spad(struct ntb_device *ndev, unsigned int idx, u32 *val);
-int ntb_write_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 val);
-int ntb_read_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 *val);
-resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned int mw);
-void __iomem *ntb_get_mw_vbase(struct ntb_device *ndev, unsigned int mw);
-u64 ntb_get_mw_size(struct ntb_device *ndev, unsigned int mw);
-void ntb_ring_doorbell(struct ntb_device *ndev, unsigned int idx);
-void *ntb_find_transport(struct pci_dev *pdev);
-
-int ntb_transport_init(struct pci_dev *pdev);
-void ntb_transport_free(void *transport);
diff --git a/drivers/ntb/ntb_regs.h b/drivers/ntb/ntb_regs.h
deleted file mode 100644
index f028ff81fd77..000000000000
--- a/drivers/ntb/ntb_regs.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- *   redistributing this file, you may do so under either license.
- *
- *   GPL LICENSE SUMMARY
- *
- *   Copyright(c) 2012 Intel Corporation. All rights reserved.
- *
- *   This program is free software; you can redistribute it and/or modify
- *   it under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
- *
- *   BSD LICENSE
- *
- *   Copyright(c) 2012 Intel Corporation. All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copy
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * Intel PCIe NTB Linux driver
- *
- * Contact Information:
- * Jon Mason <jon.mason@intel.com>
- */
-
-#define NTB_LINK_STATUS_ACTIVE	0x2000
-#define NTB_LINK_SPEED_MASK	0x000f
-#define NTB_LINK_WIDTH_MASK	0x03f0
-
-#define SNB_MSIX_CNT		4
-#define SNB_MAX_B2B_SPADS	16
-#define SNB_MAX_COMPAT_SPADS	16
-/* Reserve the uppermost bit for link interrupt */
-#define SNB_MAX_DB_BITS		15
-#define SNB_LINK_DB		15
-#define SNB_DB_BITS_PER_VEC	5
-#define HSX_SPLITBAR_MAX_MW	3
-#define SNB_MAX_MW		2
-#define SNB_ERRATA_MAX_MW	1
-
-#define SNB_DB_HW_LINK		0x8000
-
-#define SNB_UNCERRSTS_OFFSET	0x014C
-#define SNB_CORERRSTS_OFFSET	0x0158
-#define SNB_LINK_STATUS_OFFSET	0x01A2
-#define SNB_PCICMD_OFFSET	0x0504
-#define SNB_DEVCTRL_OFFSET	0x0598
-#define SNB_DEVSTS_OFFSET	0x059A
-#define SNB_SLINK_STATUS_OFFSET	0x05A2
-
-#define SNB_PBAR2LMT_OFFSET	0x0000
-#define SNB_PBAR4LMT_OFFSET	0x0008
-#define SNB_PBAR5LMT_OFFSET	0x000C
-#define SNB_PBAR2XLAT_OFFSET	0x0010
-#define SNB_PBAR4XLAT_OFFSET	0x0018
-#define SNB_PBAR5XLAT_OFFSET	0x001C
-#define SNB_SBAR2LMT_OFFSET	0x0020
-#define SNB_SBAR4LMT_OFFSET	0x0028
-#define SNB_SBAR5LMT_OFFSET	0x002C
-#define SNB_SBAR2XLAT_OFFSET	0x0030
-#define SNB_SBAR4XLAT_OFFSET	0x0038
-#define SNB_SBAR5XLAT_OFFSET	0x003C
-#define SNB_SBAR0BASE_OFFSET	0x0040
-#define SNB_SBAR2BASE_OFFSET	0x0048
-#define SNB_SBAR4BASE_OFFSET	0x0050
-#define SNB_SBAR5BASE_OFFSET	0x0054
-#define SNB_NTBCNTL_OFFSET	0x0058
-#define SNB_SBDF_OFFSET		0x005C
-#define SNB_PDOORBELL_OFFSET	0x0060
-#define SNB_PDBMSK_OFFSET	0x0062
-#define SNB_SDOORBELL_OFFSET	0x0064
-#define SNB_SDBMSK_OFFSET	0x0066
-#define SNB_USMEMMISS_OFFSET	0x0070
-#define SNB_SPAD_OFFSET		0x0080
-#define SNB_SPADSEMA4_OFFSET	0x00c0
-#define SNB_WCCNTRL_OFFSET	0x00e0
-#define SNB_B2B_SPAD_OFFSET	0x0100
-#define SNB_B2B_DOORBELL_OFFSET	0x0140
-#define SNB_B2B_XLAT_OFFSETL	0x0144
-#define SNB_B2B_XLAT_OFFSETU	0x0148
-
-/*
- * The addresses are setup so the 32bit BARs can function. Thus
- * the addresses are all in 32bit space
- */
-#define SNB_MBAR01_USD_ADDR	0x000000002100000CULL
-#define SNB_MBAR23_USD_ADDR	0x000000004100000CULL
-#define SNB_MBAR4_USD_ADDR	0x000000008100000CULL
-#define SNB_MBAR5_USD_ADDR	0x00000000A100000CULL
-#define SNB_MBAR01_DSD_ADDR	0x000000002000000CULL
-#define SNB_MBAR23_DSD_ADDR	0x000000004000000CULL
-#define SNB_MBAR4_DSD_ADDR	0x000000008000000CULL
-#define SNB_MBAR5_DSD_ADDR	0x00000000A000000CULL
-
-#define BWD_MSIX_CNT		34
-#define BWD_MAX_SPADS		16
-#define BWD_MAX_DB_BITS		34
-#define BWD_DB_BITS_PER_VEC	1
-#define BWD_MAX_MW		2
-
-#define BWD_PCICMD_OFFSET	0xb004
-#define BWD_MBAR23_OFFSET	0xb018
-#define BWD_MBAR45_OFFSET	0xb020
-#define BWD_DEVCTRL_OFFSET	0xb048
-#define BWD_LINK_STATUS_OFFSET	0xb052
-#define BWD_ERRCORSTS_OFFSET	0xb110
-
-#define BWD_SBAR2XLAT_OFFSET	0x0008
-#define BWD_SBAR4XLAT_OFFSET	0x0010
-#define BWD_PDOORBELL_OFFSET	0x0020
-#define BWD_PDBMSK_OFFSET	0x0028
-#define BWD_NTBCNTL_OFFSET	0x0060
-#define BWD_EBDF_OFFSET		0x0064
-#define BWD_SPAD_OFFSET		0x0080
-#define BWD_SPADSEMA_OFFSET	0x00c0
-#define BWD_STKYSPAD_OFFSET	0x00c4
-#define BWD_PBAR2XLAT_OFFSET	0x8008
-#define BWD_PBAR4XLAT_OFFSET	0x8010
-#define BWD_B2B_DOORBELL_OFFSET	0x8020
-#define BWD_B2B_SPAD_OFFSET	0x8080
-#define BWD_B2B_SPADSEMA_OFFSET	0x80c0
-#define BWD_B2B_STKYSPAD_OFFSET	0x80c4
-
-#define BWD_MODPHY_PCSREG4	0x1c004
-#define BWD_MODPHY_PCSREG6	0x1c006
-
-#define BWD_IP_BASE		0xC000
-#define BWD_DESKEWSTS_OFFSET	(BWD_IP_BASE + 0x3024)
-#define BWD_LTSSMERRSTS0_OFFSET (BWD_IP_BASE + 0x3180)
-#define BWD_LTSSMSTATEJMP_OFFSET	(BWD_IP_BASE + 0x3040)
-#define BWD_IBSTERRRCRVSTS0_OFFSET	(BWD_IP_BASE + 0x3324)
-
-#define BWD_DESKEWSTS_DBERR	(1 << 15)
-#define BWD_LTSSMERRSTS0_UNEXPECTEDEI	(1 << 20)
-#define BWD_LTSSMSTATEJMP_FORCEDETECT	(1 << 2)
-#define BWD_IBIST_ERR_OFLOW	0x7FFF7FFF
-
-#define NTB_CNTL_CFG_LOCK		(1 << 0)
-#define NTB_CNTL_LINK_DISABLE		(1 << 1)
-#define NTB_CNTL_S2P_BAR23_SNOOP	(1 << 2)
-#define NTB_CNTL_P2S_BAR23_SNOOP	(1 << 4)
-#define NTB_CNTL_S2P_BAR4_SNOOP	(1 << 6)
-#define NTB_CNTL_P2S_BAR4_SNOOP	(1 << 8)
-#define NTB_CNTL_S2P_BAR5_SNOOP	(1 << 12)
-#define NTB_CNTL_P2S_BAR5_SNOOP	(1 << 14)
-#define BWD_CNTL_LINK_DOWN		(1 << 16)
-
-#define NTB_PPD_OFFSET		0x00D4
-#define SNB_PPD_CONN_TYPE	0x0003
-#define SNB_PPD_DEV_TYPE	0x0010
-#define SNB_PPD_SPLIT_BAR	(1 << 6)
-#define BWD_PPD_INIT_LINK	0x0008
-#define BWD_PPD_CONN_TYPE	0x0300
-#define BWD_PPD_DEV_TYPE	0x1000
diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
index e9bf2f47b61a..c5f26cda9f97 100644
--- a/drivers/ntb/ntb_transport.c
+++ b/drivers/ntb/ntb_transport.c
@@ -56,7 +56,7 @@
 #include <linux/pci.h>
 #include <linux/slab.h>
 #include <linux/types.h>
-#include "ntb_hw.h"
+#include "hw/intel/ntb_hw_intel.h"
 
 #define NTB_TRANSPORT_VERSION	3
 
@@ -360,14 +360,14 @@ err:
 EXPORT_SYMBOL_GPL(ntb_register_client_dev);
 
 /**
- * ntb_register_client - Register NTB client driver
+ * ntb_transport_register_client - Register NTB client driver
  * @drv: NTB client driver to be registered
  *
  * Register an NTB client driver with the NTB transport layer
  *
  * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
  */
-int ntb_register_client(struct ntb_client *drv)
+int ntb_transport_register_client(struct ntb_client *drv)
 {
 	drv->driver.bus = &ntb_bus_type;
 
@@ -376,21 +376,21 @@ int ntb_register_client(struct ntb_client *drv)
 
 	return driver_register(&drv->driver);
 }
-EXPORT_SYMBOL_GPL(ntb_register_client);
+EXPORT_SYMBOL_GPL(ntb_transport_register_client);
 
 /**
- * ntb_unregister_client - Unregister NTB client driver
+ * ntb_transport_unregister_client - Unregister NTB client driver
  * @drv: NTB client driver to be unregistered
  *
  * Unregister an NTB client driver with the NTB transport layer
  *
  * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
  */
-void ntb_unregister_client(struct ntb_client *drv)
+void ntb_transport_unregister_client(struct ntb_client *drv)
 {
 	driver_unregister(&drv->driver);
 }
-EXPORT_SYMBOL_GPL(ntb_unregister_client);
+EXPORT_SYMBOL_GPL(ntb_transport_unregister_client);
 
 static ssize_t debugfs_read(struct file *filp, char __user *ubuf, size_t count,
 			    loff_t *offp)
diff --git a/include/linux/ntb.h b/include/linux/ntb.h
deleted file mode 100644
index 9ac1a62fc6f5..000000000000
--- a/include/linux/ntb.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * This file is provided under a dual BSD/GPLv2 license.  When using or
- *   redistributing this file, you may do so under either license.
- *
- *   GPL LICENSE SUMMARY
- *
- *   Copyright(c) 2012 Intel Corporation. All rights reserved.
- *
- *   This program is free software; you can redistribute it and/or modify
- *   it under the terms of version 2 of the GNU General Public License as
- *   published by the Free Software Foundation.
- *
- *   BSD LICENSE
- *
- *   Copyright(c) 2012 Intel Corporation. All rights reserved.
- *
- *   Redistribution and use in source and binary forms, with or without
- *   modification, are permitted provided that the following conditions
- *   are met:
- *
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copy
- *       notice, this list of conditions and the following disclaimer in
- *       the documentation and/or other materials provided with the
- *       distribution.
- *     * Neither the name of Intel Corporation nor the names of its
- *       contributors may be used to endorse or promote products derived
- *       from this software without specific prior written permission.
- *
- *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- * Intel PCIe NTB Linux driver
- *
- * Contact Information:
- * Jon Mason <jon.mason@intel.com>
- */
-
-struct ntb_transport_qp;
-
-struct ntb_client {
-	struct device_driver driver;
-	int (*probe)(struct pci_dev *pdev);
-	void (*remove)(struct pci_dev *pdev);
-};
-
-enum {
-	NTB_LINK_DOWN = 0,
-	NTB_LINK_UP,
-};
-
-int ntb_register_client(struct ntb_client *drvr);
-void ntb_unregister_client(struct ntb_client *drvr);
-int ntb_register_client_dev(char *device_name);
-void ntb_unregister_client_dev(char *device_name);
-
-struct ntb_queue_handlers {
-	void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data,
-			   void *data, int len);
-	void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data,
-			   void *data, int len);
-	void (*event_handler)(void *data, int status);
-};
-
-unsigned char ntb_transport_qp_num(struct ntb_transport_qp *qp);
-unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp);
-struct ntb_transport_qp *
-ntb_transport_create_queue(void *data, struct pci_dev *pdev,
-			   const struct ntb_queue_handlers *handlers);
-void ntb_transport_free_queue(struct ntb_transport_qp *qp);
-int ntb_transport_rx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
-			     unsigned int len);
-int ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
-			     unsigned int len);
-void *ntb_transport_rx_remove(struct ntb_transport_qp *qp, unsigned int *len);
-void ntb_transport_link_up(struct ntb_transport_qp *qp);
-void ntb_transport_link_down(struct ntb_transport_qp *qp);
-bool ntb_transport_link_query(struct ntb_transport_qp *qp);
diff --git a/include/linux/ntb_transport.h b/include/linux/ntb_transport.h
new file mode 100644
index 000000000000..f78b64cc09d5
--- /dev/null
+++ b/include/linux/ntb_transport.h
@@ -0,0 +1,88 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copy
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Intel PCIe NTB Linux driver
+ *
+ * Contact Information:
+ * Jon Mason <jon.mason@intel.com>
+ */
+
+struct ntb_transport_qp;
+
+struct ntb_client {
+	struct device_driver driver;
+	int (*probe)(struct pci_dev *pdev);
+	void (*remove)(struct pci_dev *pdev);
+};
+
+enum {
+	NTB_LINK_DOWN = 0,
+	NTB_LINK_UP,
+};
+
+int ntb_transport_register_client(struct ntb_client *drvr);
+void ntb_transport_unregister_client(struct ntb_client *drvr);
+int ntb_register_client_dev(char *device_name);
+void ntb_unregister_client_dev(char *device_name);
+
+struct ntb_queue_handlers {
+	void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data,
+			   void *data, int len);
+	void (*tx_handler)(struct ntb_transport_qp *qp, void *qp_data,
+			   void *data, int len);
+	void (*event_handler)(void *data, int status);
+};
+
+unsigned char ntb_transport_qp_num(struct ntb_transport_qp *qp);
+unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp);
+struct ntb_transport_qp *
+ntb_transport_create_queue(void *data, struct pci_dev *pdev,
+			   const struct ntb_queue_handlers *handlers);
+void ntb_transport_free_queue(struct ntb_transport_qp *qp);
+int ntb_transport_rx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
+			     unsigned int len);
+int ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
+			     unsigned int len);
+void *ntb_transport_rx_remove(struct ntb_transport_qp *qp, unsigned int *len);
+void ntb_transport_link_up(struct ntb_transport_qp *qp);
+void ntb_transport_link_down(struct ntb_transport_qp *qp);
+bool ntb_transport_link_query(struct ntb_transport_qp *qp);
-- 
cgit v1.2.3


From a13f35e8714009145e32ebe2bf25b84e1376e314 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 2 Jul 2015 08:44:34 -0600
Subject: writeback: don't embed root bdi_writeback_congested in bdi_writeback

52ebea749aae ("writeback: make backing_dev_info host cgroup-specific
bdi_writebacks") made bdi (backing_dev_info) host per-cgroup wb's
(bdi_writeback's).  As the congested state needs to be per-wb and
referenced from blkcg side and multiple wbs, the patch made all
non-root cong's (bdi_writeback_congested's) reference counted and
indexed on bdi.

When a bdi is destroyed, cgwb_bdi_destroy() tries to drain all
non-root cong's; however, this can hang indefinitely because wb's can
also be referenced from blkcg_gq's which are destroyed after bdi
destruction is complete.

To fix the bug, bdi destruction will be updated to not wait for cong's
to drain, which naturally means that cong's may outlive the associated
bdi.  This is fine for non-root cong's but is problematic for the root
cong's which are embedded in their bdi's as they may end up getting
dereferenced after the containing bdi's are freed.

This patch makes root cong's behave the same as non-root cong's.  They
are no longer embedded in their bdi's but allocated separately during
bdi initialization, indexed and reference counted the same way.

* As cong handling is the same for all wb's, wb->congested
  initialization is moved into wb_init().

* When !CONFIG_CGROUP_WRITEBACK, there was no indexing or refcnting.
  bdi->wb_congested is now a pointer pointing to the root cong
  allocated during bdi init and minimal refcnting operations are
  implemented.

* The above makes root wb init paths diverge depending on
  CONFIG_CGROUP_WRITEBACK.  root wb init is moved to cgwb_bdi_init().

This patch in itself shouldn't cause any consequential behavior
differences but prepares for the actual fix.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jon Christopherson <jon@jons.org>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=100681
Tested-by: Jon Christopherson <jon@jons.org>

Added <linux/slab.h> include to backing-dev.h for kfree() definition.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/backing-dev-defs.h |  5 ++-
 include/linux/backing-dev.h      |  6 ++-
 mm/backing-dev.c                 | 87 +++++++++++++++++++++-------------------
 3 files changed, 54 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index a48d90e3bcbb..a23209b43842 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -50,10 +50,10 @@ enum wb_stat_item {
  */
 struct bdi_writeback_congested {
 	unsigned long state;		/* WB_[a]sync_congested flags */
+	atomic_t refcnt;		/* nr of attached wb's and blkg */
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct backing_dev_info *bdi;	/* the associated bdi */
-	atomic_t refcnt;		/* nr of attached wb's and blkg */
 	int blkcg_id;			/* ID of the associated blkcg */
 	struct rb_node rb_node;		/* on bdi->cgwb_congestion_tree */
 #endif
@@ -150,11 +150,12 @@ struct backing_dev_info {
 	atomic_long_t tot_write_bandwidth;
 
 	struct bdi_writeback wb;  /* the root writeback info for this bdi */
-	struct bdi_writeback_congested wb_congested; /* its congested state */
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
 	struct rb_root cgwb_congested_tree; /* their congested states */
 	atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */
+#else
+	struct bdi_writeback_congested *wb_congested;
 #endif
 	wait_queue_head_t wb_waitq;
 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 0e6d4828a77a..0fe9df983ab7 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -15,6 +15,7 @@
 #include <linux/writeback.h>
 #include <linux/blk-cgroup.h>
 #include <linux/backing-dev-defs.h>
+#include <linux/slab.h>
 
 int __must_check bdi_init(struct backing_dev_info *bdi);
 void bdi_destroy(struct backing_dev_info *bdi);
@@ -465,11 +466,14 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
 static inline struct bdi_writeback_congested *
 wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
 {
-	return bdi->wb.congested;
+	atomic_inc(&bdi->wb_congested->refcnt);
+	return bdi->wb_congested;
 }
 
 static inline void wb_congested_put(struct bdi_writeback_congested *congested)
 {
+	if (atomic_dec_and_test(&congested->refcnt))
+		kfree(congested);
 }
 
 static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7756da31b02b..51cc461e7256 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -287,7 +287,7 @@ void wb_wakeup_delayed(struct bdi_writeback *wb)
 #define INIT_BW		(100 << (20 - PAGE_SHIFT))
 
 static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
-		   gfp_t gfp)
+		   int blkcg_id, gfp_t gfp)
 {
 	int i, err;
 
@@ -311,21 +311,29 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
 	INIT_LIST_HEAD(&wb->work_list);
 	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
 
+	wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
+	if (!wb->congested)
+		return -ENOMEM;
+
 	err = fprop_local_init_percpu(&wb->completions, gfp);
 	if (err)
-		return err;
+		goto out_put_cong;
 
 	for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
 		err = percpu_counter_init(&wb->stat[i], 0, gfp);
-		if (err) {
-			while (--i)
-				percpu_counter_destroy(&wb->stat[i]);
-			fprop_local_destroy_percpu(&wb->completions);
-			return err;
-		}
+		if (err)
+			goto out_destroy_stat;
 	}
 
 	return 0;
+
+out_destroy_stat:
+	while (--i)
+		percpu_counter_destroy(&wb->stat[i]);
+	fprop_local_destroy_percpu(&wb->completions);
+out_put_cong:
+	wb_congested_put(wb->congested);
+	return err;
 }
 
 /*
@@ -361,6 +369,7 @@ static void wb_exit(struct bdi_writeback *wb)
 		percpu_counter_destroy(&wb->stat[i]);
 
 	fprop_local_destroy_percpu(&wb->completions);
+	wb_congested_put(wb->congested);
 }
 
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -392,9 +401,6 @@ wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
 	struct bdi_writeback_congested *new_congested = NULL, *congested;
 	struct rb_node **node, *parent;
 	unsigned long flags;
-
-	if (blkcg_id == 1)
-		return &bdi->wb_congested;
 retry:
 	spin_lock_irqsave(&cgwb_lock, flags);
 
@@ -453,9 +459,6 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
 	struct backing_dev_info *bdi = congested->bdi;
 	unsigned long flags;
 
-	if (congested->blkcg_id == 1)
-		return;
-
 	local_irq_save(flags);
 	if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
 		local_irq_restore(flags);
@@ -480,7 +483,6 @@ static void cgwb_release_workfn(struct work_struct *work)
 
 	css_put(wb->memcg_css);
 	css_put(wb->blkcg_css);
-	wb_congested_put(wb->congested);
 
 	fprop_local_destroy_percpu(&wb->memcg_completions);
 	percpu_ref_exit(&wb->refcnt);
@@ -541,7 +543,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
 	if (!wb)
 		return -ENOMEM;
 
-	ret = wb_init(wb, bdi, gfp);
+	ret = wb_init(wb, bdi, blkcg_css->id, gfp);
 	if (ret)
 		goto err_free;
 
@@ -553,12 +555,6 @@ static int cgwb_create(struct backing_dev_info *bdi,
 	if (ret)
 		goto err_ref_exit;
 
-	wb->congested = wb_congested_get_create(bdi, blkcg_css->id, gfp);
-	if (!wb->congested) {
-		ret = -ENOMEM;
-		goto err_fprop_exit;
-	}
-
 	wb->memcg_css = memcg_css;
 	wb->blkcg_css = blkcg_css;
 	INIT_WORK(&wb->release_work, cgwb_release_workfn);
@@ -588,12 +584,10 @@ static int cgwb_create(struct backing_dev_info *bdi,
 	if (ret) {
 		if (ret == -EEXIST)
 			ret = 0;
-		goto err_put_congested;
+		goto err_fprop_exit;
 	}
 	goto out_put;
 
-err_put_congested:
-	wb_congested_put(wb->congested);
 err_fprop_exit:
 	fprop_local_destroy_percpu(&wb->memcg_completions);
 err_ref_exit:
@@ -662,14 +656,20 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
 	return wb;
 }
 
-static void cgwb_bdi_init(struct backing_dev_info *bdi)
+static int cgwb_bdi_init(struct backing_dev_info *bdi)
 {
-	bdi->wb.memcg_css = mem_cgroup_root_css;
-	bdi->wb.blkcg_css = blkcg_root_css;
-	bdi->wb_congested.blkcg_id = 1;
+	int ret;
+
 	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
 	bdi->cgwb_congested_tree = RB_ROOT;
 	atomic_set(&bdi->usage_cnt, 1);
+
+	ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
+	if (!ret) {
+		bdi->wb.memcg_css = mem_cgroup_root_css;
+		bdi->wb.blkcg_css = blkcg_root_css;
+	}
+	return ret;
 }
 
 static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
@@ -732,15 +732,28 @@ void wb_blkcg_offline(struct blkcg *blkcg)
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
-static void cgwb_bdi_init(struct backing_dev_info *bdi) { }
+static int cgwb_bdi_init(struct backing_dev_info *bdi)
+{
+	int err;
+
+	bdi->wb_congested = kzalloc(sizeof(*bdi->wb_congested), GFP_KERNEL);
+	if (!bdi->wb_congested)
+		return -ENOMEM;
+
+	err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
+	if (err) {
+		kfree(bdi->wb_congested);
+		return err;
+	}
+	return 0;
+}
+
 static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
 
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
 int bdi_init(struct backing_dev_info *bdi)
 {
-	int err;
-
 	bdi->dev = NULL;
 
 	bdi->min_ratio = 0;
@@ -749,15 +762,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	INIT_LIST_HEAD(&bdi->bdi_list);
 	init_waitqueue_head(&bdi->wb_waitq);
 
-	err = wb_init(&bdi->wb, bdi, GFP_KERNEL);
-	if (err)
-		return err;
-
-	bdi->wb_congested.state = 0;
-	bdi->wb.congested = &bdi->wb_congested;
-
-	cgwb_bdi_init(bdi);
-	return 0;
+	return cgwb_bdi_init(bdi);
 }
 EXPORT_SYMBOL(bdi_init);
 
-- 
cgit v1.2.3


From 91e20b5040c67c51aad88cf87db4305c5bd7f79d Mon Sep 17 00:00:00 2001
From: Joel Porquet <joel@porquet.org>
Date: Thu, 2 Jul 2015 15:32:00 -0400
Subject: irqchip: Move IRQCHIP_DECLARE macro to include/linux/irqchip.h

At the moment the IRQCHIP_DECLARE macro is only declared locally in
drivers/irqchip/irqchip.h. It prevents from using it directly in arch/*
directories whenever irqchip drivers only exist there, which happens in a few
cases (e.g. arc, arm, microblaze and mips).

This patch makes the macro to be globally defined, i.e. in
include/linux/irqchip.h, and thus usable for arch-specific declarations of
irqchip drivers. In this way, it is very similar to what clocksource does (ie
CLOCKSOURCE_OF_DECLARE is defined in include/linux/clocksource.h).

For now, this patch only moves the declaration of the macro
IRQCHIP_DECLARE to the global header 'include/linux/irqchip.h' and make
'drivers/irqchip/irqchip.h' include 'include/linux/irqchip.h'. Later, other
patches will get rid of 'drivers/irqchip/irqchip.h' and modify all the impacted
irqchip drivers.

Signed-off-by: Joel Porquet <joel@porquet.org>
Cc: Jason Cooper <jason@lakedaemon.net>
Link: http://lkml.kernel.org/r/1435865565-14114-1-git-send-email-joel@porquet.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/irqchip/irqchip.h | 19 +------------------
 include/linux/irqchip.h   | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/irqchip/irqchip.h b/drivers/irqchip/irqchip.h
index 0f6486d4f1b0..0f67ae32464f 100644
--- a/drivers/irqchip/irqchip.h
+++ b/drivers/irqchip/irqchip.h
@@ -8,21 +8,4 @@
  * warranty of any kind, whether express or implied.
  */
 
-#ifndef _IRQCHIP_H
-#define _IRQCHIP_H
-
-#include <linux/of.h>
-
-/*
- * This macro must be used by the different irqchip drivers to declare
- * the association between their DT compatible string and their
- * initialization function.
- *
- * @name: name that must be unique accross all IRQCHIP_DECLARE of the
- * same file.
- * @compstr: compatible string of the irqchip driver
- * @fn: initialization function
- */
-#define IRQCHIP_DECLARE(name, compat, fn) OF_DECLARE_2(irqchip, name, compat, fn)
-
-#endif
+#include <linux/irqchip.h>
diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h
index 14d79131f53d..638887376e58 100644
--- a/include/linux/irqchip.h
+++ b/include/linux/irqchip.h
@@ -11,6 +11,20 @@
 #ifndef _LINUX_IRQCHIP_H
 #define _LINUX_IRQCHIP_H
 
+#include <linux/of.h>
+
+/*
+ * This macro must be used by the different irqchip drivers to declare
+ * the association between their DT compatible string and their
+ * initialization function.
+ *
+ * @name: name that must be unique accross all IRQCHIP_DECLARE of the
+ * same file.
+ * @compstr: compatible string of the irqchip driver
+ * @fn: initialization function
+ */
+#define IRQCHIP_DECLARE(name, compat, fn) OF_DECLARE_2(irqchip, name, compat, fn)
+
 #ifdef CONFIG_IRQCHIP
 void irqchip_init(void);
 #else
-- 
cgit v1.2.3


From 2ecd9d29abb171d6e97a4f3eb29d7456a11401b7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 3 Jul 2015 18:53:58 +0200
Subject: sched, preempt_notifier: separate notifier registration from
 static_key inc/dec

Commit 1cde2930e154 ("sched/preempt: Add static_key() to preempt_notifiers")
had two problems.  First, the preempt-notifier API needs to sleep with the
addition of the static_key, we do however need to hold off preemption
while modifying the preempt notifier list, otherwise a preemption could
observe an inconsistent list state.  KVM correctly registers and
unregisters preempt notifiers with preemption disabled, so the sleep
caused dmesg splats.

Second, KVM registers and unregisters preemption notifiers very often
(in vcpu_load/vcpu_put).  With a single uniprocessor guest the static key
would move between 0 and 1 continuously, hitting the slow path on every
userspace exit.

To fix this, wrap the static_key inc/dec in a new API, and call it from
KVM.

Fixes: 1cde2930e154 ("sched/preempt: Add static_key() to preempt_notifiers")
Reported-by: Pontus Fuchs <pontus.fuchs@gmail.com>
Reported-by: Takashi Iwai <tiwai@suse.de>
Tested-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/preempt.h |  2 ++
 kernel/sched/core.c     | 17 +++++++++++++++--
 virt/kvm/kvm_main.c     |  3 +++
 3 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 0f1534acaf60..84991f185173 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -293,6 +293,8 @@ struct preempt_notifier {
 	struct preempt_ops *ops;
 };
 
+void preempt_notifier_inc(void);
+void preempt_notifier_dec(void);
 void preempt_notifier_register(struct preempt_notifier *notifier);
 void preempt_notifier_unregister(struct preempt_notifier *notifier);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b803e1b8ab0c..552710ab19e0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2320,13 +2320,27 @@ void wake_up_new_task(struct task_struct *p)
 
 static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
 
+void preempt_notifier_inc(void)
+{
+	static_key_slow_inc(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_inc);
+
+void preempt_notifier_dec(void)
+{
+	static_key_slow_dec(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_dec);
+
 /**
  * preempt_notifier_register - tell me when current is being preempted & rescheduled
  * @notifier: notifier struct to register
  */
 void preempt_notifier_register(struct preempt_notifier *notifier)
 {
-	static_key_slow_inc(&preempt_notifier_key);
+	if (!static_key_false(&preempt_notifier_key))
+		WARN(1, "registering preempt_notifier while notifiers disabled\n");
+
 	hlist_add_head(&notifier->link, &current->preempt_notifiers);
 }
 EXPORT_SYMBOL_GPL(preempt_notifier_register);
@@ -2340,7 +2354,6 @@ EXPORT_SYMBOL_GPL(preempt_notifier_register);
 void preempt_notifier_unregister(struct preempt_notifier *notifier)
 {
 	hlist_del(&notifier->link);
-	static_key_slow_dec(&preempt_notifier_key);
 }
 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 848af90b8091..8b8a44453670 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -553,6 +553,8 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	list_add(&kvm->vm_list, &vm_list);
 	spin_unlock(&kvm_lock);
 
+	preempt_notifier_inc();
+
 	return kvm;
 
 out_err:
@@ -620,6 +622,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	cleanup_srcu_struct(&kvm->irq_srcu);
 	cleanup_srcu_struct(&kvm->srcu);
 	kvm_arch_free_vm(kvm);
+	preempt_notifier_dec();
 	hardware_disable_all();
 	mmdrop(mm);
 }
-- 
cgit v1.2.3


From f6db8347993256b58bd4746b0c4c5b935c32210d Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Thu, 25 Jun 2015 23:53:37 +0530
Subject: sched/stat: Simplify the sched_info accounting dependency

Both CONFIG_SCHEDSTATS=y and CONFIG_TASK_DELAY_ACCT=y track task
sched_info, which results in ugly #if clauses.

Simplify the code by introducing a synthethic CONFIG_SCHED_INFO
switch, selected by both.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: a.p.zijlstra@chello.nl
Cc: ricklind@us.ibm.com
Link: http://lkml.kernel.org/r/8d19eef800811a94b0f91bcbeb27430a884d7433.1435255405.git.naveen.n.rao@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 6 +++---
 init/Kconfig          | 1 +
 kernel/sched/core.c   | 2 +-
 kernel/sched/stats.h  | 4 ++--
 lib/Kconfig.debug     | 5 +++++
 5 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6633e83e608a..9bf4bc0e3b8b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -849,7 +849,7 @@ extern struct user_struct root_user;
 struct backing_dev_info;
 struct reclaim_state;
 
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
 struct sched_info {
 	/* cumulative counters */
 	unsigned long pcount;	      /* # of times run on this cpu */
@@ -859,7 +859,7 @@ struct sched_info {
 	unsigned long long last_arrival,/* when we last ran on a cpu */
 			   last_queued;	/* when we were last queued to run */
 };
-#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
+#endif /* CONFIG_SCHED_INFO */
 
 #ifdef CONFIG_TASK_DELAY_ACCT
 struct task_delay_info {
@@ -1408,7 +1408,7 @@ struct task_struct {
 	int rcu_tasks_idle_cpu;
 #endif /* #ifdef CONFIG_TASKS_RCU */
 
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
 	struct sched_info sched_info;
 #endif
 
diff --git a/init/Kconfig b/init/Kconfig
index b999fa381bf9..12cb556ad160 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -435,6 +435,7 @@ config TASKSTATS
 config TASK_DELAY_ACCT
 	bool "Enable per-task delay accounting"
 	depends on TASKSTATS
+	select SCHED_INFO
 	help
 	  Collect information on time spent by a task waiting for system
 	  resources like cpu, synchronous block I/O completion and swapping
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c86935a7f1f8..abb8785ed1ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1975,7 +1975,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	set_task_cpu(p, cpu);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
 	if (likely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 077ebbd5e10f..b0fbc7632de5 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -47,7 +47,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 # define schedstat_set(var, val)	do { } while (0)
 #endif
 
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
 static inline void sched_info_reset_dequeued(struct task_struct *t)
 {
 	t->sched_info.last_queued = 0;
@@ -156,7 +156,7 @@ sched_info_switch(struct rq *rq,
 #define sched_info_depart(rq, t)		do { } while (0)
 #define sched_info_arrive(rq, next)		do { } while (0)
 #define sched_info_switch(rq, t, next)		do { } while (0)
-#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+#endif /* CONFIG_SCHED_INFO */
 
 /*
  * The following are functions that support scheduler-internal time accounting.
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b908048f8d6a..e2894b23efb6 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -841,9 +841,14 @@ config SCHED_DEBUG
 	  that can help debug the scheduler. The runtime overhead of this
 	  option is minimal.
 
+config SCHED_INFO
+	bool
+	default n
+
 config SCHEDSTATS
 	bool "Collect scheduler statistics"
 	depends on DEBUG_KERNEL && PROC_FS
+	select SCHED_INFO
 	help
 	  If you say Y here, additional code will be inserted into the
 	  scheduler and related routines to collect statistics about
-- 
cgit v1.2.3


From 6b55c9654fccf69ae7ace23ca101dc37b903181b Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Thu, 25 Jun 2015 22:51:41 +0530
Subject: sched/debug: Move print_cfs_rq() declaration to kernel/sched/sched.h

Currently print_cfs_rq() is declared in include/linux/sched.h.
However it's not used outside kernel/sched. Hence move the
declaration to kernel/sched/sched.h

Also some functions are only available for CONFIG_SCHED_DEBUG=y.
Hence move the declarations to within the #ifdef.

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Iulia Manda <iulia.manda21@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1435252903-1081-2-git-send-email-srikar@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 2 --
 kernel/sched/sched.h  | 5 +++++
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9bf4bc0e3b8b..3505352dd296 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -191,8 +191,6 @@ struct task_group;
 #ifdef CONFIG_SCHED_DEBUG
 extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
 extern void proc_sched_set_task(struct task_struct *p);
-extern void
-print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
 #endif
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index aea7c1f393cb..7d5895258fe3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1668,9 +1668,14 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 
 extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
 extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
+
+#ifdef	CONFIG_SCHED_DEBUG
 extern void print_cfs_stats(struct seq_file *m, int cpu);
 extern void print_rt_stats(struct seq_file *m, int cpu);
 extern void print_dl_stats(struct seq_file *m, int cpu);
+extern void
+print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
+#endif
 
 extern void init_cfs_rq(struct cfs_rq *cfs_rq);
 extern void init_rt_rq(struct rt_rq *rt_rq);
-- 
cgit v1.2.3


From a1bd3baeb2f18b2b3d0f98ce5fdaa725149b950b Mon Sep 17 00:00:00 2001
From: Allen Hubbe <Allen.Hubbe@emc.com>
Date: Thu, 9 Apr 2015 10:33:20 -0400
Subject: NTB: Add NTB hardware abstraction layer

Abstract the NTB device behind a programming interface, so that it can
support different hardware and client drivers.

Signed-off-by: Allen Hubbe <Allen.Hubbe@emc.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 Documentation/ntb.txt |  32 ++
 MAINTAINERS           |   4 +-
 drivers/ntb/Makefile  |   1 +
 drivers/ntb/ntb.c     | 251 +++++++++++++
 include/linux/ntb.h   | 984 ++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 1271 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/ntb.txt
 create mode 100644 drivers/ntb/ntb.c
 create mode 100644 include/linux/ntb.h

(limited to 'include/linux')

diff --git a/Documentation/ntb.txt b/Documentation/ntb.txt
new file mode 100644
index 000000000000..9d46dc9712a8
--- /dev/null
+++ b/Documentation/ntb.txt
@@ -0,0 +1,32 @@
+# NTB Drivers
+
+NTB (Non-Transparent Bridge) is a type of PCI-Express bridge chip that connects
+the separate memory systems of two computers to the same PCI-Express fabric.
+Existing NTB hardware supports a common feature set, including scratchpad
+registers, doorbell registers, and memory translation windows.  Scratchpad
+registers are read-and-writable registers that are accessible from either side
+of the device, so that peers can exchange a small amount of information at a
+fixed address.  Doorbell registers provide a way for peers to send interrupt
+events.  Memory windows allow translated read and write access to the peer
+memory.
+
+## NTB Core Driver (ntb)
+
+The NTB core driver defines an api wrapping the common feature set, and allows
+clients interested in NTB features to discover NTB the devices supported by
+hardware drivers.  The term "client" is used here to mean an upper layer
+component making use of the NTB api.  The term "driver," or "hardware driver,"
+is used here to mean a driver for a specific vendor and model of NTB hardware.
+
+## NTB Client Drivers
+
+NTB client drivers should register with the NTB core driver.  After
+registering, the client probe and remove functions will be called appropriately
+as ntb hardware, or hardware drivers, are inserted and removed.  The
+registration uses the Linux Device framework, so it should feel familiar to
+anyone who has written a pci driver.
+
+## NTB Hardware Drivers
+
+NTB hardware drivers should register devices with the NTB core driver.  After
+registering, clients probe and remove functions will be called.
diff --git a/MAINTAINERS b/MAINTAINERS
index 663bc8ed1860..e2fc9eec2e16 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6995,15 +6995,17 @@ F:	drivers/power/bq27x00_battery.c
 F:	drivers/power/isp1704_charger.c
 F:	drivers/power/rx51_battery.c
 
-NTB DRIVER
+NTB DRIVER CORE
 M:	Jon Mason <jdmason@kudzu.us>
 M:	Dave Jiang <dave.jiang@intel.com>
+M:	Allen Hubbe <Allen.Hubbe@emc.com>
 S:	Supported
 W:	https://github.com/jonmason/ntb/wiki
 T:	git git://github.com/jonmason/ntb.git
 F:	drivers/ntb/
 F:	drivers/net/ntb_netdev.c
 F:	include/linux/ntb.h
+F:	include/linux/ntb_transport.h
 
 NTFS FILESYSTEM
 M:	Anton Altaparmakov <anton@tuxera.com>
diff --git a/drivers/ntb/Makefile b/drivers/ntb/Makefile
index 545b10a131af..712e953a8fda 100644
--- a/drivers/ntb/Makefile
+++ b/drivers/ntb/Makefile
@@ -1,3 +1,4 @@
+obj-$(CONFIG_NTB) += ntb.o
 obj-$(CONFIG_NTB) += ntb_hw_intel.o
 
 ntb_hw_intel-objs := hw/intel/ntb_hw_intel.o ntb_transport.o
diff --git a/drivers/ntb/ntb.c b/drivers/ntb/ntb.c
new file mode 100644
index 000000000000..23435f2a5486
--- /dev/null
+++ b/drivers/ntb/ntb.c
@@ -0,0 +1,251 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *   General Public License for more details.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copy
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * PCIe NTB Linux driver
+ *
+ * Contact Information:
+ * Allen Hubbe <Allen.Hubbe@emc.com>
+ */
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <linux/ntb.h>
+#include <linux/pci.h>
+
+#define DRIVER_NAME			"ntb"
+#define DRIVER_DESCRIPTION		"PCIe NTB Driver Framework"
+
+#define DRIVER_LICENSE			"Dual BSD/GPL"
+#define DRIVER_VERSION			"1.0"
+#define DRIVER_RELDATE			"24 March 2015"
+#define DRIVER_AUTHOR			"Allen Hubbe <Allen.Hubbe@emc.com>"
+
+MODULE_LICENSE(DRIVER_LICENSE);
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESCRIPTION);
+
+static struct bus_type ntb_bus;
+static void ntb_dev_release(struct device *dev);
+
+int __ntb_register_client(struct ntb_client *client, struct module *mod,
+			  const char *mod_name)
+{
+	if (!client)
+		return -EINVAL;
+	if (!ntb_client_ops_is_valid(&client->ops))
+		return -EINVAL;
+
+	memset(&client->drv, 0, sizeof(client->drv));
+	client->drv.bus = &ntb_bus;
+	client->drv.name = mod_name;
+	client->drv.owner = mod;
+
+	return driver_register(&client->drv);
+}
+EXPORT_SYMBOL(__ntb_register_client);
+
+void ntb_unregister_client(struct ntb_client *client)
+{
+	driver_unregister(&client->drv);
+}
+EXPORT_SYMBOL(ntb_unregister_client);
+
+int ntb_register_device(struct ntb_dev *ntb)
+{
+	if (!ntb)
+		return -EINVAL;
+	if (!ntb->pdev)
+		return -EINVAL;
+	if (!ntb->ops)
+		return -EINVAL;
+	if (!ntb_dev_ops_is_valid(ntb->ops))
+		return -EINVAL;
+
+	init_completion(&ntb->released);
+
+	memset(&ntb->dev, 0, sizeof(ntb->dev));
+	ntb->dev.bus = &ntb_bus;
+	ntb->dev.parent = &ntb->pdev->dev;
+	ntb->dev.release = ntb_dev_release;
+	dev_set_name(&ntb->dev, pci_name(ntb->pdev));
+
+	ntb->ctx = NULL;
+	ntb->ctx_ops = NULL;
+	spin_lock_init(&ntb->ctx_lock);
+
+	return device_register(&ntb->dev);
+}
+EXPORT_SYMBOL(ntb_register_device);
+
+void ntb_unregister_device(struct ntb_dev *ntb)
+{
+	device_unregister(&ntb->dev);
+	wait_for_completion(&ntb->released);
+}
+EXPORT_SYMBOL(ntb_unregister_device);
+
+int ntb_set_ctx(struct ntb_dev *ntb, void *ctx,
+		const struct ntb_ctx_ops *ctx_ops)
+{
+	unsigned long irqflags;
+
+	if (!ntb_ctx_ops_is_valid(ctx_ops))
+		return -EINVAL;
+	if (ntb->ctx_ops)
+		return -EINVAL;
+
+	spin_lock_irqsave(&ntb->ctx_lock, irqflags);
+	{
+		ntb->ctx = ctx;
+		ntb->ctx_ops = ctx_ops;
+	}
+	spin_unlock_irqrestore(&ntb->ctx_lock, irqflags);
+
+	return 0;
+}
+EXPORT_SYMBOL(ntb_set_ctx);
+
+void ntb_clear_ctx(struct ntb_dev *ntb)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&ntb->ctx_lock, irqflags);
+	{
+		ntb->ctx_ops = NULL;
+		ntb->ctx = NULL;
+	}
+	spin_unlock_irqrestore(&ntb->ctx_lock, irqflags);
+}
+EXPORT_SYMBOL(ntb_clear_ctx);
+
+void ntb_link_event(struct ntb_dev *ntb)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&ntb->ctx_lock, irqflags);
+	{
+		if (ntb->ctx_ops && ntb->ctx_ops->link_event)
+			ntb->ctx_ops->link_event(ntb->ctx);
+	}
+	spin_unlock_irqrestore(&ntb->ctx_lock, irqflags);
+}
+EXPORT_SYMBOL(ntb_link_event);
+
+void ntb_db_event(struct ntb_dev *ntb, int vector)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&ntb->ctx_lock, irqflags);
+	{
+		if (ntb->ctx_ops && ntb->ctx_ops->db_event)
+			ntb->ctx_ops->db_event(ntb->ctx, vector);
+	}
+	spin_unlock_irqrestore(&ntb->ctx_lock, irqflags);
+}
+EXPORT_SYMBOL(ntb_db_event);
+
+static int ntb_probe(struct device *dev)
+{
+	struct ntb_dev *ntb;
+	struct ntb_client *client;
+	int rc;
+
+	get_device(dev);
+	ntb = dev_ntb(dev);
+	client = drv_ntb_client(dev->driver);
+
+	rc = client->ops.probe(client, ntb);
+	if (rc)
+		put_device(dev);
+
+	return rc;
+}
+
+static int ntb_remove(struct device *dev)
+{
+	struct ntb_dev *ntb;
+	struct ntb_client *client;
+
+	if (dev->driver) {
+		ntb = dev_ntb(dev);
+		client = drv_ntb_client(dev->driver);
+
+		client->ops.remove(client, ntb);
+		put_device(dev);
+	}
+
+	return 0;
+}
+
+static void ntb_dev_release(struct device *dev)
+{
+	struct ntb_dev *ntb = dev_ntb(dev);
+
+	complete(&ntb->released);
+}
+
+static struct bus_type ntb_bus = {
+	.name = "ntb",
+	.probe = ntb_probe,
+	.remove = ntb_remove,
+};
+
+static int __init ntb_driver_init(void)
+{
+	return bus_register(&ntb_bus);
+}
+module_init(ntb_driver_init);
+
+static void __exit ntb_driver_exit(void)
+{
+	bus_unregister(&ntb_bus);
+}
+module_exit(ntb_driver_exit);
+
diff --git a/include/linux/ntb.h b/include/linux/ntb.h
new file mode 100644
index 000000000000..b02f72bb8e32
--- /dev/null
+++ b/include/linux/ntb.h
@@ -0,0 +1,984 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *   General Public License for more details.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copy
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * PCIe NTB Linux driver
+ *
+ * Contact Information:
+ * Allen Hubbe <Allen.Hubbe@emc.com>
+ */
+
+#ifndef _NTB_H_
+#define _NTB_H_
+
+#include <linux/completion.h>
+#include <linux/device.h>
+
+struct ntb_client;
+struct ntb_dev;
+struct pci_dev;
+
+/**
+ * enum ntb_topo - NTB connection topology
+ * @NTB_TOPO_NONE:	Topology is unknown or invalid.
+ * @NTB_TOPO_PRI:	On primary side of local ntb.
+ * @NTB_TOPO_SEC:	On secondary side of remote ntb.
+ * @NTB_TOPO_B2B_USD:	On primary side of local ntb upstream of remote ntb.
+ * @NTB_TOPO_B2B_DSD:	On primary side of local ntb downstream of remote ntb.
+ */
+enum ntb_topo {
+	NTB_TOPO_NONE = -1,
+	NTB_TOPO_PRI,
+	NTB_TOPO_SEC,
+	NTB_TOPO_B2B_USD,
+	NTB_TOPO_B2B_DSD,
+};
+
+static inline int ntb_topo_is_b2b(enum ntb_topo topo)
+{
+	switch ((int)topo) {
+	case NTB_TOPO_B2B_USD:
+	case NTB_TOPO_B2B_DSD:
+		return 1;
+	}
+	return 0;
+}
+
+static inline char *ntb_topo_string(enum ntb_topo topo)
+{
+	switch (topo) {
+	case NTB_TOPO_NONE:	return "NTB_TOPO_NONE";
+	case NTB_TOPO_PRI:	return "NTB_TOPO_PRI";
+	case NTB_TOPO_SEC:	return "NTB_TOPO_SEC";
+	case NTB_TOPO_B2B_USD:	return "NTB_TOPO_B2B_USD";
+	case NTB_TOPO_B2B_DSD:	return "NTB_TOPO_B2B_DSD";
+	}
+	return "NTB_TOPO_INVALID";
+}
+
+/**
+ * enum ntb_speed - NTB link training speed
+ * @NTB_SPEED_AUTO:	Request the max supported speed.
+ * @NTB_SPEED_NONE:	Link is not trained to any speed.
+ * @NTB_SPEED_GEN1:	Link is trained to gen1 speed.
+ * @NTB_SPEED_GEN2:	Link is trained to gen2 speed.
+ * @NTB_SPEED_GEN3:	Link is trained to gen3 speed.
+ */
+enum ntb_speed {
+	NTB_SPEED_AUTO = -1,
+	NTB_SPEED_NONE = 0,
+	NTB_SPEED_GEN1 = 1,
+	NTB_SPEED_GEN2 = 2,
+	NTB_SPEED_GEN3 = 3,
+};
+
+/**
+ * enum ntb_width - NTB link training width
+ * @NTB_WIDTH_AUTO:	Request the max supported width.
+ * @NTB_WIDTH_NONE:	Link is not trained to any width.
+ * @NTB_WIDTH_1:	Link is trained to 1 lane width.
+ * @NTB_WIDTH_2:	Link is trained to 2 lane width.
+ * @NTB_WIDTH_4:	Link is trained to 4 lane width.
+ * @NTB_WIDTH_8:	Link is trained to 8 lane width.
+ * @NTB_WIDTH_12:	Link is trained to 12 lane width.
+ * @NTB_WIDTH_16:	Link is trained to 16 lane width.
+ * @NTB_WIDTH_32:	Link is trained to 32 lane width.
+ */
+enum ntb_width {
+	NTB_WIDTH_AUTO = -1,
+	NTB_WIDTH_NONE = 0,
+	NTB_WIDTH_1 = 1,
+	NTB_WIDTH_2 = 2,
+	NTB_WIDTH_4 = 4,
+	NTB_WIDTH_8 = 8,
+	NTB_WIDTH_12 = 12,
+	NTB_WIDTH_16 = 16,
+	NTB_WIDTH_32 = 32,
+};
+
+/**
+ * struct ntb_client_ops - ntb client operations
+ * @probe:		Notify client of a new device.
+ * @remove:		Notify client to remove a device.
+ */
+struct ntb_client_ops {
+	int (*probe)(struct ntb_client *client, struct ntb_dev *ntb);
+	void (*remove)(struct ntb_client *client, struct ntb_dev *ntb);
+};
+
+static inline int ntb_client_ops_is_valid(const struct ntb_client_ops *ops)
+{
+	/* commented callbacks are not required: */
+	return
+		ops->probe			&&
+		ops->remove			&&
+		1;
+}
+
+/**
+ * struct ntb_ctx_ops - ntb driver context operations
+ * @link_event:		See ntb_link_event().
+ * @db_event:		See ntb_db_event().
+ */
+struct ntb_ctx_ops {
+	void (*link_event)(void *ctx);
+	void (*db_event)(void *ctx, int db_vector);
+};
+
+static inline int ntb_ctx_ops_is_valid(const struct ntb_ctx_ops *ops)
+{
+	/* commented callbacks are not required: */
+	return
+		/* ops->link_event		&& */
+		/* ops->db_event		&& */
+		1;
+}
+
+/**
+ * struct ntb_ctx_ops - ntb device operations
+ * @mw_count:		See ntb_mw_count().
+ * @mw_get_range:	See ntb_mw_get_range().
+ * @mw_set_trans:	See ntb_mw_set_trans().
+ * @mw_clear_trans:	See ntb_mw_clear_trans().
+ * @link_is_up:		See ntb_link_is_up().
+ * @link_enable:	See ntb_link_enable().
+ * @link_disable:	See ntb_link_disable().
+ * @db_is_unsafe:	See ntb_db_is_unsafe().
+ * @db_valid_mask:	See ntb_db_valid_mask().
+ * @db_vector_count:	See ntb_db_vector_count().
+ * @db_vector_mask:	See ntb_db_vector_mask().
+ * @db_read:		See ntb_db_read().
+ * @db_set:		See ntb_db_set().
+ * @db_clear:		See ntb_db_clear().
+ * @db_read_mask:	See ntb_db_read_mask().
+ * @db_set_mask:	See ntb_db_set_mask().
+ * @db_clear_mask:	See ntb_db_clear_mask().
+ * @peer_db_addr:	See ntb_peer_db_addr().
+ * @peer_db_read:	See ntb_peer_db_read().
+ * @peer_db_set:	See ntb_peer_db_set().
+ * @peer_db_clear:	See ntb_peer_db_clear().
+ * @peer_db_read_mask:	See ntb_peer_db_read_mask().
+ * @peer_db_set_mask:	See ntb_peer_db_set_mask().
+ * @peer_db_clear_mask:	See ntb_peer_db_clear_mask().
+ * @spad_is_unsafe:	See ntb_spad_is_unsafe().
+ * @spad_count:		See ntb_spad_count().
+ * @spad_read:		See ntb_spad_read().
+ * @spad_write:		See ntb_spad_write().
+ * @peer_spad_addr:	See ntb_peer_spad_addr().
+ * @peer_spad_read:	See ntb_peer_spad_read().
+ * @peer_spad_write:	See ntb_peer_spad_write().
+ */
+struct ntb_dev_ops {
+	int (*mw_count)(struct ntb_dev *ntb);
+	int (*mw_get_range)(struct ntb_dev *ntb, int idx,
+			    phys_addr_t *base, resource_size_t *size,
+			resource_size_t *align, resource_size_t *align_size);
+	int (*mw_set_trans)(struct ntb_dev *ntb, int idx,
+			    dma_addr_t addr, resource_size_t size);
+	int (*mw_clear_trans)(struct ntb_dev *ntb, int idx);
+
+	int (*link_is_up)(struct ntb_dev *ntb,
+			  enum ntb_speed *speed, enum ntb_width *width);
+	int (*link_enable)(struct ntb_dev *ntb,
+			   enum ntb_speed max_speed, enum ntb_width max_width);
+	int (*link_disable)(struct ntb_dev *ntb);
+
+	int (*db_is_unsafe)(struct ntb_dev *ntb);
+	u64 (*db_valid_mask)(struct ntb_dev *ntb);
+	int (*db_vector_count)(struct ntb_dev *ntb);
+	u64 (*db_vector_mask)(struct ntb_dev *ntb, int db_vector);
+
+	u64 (*db_read)(struct ntb_dev *ntb);
+	int (*db_set)(struct ntb_dev *ntb, u64 db_bits);
+	int (*db_clear)(struct ntb_dev *ntb, u64 db_bits);
+
+	u64 (*db_read_mask)(struct ntb_dev *ntb);
+	int (*db_set_mask)(struct ntb_dev *ntb, u64 db_bits);
+	int (*db_clear_mask)(struct ntb_dev *ntb, u64 db_bits);
+
+	int (*peer_db_addr)(struct ntb_dev *ntb,
+			    phys_addr_t *db_addr, resource_size_t *db_size);
+	u64 (*peer_db_read)(struct ntb_dev *ntb);
+	int (*peer_db_set)(struct ntb_dev *ntb, u64 db_bits);
+	int (*peer_db_clear)(struct ntb_dev *ntb, u64 db_bits);
+
+	u64 (*peer_db_read_mask)(struct ntb_dev *ntb);
+	int (*peer_db_set_mask)(struct ntb_dev *ntb, u64 db_bits);
+	int (*peer_db_clear_mask)(struct ntb_dev *ntb, u64 db_bits);
+
+	int (*spad_is_unsafe)(struct ntb_dev *ntb);
+	int (*spad_count)(struct ntb_dev *ntb);
+
+	u32 (*spad_read)(struct ntb_dev *ntb, int idx);
+	int (*spad_write)(struct ntb_dev *ntb, int idx, u32 val);
+
+	int (*peer_spad_addr)(struct ntb_dev *ntb, int idx,
+			      phys_addr_t *spad_addr);
+	u32 (*peer_spad_read)(struct ntb_dev *ntb, int idx);
+	int (*peer_spad_write)(struct ntb_dev *ntb, int idx, u32 val);
+};
+
+static inline int ntb_dev_ops_is_valid(const struct ntb_dev_ops *ops)
+{
+	/* commented callbacks are not required: */
+	return
+		ops->mw_count				&&
+		ops->mw_get_range			&&
+		ops->mw_set_trans			&&
+		/* ops->mw_clear_trans			&& */
+		ops->link_is_up				&&
+		ops->link_enable			&&
+		ops->link_disable			&&
+		/* ops->db_is_unsafe			&& */
+		ops->db_valid_mask			&&
+
+		/* both set, or both unset */
+		(!ops->db_vector_count == !ops->db_vector_mask) &&
+
+		ops->db_read				&&
+		/* ops->db_set				&& */
+		ops->db_clear				&&
+		/* ops->db_read_mask			&& */
+		ops->db_set_mask			&&
+		ops->db_clear_mask			&&
+		ops->peer_db_addr			&&
+		/* ops->peer_db_read			&& */
+		ops->peer_db_set			&&
+		/* ops->peer_db_clear			&& */
+		/* ops->peer_db_read_mask		&& */
+		/* ops->peer_db_set_mask		&& */
+		/* ops->peer_db_clear_mask		&& */
+		/* ops->spad_is_unsafe			&& */
+		ops->spad_count				&&
+		ops->spad_read				&&
+		ops->spad_write				&&
+		ops->peer_spad_addr			&&
+		/* ops->peer_spad_read			&& */
+		ops->peer_spad_write			&&
+		1;
+}
+
+/**
+ * struct ntb_client - client interested in ntb devices
+ * @drv:		Linux driver object.
+ * @ops:		See &ntb_client_ops.
+ */
+struct ntb_client {
+	struct device_driver		drv;
+	const struct ntb_client_ops	ops;
+};
+
+#define drv_ntb_client(__drv) container_of((__drv), struct ntb_client, drv)
+
+/**
+ * struct ntb_device - ntb device
+ * @dev:		Linux device object.
+ * @pdev:		Pci device entry of the ntb.
+ * @topo:		Detected topology of the ntb.
+ * @ops:		See &ntb_dev_ops.
+ * @ctx:		See &ntb_ctx_ops.
+ * @ctx_ops:		See &ntb_ctx_ops.
+ */
+struct ntb_dev {
+	struct device			dev;
+	struct pci_dev			*pdev;
+	enum ntb_topo			topo;
+	const struct ntb_dev_ops	*ops;
+	void				*ctx;
+	const struct ntb_ctx_ops	*ctx_ops;
+
+	/* private: */
+
+	/* synchronize setting, clearing, and calling ctx_ops */
+	spinlock_t			ctx_lock;
+	/* block unregister until device is fully released */
+	struct completion		released;
+};
+
+#define dev_ntb(__dev) container_of((__dev), struct ntb_dev, dev)
+
+/**
+ * ntb_register_client() - register a client for interest in ntb devices
+ * @client:	Client context.
+ *
+ * The client will be added to the list of clients interested in ntb devices.
+ * The client will be notified of any ntb devices that are not already
+ * associated with a client, or if ntb devices are registered later.
+ *
+ * Return: Zero if the client is registered, otherwise an error number.
+ */
+#define ntb_register_client(client) \
+	__ntb_register_client((client), THIS_MODULE, KBUILD_MODNAME)
+
+int __ntb_register_client(struct ntb_client *client, struct module *mod,
+			  const char *mod_name);
+
+/**
+ * ntb_unregister_client() - unregister a client for interest in ntb devices
+ * @client:	Client context.
+ *
+ * The client will be removed from the list of clients interested in ntb
+ * devices.  If any ntb devices are associated with the client, the client will
+ * be notified to remove those devices.
+ */
+void ntb_unregister_client(struct ntb_client *client);
+
+#define module_ntb_client(__ntb_client) \
+	module_driver(__ntb_client, ntb_register_client, \
+			ntb_unregister_client)
+
+/**
+ * ntb_register_device() - register a ntb device
+ * @ntb:	NTB device context.
+ *
+ * The device will be added to the list of ntb devices.  If any clients are
+ * interested in ntb devices, each client will be notified of the ntb device,
+ * until at most one client accepts the device.
+ *
+ * Return: Zero if the device is registered, otherwise an error number.
+ */
+int ntb_register_device(struct ntb_dev *ntb);
+
+/**
+ * ntb_register_device() - unregister a ntb device
+ * @ntb:	NTB device context.
+ *
+ * The device will be removed from the list of ntb devices.  If the ntb device
+ * is associated with a client, the client will be notified to remove the
+ * device.
+ */
+void ntb_unregister_device(struct ntb_dev *ntb);
+
+/**
+ * ntb_set_ctx() - associate a driver context with an ntb device
+ * @ntb:	NTB device context.
+ * @ctx:	Driver context.
+ * @ctx_ops:	Driver context operations.
+ *
+ * Associate a driver context and operations with a ntb device.  The context is
+ * provided by the client driver, and the driver may associate a different
+ * context with each ntb device.
+ *
+ * Return: Zero if the context is associated, otherwise an error number.
+ */
+int ntb_set_ctx(struct ntb_dev *ntb, void *ctx,
+		const struct ntb_ctx_ops *ctx_ops);
+
+/**
+ * ntb_clear_ctx() - disassociate any driver context from an ntb device
+ * @ntb:	NTB device context.
+ *
+ * Clear any association that may exist between a driver context and the ntb
+ * device.
+ */
+void ntb_clear_ctx(struct ntb_dev *ntb);
+
+/**
+ * ntb_link_event() - notify driver context of a change in link status
+ * @ntb:	NTB device context.
+ *
+ * Notify the driver context that the link status may have changed.  The driver
+ * should call ntb_link_is_up() to get the current status.
+ */
+void ntb_link_event(struct ntb_dev *ntb);
+
+/**
+ * ntb_db_event() - notify driver context of a doorbell event
+ * @ntb:	NTB device context.
+ * @vector:	Interrupt vector number.
+ *
+ * Notify the driver context of a doorbell event.  If hardware supports
+ * multiple interrupt vectors for doorbells, the vector number indicates which
+ * vector received the interrupt.  The vector number is relative to the first
+ * vector used for doorbells, starting at zero, and must be less than
+ ** ntb_db_vector_count().  The driver may call ntb_db_read() to check which
+ * doorbell bits need service, and ntb_db_vector_mask() to determine which of
+ * those bits are associated with the vector number.
+ */
+void ntb_db_event(struct ntb_dev *ntb, int vector);
+
+/**
+ * ntb_mw_count() - get the number of memory windows
+ * @ntb:	NTB device context.
+ *
+ * Hardware and topology may support a different number of memory windows.
+ *
+ * Return: the number of memory windows.
+ */
+static inline int ntb_mw_count(struct ntb_dev *ntb)
+{
+	return ntb->ops->mw_count(ntb);
+}
+
+/**
+ * ntb_mw_get_range() - get the range of a memory window
+ * @ntb:	NTB device context.
+ * @idx:	Memory window number.
+ * @base:	OUT - the base address for mapping the memory window
+ * @size:	OUT - the size for mapping the memory window
+ * @align:	OUT - the base alignment for translating the memory window
+ * @align_size:	OUT - the size alignment for translating the memory window
+ *
+ * Get the range of a memory window.  NULL may be given for any output
+ * parameter if the value is not needed.  The base and size may be used for
+ * mapping the memory window, to access the peer memory.  The alignment and
+ * size may be used for translating the memory window, for the peer to access
+ * memory on the local system.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_mw_get_range(struct ntb_dev *ntb, int idx,
+				   phys_addr_t *base, resource_size_t *size,
+		resource_size_t *align, resource_size_t *align_size)
+{
+	return ntb->ops->mw_get_range(ntb, idx, base, size,
+			align, align_size);
+}
+
+/**
+ * ntb_mw_set_trans() - set the translation of a memory window
+ * @ntb:	NTB device context.
+ * @idx:	Memory window number.
+ * @addr:	The dma address local memory to expose to the peer.
+ * @size:	The size of the local memory to expose to the peer.
+ *
+ * Set the translation of a memory window.  The peer may access local memory
+ * through the window starting at the address, up to the size.  The address
+ * must be aligned to the alignment specified by ntb_mw_get_range().  The size
+ * must be aligned to the size alignment specified by ntb_mw_get_range().
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_mw_set_trans(struct ntb_dev *ntb, int idx,
+				   dma_addr_t addr, resource_size_t size)
+{
+	return ntb->ops->mw_set_trans(ntb, idx, addr, size);
+}
+
+/**
+ * ntb_mw_clear_trans() - clear the translation of a memory window
+ * @ntb:	NTB device context.
+ * @idx:	Memory window number.
+ *
+ * Clear the translation of a memory window.  The peer may no longer access
+ * local memory through the window.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_mw_clear_trans(struct ntb_dev *ntb, int idx)
+{
+	if (!ntb->ops->mw_clear_trans)
+		return ntb->ops->mw_set_trans(ntb, idx, 0, 0);
+
+	return ntb->ops->mw_clear_trans(ntb, idx);
+}
+
+/**
+ * ntb_link_is_up() - get the current ntb link state
+ * @ntb:	NTB device context.
+ * @speed:	OUT - The link speed expressed as PCIe generation number.
+ * @width:	OUT - The link width expressed as the number of PCIe lanes.
+ *
+ * Set the translation of a memory window.  The peer may access local memory
+ * through the window starting at the address, up to the size.  The address
+ * must be aligned to the alignment specified by ntb_mw_get_range().  The size
+ * must be aligned to the size alignment specified by ntb_mw_get_range().
+ *
+ * Return: One if the link is up, zero if the link is down, otherwise a
+ *		negative value indicating the error number.
+ */
+static inline int ntb_link_is_up(struct ntb_dev *ntb,
+				 enum ntb_speed *speed, enum ntb_width *width)
+{
+	return ntb->ops->link_is_up(ntb, speed, width);
+}
+
+/**
+ * ntb_link_enable() - enable the link on the secondary side of the ntb
+ * @ntb:	NTB device context.
+ * @max_speed:	The maximum link speed expressed as PCIe generation number.
+ * @max_width:	The maximum link width expressed as the number of PCIe lanes.
+ *
+ * Enable the link on the secondary side of the ntb.  This can only be done
+ * from the primary side of the ntb in primary or b2b topology.  The ntb device
+ * should train the link to its maximum speed and width, or the requested speed
+ * and width, whichever is smaller, if supported.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_link_enable(struct ntb_dev *ntb,
+				  enum ntb_speed max_speed,
+				  enum ntb_width max_width)
+{
+	return ntb->ops->link_enable(ntb, max_speed, max_width);
+}
+
+/**
+ * ntb_link_disable() - disable the link on the secondary side of the ntb
+ * @ntb:	NTB device context.
+ *
+ * Disable the link on the secondary side of the ntb.  This can only be
+ * done from the primary side of the ntb in primary or b2b topology.  The ntb
+ * device should disable the link.  Returning from this call must indicate that
+ * a barrier has passed, though with no more writes may pass in either
+ * direction across the link, except if this call returns an error number.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_link_disable(struct ntb_dev *ntb)
+{
+	return ntb->ops->link_disable(ntb);
+}
+
+/**
+ * ntb_db_is_unsafe() - check if it is safe to use hardware doorbell
+ * @ntb:	NTB device context.
+ *
+ * It is possible for some ntb hardware to be affected by errata.  Hardware
+ * drivers can advise clients to avoid using doorbells.  Clients may ignore
+ * this advice, though caution is recommended.
+ *
+ * Return: Zero if it is safe to use doorbells, or One if it is not safe.
+ */
+static inline int ntb_db_is_unsafe(struct ntb_dev *ntb)
+{
+	if (!ntb->ops->db_is_unsafe)
+		return 0;
+
+	return ntb->ops->db_is_unsafe(ntb);
+}
+
+/**
+ * ntb_db_valid_mask() - get a mask of doorbell bits supported by the ntb
+ * @ntb:	NTB device context.
+ *
+ * Hardware may support different number or arrangement of doorbell bits.
+ *
+ * Return: A mask of doorbell bits supported by the ntb.
+ */
+static inline u64 ntb_db_valid_mask(struct ntb_dev *ntb)
+{
+	return ntb->ops->db_valid_mask(ntb);
+}
+
+/**
+ * ntb_db_vector_count() - get the number of doorbell interrupt vectors
+ * @ntb:	NTB device context.
+ *
+ * Hardware may support different number of interrupt vectors.
+ *
+ * Return: The number of doorbell interrupt vectors.
+ */
+static inline int ntb_db_vector_count(struct ntb_dev *ntb)
+{
+	if (!ntb->ops->db_vector_count)
+		return 1;
+
+	return ntb->ops->db_vector_count(ntb);
+}
+
+/**
+ * ntb_db_vector_mask() - get a mask of doorbell bits serviced by a vector
+ * @ntb:	NTB device context.
+ * @vector:	Doorbell vector number.
+ *
+ * Each interrupt vector may have a different number or arrangement of bits.
+ *
+ * Return: A mask of doorbell bits serviced by a vector.
+ */
+static inline u64 ntb_db_vector_mask(struct ntb_dev *ntb, int vector)
+{
+	if (!ntb->ops->db_vector_mask)
+		return ntb_db_valid_mask(ntb);
+
+	return ntb->ops->db_vector_mask(ntb, vector);
+}
+
+/**
+ * ntb_db_read() - read the local doorbell register
+ * @ntb:	NTB device context.
+ *
+ * Read the local doorbell register, and return the bits that are set.
+ *
+ * Return: The bits currently set in the local doorbell register.
+ */
+static inline u64 ntb_db_read(struct ntb_dev *ntb)
+{
+	return ntb->ops->db_read(ntb);
+}
+
+/**
+ * ntb_db_set() - set bits in the local doorbell register
+ * @ntb:	NTB device context.
+ * @db_bits:	Doorbell bits to set.
+ *
+ * Set bits in the local doorbell register, which may generate a local doorbell
+ * interrupt.  Bits that were already set must remain set.
+ *
+ * This is unusual, and hardware may not support it.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_db_set(struct ntb_dev *ntb, u64 db_bits)
+{
+	if (!ntb->ops->db_set)
+		return -EINVAL;
+
+	return ntb->ops->db_set(ntb, db_bits);
+}
+
+/**
+ * ntb_db_clear() - clear bits in the local doorbell register
+ * @ntb:	NTB device context.
+ * @db_bits:	Doorbell bits to clear.
+ *
+ * Clear bits in the local doorbell register, arming the bits for the next
+ * doorbell.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_db_clear(struct ntb_dev *ntb, u64 db_bits)
+{
+	return ntb->ops->db_clear(ntb, db_bits);
+}
+
+/**
+ * ntb_db_read_mask() - read the local doorbell mask
+ * @ntb:	NTB device context.
+ *
+ * Read the local doorbell mask register, and return the bits that are set.
+ *
+ * This is unusual, though hardware is likely to support it.
+ *
+ * Return: The bits currently set in the local doorbell mask register.
+ */
+static inline u64 ntb_db_read_mask(struct ntb_dev *ntb)
+{
+	if (!ntb->ops->db_read_mask)
+		return 0;
+
+	return ntb->ops->db_read_mask(ntb);
+}
+
+/**
+ * ntb_db_set_mask() - set bits in the local doorbell mask
+ * @ntb:	NTB device context.
+ * @db_bits:	Doorbell mask bits to set.
+ *
+ * Set bits in the local doorbell mask register, preventing doorbell interrupts
+ * from being generated for those doorbell bits.  Bits that were already set
+ * must remain set.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_db_set_mask(struct ntb_dev *ntb, u64 db_bits)
+{
+	return ntb->ops->db_set_mask(ntb, db_bits);
+}
+
+/**
+ * ntb_db_clear_mask() - clear bits in the local doorbell mask
+ * @ntb:	NTB device context.
+ * @db_bits:	Doorbell bits to clear.
+ *
+ * Clear bits in the local doorbell mask register, allowing doorbell interrupts
+ * from being generated for those doorbell bits.  If a doorbell bit is already
+ * set at the time the mask is cleared, and the corresponding mask bit is
+ * changed from set to clear, then the ntb driver must ensure that
+ * ntb_db_event() is called.  If the hardware does not generate the interrupt
+ * on clearing the mask bit, then the driver must call ntb_db_event() anyway.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits)
+{
+	return ntb->ops->db_clear_mask(ntb, db_bits);
+}
+
+/**
+ * ntb_peer_db_addr() - address and size of the peer doorbell register
+ * @ntb:	NTB device context.
+ * @db_addr:	OUT - The address of the peer doorbell register.
+ * @db_size:	OUT - The number of bytes to write the peer doorbell register.
+ *
+ * Return the address of the peer doorbell register.  This may be used, for
+ * example, by drivers that offload memory copy operations to a dma engine.
+ * The drivers may wish to ring the peer doorbell at the completion of memory
+ * copy operations.  For efficiency, and to simplify ordering of operations
+ * between the dma memory copies and the ringing doorbell, the driver may
+ * append one additional dma memory copy with the doorbell register as the
+ * destination, after the memory copy operations.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_peer_db_addr(struct ntb_dev *ntb,
+				   phys_addr_t *db_addr,
+				   resource_size_t *db_size)
+{
+	return ntb->ops->peer_db_addr(ntb, db_addr, db_size);
+}
+
+/**
+ * ntb_peer_db_read() - read the peer doorbell register
+ * @ntb:	NTB device context.
+ *
+ * Read the peer doorbell register, and return the bits that are set.
+ *
+ * This is unusual, and hardware may not support it.
+ *
+ * Return: The bits currently set in the peer doorbell register.
+ */
+static inline u64 ntb_peer_db_read(struct ntb_dev *ntb)
+{
+	if (!ntb->ops->peer_db_read)
+		return 0;
+
+	return ntb->ops->peer_db_read(ntb);
+}
+
+/**
+ * ntb_peer_db_set() - set bits in the peer doorbell register
+ * @ntb:	NTB device context.
+ * @db_bits:	Doorbell bits to set.
+ *
+ * Set bits in the peer doorbell register, which may generate a peer doorbell
+ * interrupt.  Bits that were already set must remain set.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_peer_db_set(struct ntb_dev *ntb, u64 db_bits)
+{
+	return ntb->ops->peer_db_set(ntb, db_bits);
+}
+
+/**
+ * ntb_peer_db_clear() - clear bits in the local doorbell register
+ * @ntb:	NTB device context.
+ * @db_bits:	Doorbell bits to clear.
+ *
+ * Clear bits in the peer doorbell register, arming the bits for the next
+ * doorbell.
+ *
+ * This is unusual, and hardware may not support it.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_peer_db_clear(struct ntb_dev *ntb, u64 db_bits)
+{
+	if (!ntb->ops->db_clear)
+		return -EINVAL;
+
+	return ntb->ops->peer_db_clear(ntb, db_bits);
+}
+
+/**
+ * ntb_peer_db_read_mask() - read the peer doorbell mask
+ * @ntb:	NTB device context.
+ *
+ * Read the peer doorbell mask register, and return the bits that are set.
+ *
+ * This is unusual, and hardware may not support it.
+ *
+ * Return: The bits currently set in the peer doorbell mask register.
+ */
+static inline u64 ntb_peer_db_read_mask(struct ntb_dev *ntb)
+{
+	if (!ntb->ops->db_read_mask)
+		return 0;
+
+	return ntb->ops->peer_db_read_mask(ntb);
+}
+
+/**
+ * ntb_peer_db_set_mask() - set bits in the peer doorbell mask
+ * @ntb:	NTB device context.
+ * @db_bits:	Doorbell mask bits to set.
+ *
+ * Set bits in the peer doorbell mask register, preventing doorbell interrupts
+ * from being generated for those doorbell bits.  Bits that were already set
+ * must remain set.
+ *
+ * This is unusual, and hardware may not support it.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_peer_db_set_mask(struct ntb_dev *ntb, u64 db_bits)
+{
+	if (!ntb->ops->db_set_mask)
+		return -EINVAL;
+
+	return ntb->ops->peer_db_set_mask(ntb, db_bits);
+}
+
+/**
+ * ntb_peer_db_clear_mask() - clear bits in the peer doorbell mask
+ * @ntb:	NTB device context.
+ * @db_bits:	Doorbell bits to clear.
+ *
+ * Clear bits in the peer doorbell mask register, allowing doorbell interrupts
+ * from being generated for those doorbell bits.  If the hardware does not
+ * generate the interrupt on clearing the mask bit, then the driver should not
+ * implement this function!
+ *
+ * This is unusual, and hardware may not support it.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_peer_db_clear_mask(struct ntb_dev *ntb, u64 db_bits)
+{
+	if (!ntb->ops->db_clear_mask)
+		return -EINVAL;
+
+	return ntb->ops->peer_db_clear_mask(ntb, db_bits);
+}
+
+/**
+ * ntb_spad_is_unsafe() - check if it is safe to use the hardware scratchpads
+ * @ntb:	NTB device context.
+ *
+ * It is possible for some ntb hardware to be affected by errata.  Hardware
+ * drivers can advise clients to avoid using scratchpads.  Clients may ignore
+ * this advice, though caution is recommended.
+ *
+ * Return: Zero if it is safe to use scratchpads, or One if it is not safe.
+ */
+static inline int ntb_spad_is_unsafe(struct ntb_dev *ntb)
+{
+	if (!ntb->ops->spad_is_unsafe)
+		return 0;
+
+	return ntb->ops->spad_is_unsafe(ntb);
+}
+
+/**
+ * ntb_mw_count() - get the number of scratchpads
+ * @ntb:	NTB device context.
+ *
+ * Hardware and topology may support a different number of scratchpads.
+ *
+ * Return: the number of scratchpads.
+ */
+static inline int ntb_spad_count(struct ntb_dev *ntb)
+{
+	return ntb->ops->spad_count(ntb);
+}
+
+/**
+ * ntb_spad_read() - read the local scratchpad register
+ * @ntb:	NTB device context.
+ * @idx:	Scratchpad index.
+ *
+ * Read the local scratchpad register, and return the value.
+ *
+ * Return: The value of the local scratchpad register.
+ */
+static inline u32 ntb_spad_read(struct ntb_dev *ntb, int idx)
+{
+	return ntb->ops->spad_read(ntb, idx);
+}
+
+/**
+ * ntb_spad_write() - write the local scratchpad register
+ * @ntb:	NTB device context.
+ * @idx:	Scratchpad index.
+ * @val:	Scratchpad value.
+ *
+ * Write the value to the local scratchpad register.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_spad_write(struct ntb_dev *ntb, int idx, u32 val)
+{
+	return ntb->ops->spad_write(ntb, idx, val);
+}
+
+/**
+ * ntb_peer_spad_addr() - address of the peer scratchpad register
+ * @ntb:	NTB device context.
+ * @idx:	Scratchpad index.
+ * @spad_addr:	OUT - The address of the peer scratchpad register.
+ *
+ * Return the address of the peer doorbell register.  This may be used, for
+ * example, by drivers that offload memory copy operations to a dma engine.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_peer_spad_addr(struct ntb_dev *ntb, int idx,
+				     phys_addr_t *spad_addr)
+{
+	return ntb->ops->peer_spad_addr(ntb, idx, spad_addr);
+}
+
+/**
+ * ntb_peer_spad_read() - read the peer scratchpad register
+ * @ntb:	NTB device context.
+ * @idx:	Scratchpad index.
+ *
+ * Read the peer scratchpad register, and return the value.
+ *
+ * Return: The value of the local scratchpad register.
+ */
+static inline u32 ntb_peer_spad_read(struct ntb_dev *ntb, int idx)
+{
+	return ntb->ops->peer_spad_read(ntb, idx);
+}
+
+/**
+ * ntb_peer_spad_write() - write the peer scratchpad register
+ * @ntb:	NTB device context.
+ * @idx:	Scratchpad index.
+ * @val:	Scratchpad value.
+ *
+ * Write the value to the peer scratchpad register.
+ *
+ * Return: Zero on success, otherwise an error number.
+ */
+static inline int ntb_peer_spad_write(struct ntb_dev *ntb, int idx, u32 val)
+{
+	return ntb->ops->peer_spad_write(ntb, idx, val);
+}
+
+#endif
-- 
cgit v1.2.3


From e26a5843f7f5014ae4460030ca4de029a3ac35d3 Mon Sep 17 00:00:00 2001
From: Allen Hubbe <Allen.Hubbe@emc.com>
Date: Thu, 9 Apr 2015 10:33:20 -0400
Subject: NTB: Split ntb_hw_intel and ntb_transport drivers

Change ntb_hw_intel to use the new NTB hardware abstraction layer.

Split ntb_transport into its own driver.  Change it to use the new NTB
hardware abstraction layer.

Signed-off-by: Allen Hubbe <Allen.Hubbe@emc.com>
Signed-off-by: Jon Mason <jdmason@kudzu.us>
---
 Documentation/ntb.txt               |   26 +
 MAINTAINERS                         |    8 +
 drivers/net/ntb_netdev.c            |   54 +-
 drivers/ntb/Kconfig                 |   37 +-
 drivers/ntb/Makefile                |    6 +-
 drivers/ntb/hw/Kconfig              |    1 +
 drivers/ntb/hw/Makefile             |    1 +
 drivers/ntb/hw/intel/Kconfig        |    7 +
 drivers/ntb/hw/intel/Makefile       |    1 +
 drivers/ntb/hw/intel/ntb_hw_intel.c | 3075 +++++++++++++++++++----------------
 drivers/ntb/hw/intel/ntb_hw_intel.h |  607 ++++---
 drivers/ntb/ntb_transport.c         |  936 ++++++-----
 include/linux/ntb_transport.h       |   25 +-
 13 files changed, 2589 insertions(+), 2195 deletions(-)
 create mode 100644 drivers/ntb/hw/Kconfig
 create mode 100644 drivers/ntb/hw/Makefile
 create mode 100644 drivers/ntb/hw/intel/Kconfig
 create mode 100644 drivers/ntb/hw/intel/Makefile

(limited to 'include/linux')

diff --git a/Documentation/ntb.txt b/Documentation/ntb.txt
index 9d46dc9712a8..725ba1e6c127 100644
--- a/Documentation/ntb.txt
+++ b/Documentation/ntb.txt
@@ -26,7 +26,33 @@ as ntb hardware, or hardware drivers, are inserted and removed.  The
 registration uses the Linux Device framework, so it should feel familiar to
 anyone who has written a pci driver.
 
+### NTB Transport Client (ntb\_transport) and NTB Netdev (ntb\_netdev)
+
+The primary client for NTB is the Transport client, used in tandem with NTB
+Netdev.  These drivers function together to create a logical link to the peer,
+across the ntb, to exchange packets of network data.  The Transport client
+establishes a logical link to the peer, and creates queue pairs to exchange
+messages and data.  The NTB Netdev then creates an ethernet device using a
+Transport queue pair.  Network data is copied between socket buffers and the
+Transport queue pair buffer.  The Transport client may be used for other things
+besides Netdev, however no other applications have yet been written.
+
 ## NTB Hardware Drivers
 
 NTB hardware drivers should register devices with the NTB core driver.  After
 registering, clients probe and remove functions will be called.
+
+### NTB Intel Hardware Driver (ntb\_hw\_intel)
+
+The Intel hardware driver supports NTB on Xeon and Atom CPUs.
+
+Module Parameters:
+
+* b2b\_mw\_idx - If the peer ntb is to be accessed via a memory window, then use
+	this memory window to access the peer ntb.  A value of zero or positive
+	starts from the first mw idx, and a negative value starts from the last
+	mw idx.  Both sides MUST set the same value here!  The default value is
+	`-1`.
+* b2b\_mw\_share - If the peer ntb is to be accessed via a memory window, and if
+	the memory window is large enough, still allow the client to use the
+	second half of the memory window for address translation to the peer.
diff --git a/MAINTAINERS b/MAINTAINERS
index e2fc9eec2e16..a682805d6d56 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7007,6 +7007,14 @@ F:	drivers/net/ntb_netdev.c
 F:	include/linux/ntb.h
 F:	include/linux/ntb_transport.h
 
+NTB INTEL DRIVER
+M:	Jon Mason <jdmason@kudzu.us>
+M:	Dave Jiang <dave.jiang@intel.com>
+S:	Supported
+W:	https://github.com/jonmason/ntb/wiki
+T:	git git://github.com/jonmason/ntb.git
+F:	drivers/ntb/hw/intel/
+
 NTFS FILESYSTEM
 M:	Anton Altaparmakov <anton@tuxera.com>
 L:	linux-ntfs-dev@lists.sourceforge.net
diff --git a/drivers/net/ntb_netdev.c b/drivers/net/ntb_netdev.c
index 6d3bfa62f5ec..3cc316cb7e6b 100644
--- a/drivers/net/ntb_netdev.c
+++ b/drivers/net/ntb_netdev.c
@@ -5,6 +5,7 @@
  *   GPL LICENSE SUMMARY
  *
  *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
  *
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of version 2 of the GNU General Public License as
@@ -13,6 +14,7 @@
  *   BSD LICENSE
  *
  *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
  *
  *   Redistribution and use in source and binary forms, with or without
  *   modification, are permitted provided that the following conditions
@@ -40,7 +42,7 @@
  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * Intel PCIe NTB Network Linux driver
+ * PCIe NTB Network Linux driver
  *
  * Contact Information:
  * Jon Mason <jon.mason@intel.com>
@@ -49,6 +51,7 @@
 #include <linux/ethtool.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/ntb.h>
 #include <linux/ntb_transport.h>
 
 #define NTB_NETDEV_VER	"0.7"
@@ -70,26 +73,19 @@ struct ntb_netdev {
 
 static LIST_HEAD(dev_list);
 
-static void ntb_netdev_event_handler(void *data, int status)
+static void ntb_netdev_event_handler(void *data, int link_is_up)
 {
 	struct net_device *ndev = data;
 	struct ntb_netdev *dev = netdev_priv(ndev);
 
-	netdev_dbg(ndev, "Event %x, Link %x\n", status,
+	netdev_dbg(ndev, "Event %x, Link %x\n", link_is_up,
 		   ntb_transport_link_query(dev->qp));
 
-	switch (status) {
-	case NTB_LINK_DOWN:
+	if (link_is_up) {
+		if (ntb_transport_link_query(dev->qp))
+			netif_carrier_on(ndev);
+	} else {
 		netif_carrier_off(ndev);
-		break;
-	case NTB_LINK_UP:
-		if (!ntb_transport_link_query(dev->qp))
-			return;
-
-		netif_carrier_on(ndev);
-		break;
-	default:
-		netdev_warn(ndev, "Unsupported event type %d\n", status);
 	}
 }
 
@@ -160,8 +156,6 @@ static netdev_tx_t ntb_netdev_start_xmit(struct sk_buff *skb,
 	struct ntb_netdev *dev = netdev_priv(ndev);
 	int rc;
 
-	netdev_dbg(ndev, "%s: skb len %d\n", __func__, skb->len);
-
 	rc = ntb_transport_tx_enqueue(dev->qp, skb, skb->data, skb->len);
 	if (rc)
 		goto err;
@@ -322,20 +316,26 @@ static const struct ntb_queue_handlers ntb_netdev_handlers = {
 	.event_handler = ntb_netdev_event_handler,
 };
 
-static int ntb_netdev_probe(struct pci_dev *pdev)
+static int ntb_netdev_probe(struct device *client_dev)
 {
+	struct ntb_dev *ntb;
 	struct net_device *ndev;
+	struct pci_dev *pdev;
 	struct ntb_netdev *dev;
 	int rc;
 
-	ndev = alloc_etherdev(sizeof(struct ntb_netdev));
+	ntb = dev_ntb(client_dev->parent);
+	pdev = ntb->pdev;
+	if (!pdev)
+		return -ENODEV;
+
+	ndev = alloc_etherdev(sizeof(*dev));
 	if (!ndev)
 		return -ENOMEM;
 
 	dev = netdev_priv(ndev);
 	dev->ndev = ndev;
 	dev->pdev = pdev;
-	BUG_ON(!dev->pdev);
 	ndev->features = NETIF_F_HIGHDMA;
 
 	ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
@@ -349,7 +349,8 @@ static int ntb_netdev_probe(struct pci_dev *pdev)
 	ndev->netdev_ops = &ntb_netdev_ops;
 	ndev->ethtool_ops = &ntb_ethtool_ops;
 
-	dev->qp = ntb_transport_create_queue(ndev, pdev, &ntb_netdev_handlers);
+	dev->qp = ntb_transport_create_queue(ndev, client_dev,
+					     &ntb_netdev_handlers);
 	if (!dev->qp) {
 		rc = -EIO;
 		goto err;
@@ -372,12 +373,17 @@ err:
 	return rc;
 }
 
-static void ntb_netdev_remove(struct pci_dev *pdev)
+static void ntb_netdev_remove(struct device *client_dev)
 {
+	struct ntb_dev *ntb;
 	struct net_device *ndev;
+	struct pci_dev *pdev;
 	struct ntb_netdev *dev;
 	bool found = false;
 
+	ntb = dev_ntb(client_dev->parent);
+	pdev = ntb->pdev;
+
 	list_for_each_entry(dev, &dev_list, list) {
 		if (dev->pdev == pdev) {
 			found = true;
@@ -396,7 +402,7 @@ static void ntb_netdev_remove(struct pci_dev *pdev)
 	free_netdev(ndev);
 }
 
-static struct ntb_client ntb_netdev_client = {
+static struct ntb_transport_client ntb_netdev_client = {
 	.driver.name = KBUILD_MODNAME,
 	.driver.owner = THIS_MODULE,
 	.probe = ntb_netdev_probe,
@@ -407,7 +413,7 @@ static int __init ntb_netdev_init_module(void)
 {
 	int rc;
 
-	rc = ntb_register_client_dev(KBUILD_MODNAME);
+	rc = ntb_transport_register_client_dev(KBUILD_MODNAME);
 	if (rc)
 		return rc;
 	return ntb_transport_register_client(&ntb_netdev_client);
@@ -417,6 +423,6 @@ module_init(ntb_netdev_init_module);
 static void __exit ntb_netdev_exit_module(void)
 {
 	ntb_transport_unregister_client(&ntb_netdev_client);
-	ntb_unregister_client_dev(KBUILD_MODNAME);
+	ntb_transport_unregister_client_dev(KBUILD_MODNAME);
 }
 module_exit(ntb_netdev_exit_module);
diff --git a/drivers/ntb/Kconfig b/drivers/ntb/Kconfig
index f69df793dbe2..53b042429673 100644
--- a/drivers/ntb/Kconfig
+++ b/drivers/ntb/Kconfig
@@ -1,13 +1,26 @@
-config NTB
-       tristate "Intel Non-Transparent Bridge support"
-       depends on PCI
-       depends on X86
-       help
-        The PCI-E Non-transparent bridge hardware is a point-to-point PCI-E bus
-        connecting 2 systems.  When configured, writes to the device's PCI
-        mapped memory will be mirrored to a buffer on the remote system.  The
-        ntb Linux driver uses this point-to-point communication as a method to
-        transfer data from one system to the other.
-
-        If unsure, say N.
+menuconfig NTB
+	tristate "Non-Transparent Bridge support"
+	depends on PCI
+	help
+	 The PCI-E Non-transparent bridge hardware is a point-to-point PCI-E bus
+	 connecting 2 systems.  When configured, writes to the device's PCI
+	 mapped memory will be mirrored to a buffer on the remote system.  The
+	 ntb Linux driver uses this point-to-point communication as a method to
+	 transfer data from one system to the other.
 
+	 If unsure, say N.
+
+if NTB
+
+source "drivers/ntb/hw/Kconfig"
+
+config NTB_TRANSPORT
+	tristate "NTB Transport Client"
+	help
+	 This is a transport driver that enables connected systems to exchange
+	 messages over the ntb hardware.  The transport exposes a queue pair api
+	 to client drivers.
+
+	 If unsure, say N.
+
+endif # NTB
diff --git a/drivers/ntb/Makefile b/drivers/ntb/Makefile
index 712e953a8fda..b9fa663ecfec 100644
--- a/drivers/ntb/Makefile
+++ b/drivers/ntb/Makefile
@@ -1,4 +1,2 @@
-obj-$(CONFIG_NTB) += ntb.o
-obj-$(CONFIG_NTB) += ntb_hw_intel.o
-
-ntb_hw_intel-objs := hw/intel/ntb_hw_intel.o ntb_transport.o
+obj-$(CONFIG_NTB) += ntb.o hw/
+obj-$(CONFIG_NTB_TRANSPORT) += ntb_transport.o
diff --git a/drivers/ntb/hw/Kconfig b/drivers/ntb/hw/Kconfig
new file mode 100644
index 000000000000..4d5535c4cddf
--- /dev/null
+++ b/drivers/ntb/hw/Kconfig
@@ -0,0 +1 @@
+source "drivers/ntb/hw/intel/Kconfig"
diff --git a/drivers/ntb/hw/Makefile b/drivers/ntb/hw/Makefile
new file mode 100644
index 000000000000..175d7c92a569
--- /dev/null
+++ b/drivers/ntb/hw/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_NTB_INTEL)	+= intel/
diff --git a/drivers/ntb/hw/intel/Kconfig b/drivers/ntb/hw/intel/Kconfig
new file mode 100644
index 000000000000..91f995e33ac6
--- /dev/null
+++ b/drivers/ntb/hw/intel/Kconfig
@@ -0,0 +1,7 @@
+config NTB_INTEL
+	tristate "Intel Non-Transparent Bridge support"
+	depends on X86_64
+	help
+	 This driver supports Intel NTB on capable Xeon and Atom hardware.
+
+	 If unsure, say N.
diff --git a/drivers/ntb/hw/intel/Makefile b/drivers/ntb/hw/intel/Makefile
new file mode 100644
index 000000000000..1b434568d2ad
--- /dev/null
+++ b/drivers/ntb/hw/intel/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_NTB_INTEL) += ntb_hw_intel.o
diff --git a/drivers/ntb/hw/intel/ntb_hw_intel.c b/drivers/ntb/hw/intel/ntb_hw_intel.c
index 044534a995f1..686091756ba4 100644
--- a/drivers/ntb/hw/intel/ntb_hw_intel.c
+++ b/drivers/ntb/hw/intel/ntb_hw_intel.c
@@ -5,6 +5,7 @@
  *   GPL LICENSE SUMMARY
  *
  *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
  *
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of version 2 of the GNU General Public License as
@@ -13,6 +14,7 @@
  *   BSD LICENSE
  *
  *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
  *
  *   Redistribution and use in source and binary forms, with or without
  *   modification, are permitted provided that the following conditions
@@ -45,6 +47,7 @@
  * Contact Information:
  * Jon Mason <jon.mason@intel.com>
  */
+
 #include <linux/debugfs.h>
 #include <linux/delay.h>
 #include <linux/init.h>
@@ -53,99 +56,97 @@
 #include <linux/pci.h>
 #include <linux/random.h>
 #include <linux/slab.h>
+#include <linux/ntb.h>
+
 #include "ntb_hw_intel.h"
 
-#define NTB_NAME	"Intel(R) PCI-E Non-Transparent Bridge Driver"
-#define NTB_VER		"1.0"
+#define NTB_NAME	"ntb_hw_intel"
+#define NTB_DESC	"Intel(R) PCI-E Non-Transparent Bridge Driver"
+#define NTB_VER		"2.0"
 
-MODULE_DESCRIPTION(NTB_NAME);
+MODULE_DESCRIPTION(NTB_DESC);
 MODULE_VERSION(NTB_VER);
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_AUTHOR("Intel Corporation");
 
-enum {
-	NTB_CONN_TRANSPARENT = 0,
-	NTB_CONN_B2B,
-	NTB_CONN_RP,
-};
-
-enum {
-	NTB_DEV_USD = 0,
-	NTB_DEV_DSD,
-};
-
-enum {
-	SNB_HW = 0,
-	BWD_HW,
-};
-
+#define bar0_off(base, bar) ((base) + ((bar) << 2))
+#define bar2_off(base, bar) bar0_off(base, (bar) - 2)
+
+static int b2b_mw_idx = -1;
+module_param(b2b_mw_idx, int, 0644);
+MODULE_PARM_DESC(b2b_mw_idx, "Use this mw idx to access the peer ntb.  A "
+		 "value of zero or positive starts from first mw idx, and a "
+		 "negative value starts from last mw idx.  Both sides MUST "
+		 "set the same value here!");
+
+static unsigned int b2b_mw_share;
+module_param(b2b_mw_share, uint, 0644);
+MODULE_PARM_DESC(b2b_mw_share, "If the b2b mw is large enough, configure the "
+		 "ntb so that the peer ntb only occupies the first half of "
+		 "the mw, so the second half can still be used as a mw.  Both "
+		 "sides MUST set the same value here!");
+
+static const struct intel_ntb_reg bwd_reg;
+static const struct intel_ntb_alt_reg bwd_pri_reg;
+static const struct intel_ntb_alt_reg bwd_sec_reg;
+static const struct intel_ntb_alt_reg bwd_b2b_reg;
+static const struct intel_ntb_xlat_reg bwd_pri_xlat;
+static const struct intel_ntb_xlat_reg bwd_sec_xlat;
+static const struct intel_ntb_reg snb_reg;
+static const struct intel_ntb_alt_reg snb_pri_reg;
+static const struct intel_ntb_alt_reg snb_sec_reg;
+static const struct intel_ntb_alt_reg snb_b2b_reg;
+static const struct intel_ntb_xlat_reg snb_pri_xlat;
+static const struct intel_ntb_xlat_reg snb_sec_xlat;
+static const struct intel_b2b_addr snb_b2b_usd_addr;
+static const struct intel_b2b_addr snb_b2b_dsd_addr;
+
+static const struct ntb_dev_ops intel_ntb_ops;
+
+static const struct file_operations intel_ntb_debugfs_info;
 static struct dentry *debugfs_dir;
 
-#define BWD_LINK_RECOVERY_TIME	500
-
-/* Translate memory window 0,1,2 to BAR 2,4,5 */
-#define MW_TO_BAR(mw)	(mw == 0 ? 2 : (mw == 1 ? 4 : 5))
-
-static const struct pci_device_id ntb_pci_tbl[] = {
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_BWD)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_JSF)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_SNB)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_IVT)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_HSX)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_JSF)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_SNB)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_IVT)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_HSX)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_JSF)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_SNB)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_IVT)},
-	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_HSX)},
-	{0}
-};
-MODULE_DEVICE_TABLE(pci, ntb_pci_tbl);
-
-static int is_ntb_xeon(struct ntb_device *ndev)
+#ifndef ioread64
+#ifdef readq
+#define ioread64 readq
+#else
+#define ioread64 _ioread64
+static inline u64 _ioread64(void __iomem *mmio)
 {
-	switch (ndev->pdev->device) {
-	case PCI_DEVICE_ID_INTEL_NTB_SS_JSF:
-	case PCI_DEVICE_ID_INTEL_NTB_SS_SNB:
-	case PCI_DEVICE_ID_INTEL_NTB_SS_IVT:
-	case PCI_DEVICE_ID_INTEL_NTB_SS_HSX:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_JSF:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_SNB:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_IVT:
-	case PCI_DEVICE_ID_INTEL_NTB_PS_HSX:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_JSF:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_SNB:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_IVT:
-	case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX:
-		return 1;
-	default:
-		return 0;
-	}
+	u64 low, high;
 
-	return 0;
+	low = ioread32(mmio);
+	high = ioread32(mmio + sizeof(u32));
+	return low | (high << 32);
+}
+#endif
+#endif
+
+#ifndef iowrite64
+#ifdef writeq
+#define iowrite64 writeq
+#else
+#define iowrite64 _iowrite64
+static inline void _iowrite64(u64 val, void __iomem *mmio)
+{
+	iowrite32(val, mmio);
+	iowrite32(val >> 32, mmio + sizeof(u32));
 }
+#endif
+#endif
 
-static int is_ntb_atom(struct ntb_device *ndev)
+static inline int pdev_is_bwd(struct pci_dev *pdev)
 {
-	switch (ndev->pdev->device) {
+	switch (pdev->device) {
 	case PCI_DEVICE_ID_INTEL_NTB_B2B_BWD:
 		return 1;
-	default:
-		return 0;
 	}
-
 	return 0;
 }
 
-static void ntb_set_errata_flags(struct ntb_device *ndev)
+static inline int pdev_is_snb(struct pci_dev *pdev)
 {
-	switch (ndev->pdev->device) {
-	/*
-	 * this workaround applies to all platform up to IvyBridge
-	 * Haswell has splitbar support and use a different workaround
-	 */
+	switch (pdev->device) {
 	case PCI_DEVICE_ID_INTEL_NTB_SS_JSF:
 	case PCI_DEVICE_ID_INTEL_NTB_SS_SNB:
 	case PCI_DEVICE_ID_INTEL_NTB_SS_IVT:
@@ -158,1737 +159,1957 @@ static void ntb_set_errata_flags(struct ntb_device *ndev)
 	case PCI_DEVICE_ID_INTEL_NTB_B2B_SNB:
 	case PCI_DEVICE_ID_INTEL_NTB_B2B_IVT:
 	case PCI_DEVICE_ID_INTEL_NTB_B2B_HSX:
-		ndev->wa_flags |= WA_SNB_ERR;
-		break;
+		return 1;
 	}
+	return 0;
 }
 
-/**
- * ntb_register_event_callback() - register event callback
- * @ndev: pointer to ntb_device instance
- * @func: callback function to register
- *
- * This function registers a callback for any HW driver events such as link
- * up/down, power management notices and etc.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_register_event_callback(struct ntb_device *ndev,
-				void (*func)(void *handle,
-					     enum ntb_hw_event event))
+static inline void ndev_reset_unsafe_flags(struct intel_ntb_dev *ndev)
 {
-	if (ndev->event_cb)
-		return -EINVAL;
-
-	ndev->event_cb = func;
-
-	return 0;
+	ndev->unsafe_flags = 0;
+	ndev->unsafe_flags_ignore = 0;
+
+	/* Only B2B has a workaround to avoid SDOORBELL */
+	if (ndev->hwerr_flags & NTB_HWERR_SDOORBELL_LOCKUP)
+		if (!ntb_topo_is_b2b(ndev->ntb.topo))
+			ndev->unsafe_flags |= NTB_UNSAFE_DB;
+
+	/* No low level workaround to avoid SB01BASE */
+	if (ndev->hwerr_flags & NTB_HWERR_SB01BASE_LOCKUP) {
+		ndev->unsafe_flags |= NTB_UNSAFE_DB;
+		ndev->unsafe_flags |= NTB_UNSAFE_SPAD;
+	}
 }
 
-/**
- * ntb_unregister_event_callback() - unregisters the event callback
- * @ndev: pointer to ntb_device instance
- *
- * This function unregisters the existing callback from transport
- */
-void ntb_unregister_event_callback(struct ntb_device *ndev)
+static inline int ndev_is_unsafe(struct intel_ntb_dev *ndev,
+				 unsigned long flag)
 {
-	ndev->event_cb = NULL;
+	return !!(flag & ndev->unsafe_flags & ~ndev->unsafe_flags_ignore);
 }
 
-static void ntb_irq_work(unsigned long data)
+static inline int ndev_ignore_unsafe(struct intel_ntb_dev *ndev,
+				     unsigned long flag)
 {
-	struct ntb_db_cb *db_cb = (struct ntb_db_cb *)data;
-	int rc;
+	flag &= ndev->unsafe_flags;
+	ndev->unsafe_flags_ignore |= flag;
 
-	rc = db_cb->callback(db_cb->data, db_cb->db_num);
-	if (rc)
-		tasklet_schedule(&db_cb->irq_work);
-	else {
-		struct ntb_device *ndev = db_cb->ndev;
-		unsigned long mask;
-
-		mask = readw(ndev->reg_ofs.ldb_mask);
-		clear_bit(db_cb->db_num * ndev->bits_per_vector, &mask);
-		writew(mask, ndev->reg_ofs.ldb_mask);
-	}
+	return !!flag;
 }
 
-/**
- * ntb_register_db_callback() - register a callback for doorbell interrupt
- * @ndev: pointer to ntb_device instance
- * @idx: doorbell index to register callback, zero based
- * @data: pointer to be returned to caller with every callback
- * @func: callback function to register
- *
- * This function registers a callback function for the doorbell interrupt
- * on the primary side. The function will unmask the doorbell as well to
- * allow interrupt.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_register_db_callback(struct ntb_device *ndev, unsigned int idx,
-			     void *data, int (*func)(void *data, int db_num))
+static int ndev_mw_to_bar(struct intel_ntb_dev *ndev, int idx)
 {
-	unsigned long mask;
-
-	if (idx >= ndev->max_cbs || ndev->db_cb[idx].callback) {
-		dev_warn(&ndev->pdev->dev, "Invalid Index.\n");
+	if (idx < 0 || idx > ndev->mw_count)
 		return -EINVAL;
-	}
+	return ndev->reg->mw_bar[idx];
+}
 
-	ndev->db_cb[idx].callback = func;
-	ndev->db_cb[idx].data = data;
-	ndev->db_cb[idx].ndev = ndev;
+static inline int ndev_db_addr(struct intel_ntb_dev *ndev,
+			       phys_addr_t *db_addr, resource_size_t *db_size,
+			       phys_addr_t reg_addr, unsigned long reg)
+{
+	WARN_ON_ONCE(ndev_is_unsafe(ndev, NTB_UNSAFE_DB));
 
-	tasklet_init(&ndev->db_cb[idx].irq_work, ntb_irq_work,
-		     (unsigned long) &ndev->db_cb[idx]);
+	if (db_addr) {
+		*db_addr = reg_addr + reg;
+		dev_dbg(ndev_dev(ndev), "Peer db addr %llx\n", *db_addr);
+	}
 
-	/* unmask interrupt */
-	mask = readw(ndev->reg_ofs.ldb_mask);
-	clear_bit(idx * ndev->bits_per_vector, &mask);
-	writew(mask, ndev->reg_ofs.ldb_mask);
+	if (db_size) {
+		*db_size = ndev->reg->db_size;
+		dev_dbg(ndev_dev(ndev), "Peer db size %llx\n", *db_size);
+	}
 
 	return 0;
 }
 
-/**
- * ntb_unregister_db_callback() - unregister a callback for doorbell interrupt
- * @ndev: pointer to ntb_device instance
- * @idx: doorbell index to register callback, zero based
- *
- * This function unregisters a callback function for the doorbell interrupt
- * on the primary side. The function will also mask the said doorbell.
- */
-void ntb_unregister_db_callback(struct ntb_device *ndev, unsigned int idx)
+static inline u64 ndev_db_read(struct intel_ntb_dev *ndev,
+			       void __iomem *mmio)
 {
-	unsigned long mask;
+	WARN_ON_ONCE(ndev_is_unsafe(ndev, NTB_UNSAFE_DB));
 
-	if (idx >= ndev->max_cbs || !ndev->db_cb[idx].callback)
-		return;
+	return ndev->reg->db_ioread(mmio);
+}
+
+static inline int ndev_db_write(struct intel_ntb_dev *ndev, u64 db_bits,
+				void __iomem *mmio)
+{
+	WARN_ON_ONCE(ndev_is_unsafe(ndev, NTB_UNSAFE_DB));
 
-	mask = readw(ndev->reg_ofs.ldb_mask);
-	set_bit(idx * ndev->bits_per_vector, &mask);
-	writew(mask, ndev->reg_ofs.ldb_mask);
+	if (db_bits & ~ndev->db_valid_mask)
+		return -EINVAL;
 
-	tasklet_disable(&ndev->db_cb[idx].irq_work);
+	ndev->reg->db_iowrite(db_bits, mmio);
 
-	ndev->db_cb[idx].callback = NULL;
+	return 0;
 }
 
-/**
- * ntb_find_transport() - find the transport pointer
- * @transport: pointer to pci device
- *
- * Given the pci device pointer, return the transport pointer passed in when
- * the transport attached when it was inited.
- *
- * RETURNS: pointer to transport.
- */
-void *ntb_find_transport(struct pci_dev *pdev)
+static inline int ndev_db_set_mask(struct intel_ntb_dev *ndev, u64 db_bits,
+				   void __iomem *mmio)
 {
-	struct ntb_device *ndev = pci_get_drvdata(pdev);
-	return ndev->ntb_transport;
-}
+	unsigned long irqflags;
 
-/**
- * ntb_register_transport() - Register NTB transport with NTB HW driver
- * @transport: transport identifier
- *
- * This function allows a transport to reserve the hardware driver for
- * NTB usage.
- *
- * RETURNS: pointer to ntb_device, NULL on error.
- */
-struct ntb_device *ntb_register_transport(struct pci_dev *pdev, void *transport)
-{
-	struct ntb_device *ndev = pci_get_drvdata(pdev);
+	WARN_ON_ONCE(ndev_is_unsafe(ndev, NTB_UNSAFE_DB));
+
+	if (db_bits & ~ndev->db_valid_mask)
+		return -EINVAL;
 
-	if (ndev->ntb_transport)
-		return NULL;
+	spin_lock_irqsave(&ndev->db_mask_lock, irqflags);
+	{
+		ndev->db_mask |= db_bits;
+		ndev->reg->db_iowrite(ndev->db_mask, mmio);
+	}
+	spin_unlock_irqrestore(&ndev->db_mask_lock, irqflags);
 
-	ndev->ntb_transport = transport;
-	return ndev;
+	return 0;
 }
 
-/**
- * ntb_unregister_transport() - Unregister the transport with the NTB HW driver
- * @ndev - ntb_device of the transport to be freed
- *
- * This function unregisters the transport from the HW driver and performs any
- * necessary cleanups.
- */
-void ntb_unregister_transport(struct ntb_device *ndev)
+static inline int ndev_db_clear_mask(struct intel_ntb_dev *ndev, u64 db_bits,
+				     void __iomem *mmio)
 {
-	int i;
+	unsigned long irqflags;
 
-	if (!ndev->ntb_transport)
-		return;
+	WARN_ON_ONCE(ndev_is_unsafe(ndev, NTB_UNSAFE_DB));
+
+	if (db_bits & ~ndev->db_valid_mask)
+		return -EINVAL;
 
-	for (i = 0; i < ndev->max_cbs; i++)
-		ntb_unregister_db_callback(ndev, i);
+	spin_lock_irqsave(&ndev->db_mask_lock, irqflags);
+	{
+		ndev->db_mask &= ~db_bits;
+		ndev->reg->db_iowrite(ndev->db_mask, mmio);
+	}
+	spin_unlock_irqrestore(&ndev->db_mask_lock, irqflags);
 
-	ntb_unregister_event_callback(ndev);
-	ndev->ntb_transport = NULL;
+	return 0;
 }
 
-/**
- * ntb_write_local_spad() - write to the secondary scratchpad register
- * @ndev: pointer to ntb_device instance
- * @idx: index to the scratchpad register, 0 based
- * @val: the data value to put into the register
- *
- * This function allows writing of a 32bit value to the indexed scratchpad
- * register. This writes over the data mirrored to the local scratchpad register
- * by the remote system.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_write_local_spad(struct ntb_device *ndev, unsigned int idx, u32 val)
+static inline int ndev_vec_mask(struct intel_ntb_dev *ndev, int db_vector)
 {
-	if (idx >= ndev->limits.max_spads)
-		return -EINVAL;
+	u64 shift, mask;
 
-	dev_dbg(&ndev->pdev->dev, "Writing %x to local scratch pad index %d\n",
-		val, idx);
-	writel(val, ndev->reg_ofs.spad_read + idx * 4);
+	shift = ndev->db_vec_shift;
+	mask = BIT_ULL(shift) - 1;
 
-	return 0;
+	return mask << (shift * db_vector);
 }
 
-/**
- * ntb_read_local_spad() - read from the primary scratchpad register
- * @ndev: pointer to ntb_device instance
- * @idx: index to scratchpad register, 0 based
- * @val: pointer to 32bit integer for storing the register value
- *
- * This function allows reading of the 32bit scratchpad register on
- * the primary (internal) side.  This allows the local system to read data
- * written and mirrored to the scratchpad register by the remote system.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_read_local_spad(struct ntb_device *ndev, unsigned int idx, u32 *val)
+static inline int ndev_spad_addr(struct intel_ntb_dev *ndev, int idx,
+				 phys_addr_t *spad_addr, phys_addr_t reg_addr,
+				 unsigned long reg)
 {
-	if (idx >= ndev->limits.max_spads)
+	WARN_ON_ONCE(ndev_is_unsafe(ndev, NTB_UNSAFE_SPAD));
+
+	if (idx < 0 || idx >= ndev->spad_count)
 		return -EINVAL;
 
-	*val = readl(ndev->reg_ofs.spad_write + idx * 4);
-	dev_dbg(&ndev->pdev->dev,
-		"Reading %x from local scratch pad index %d\n", *val, idx);
+	if (spad_addr) {
+		*spad_addr = reg_addr + reg + (idx << 2);
+		dev_dbg(ndev_dev(ndev), "Peer spad addr %llx\n", *spad_addr);
+	}
 
 	return 0;
 }
 
-/**
- * ntb_write_remote_spad() - write to the secondary scratchpad register
- * @ndev: pointer to ntb_device instance
- * @idx: index to the scratchpad register, 0 based
- * @val: the data value to put into the register
- *
- * This function allows writing of a 32bit value to the indexed scratchpad
- * register. The register resides on the secondary (external) side.  This allows
- * the local system to write data to be mirrored to the remote systems
- * scratchpad register.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_write_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 val)
+static inline u32 ndev_spad_read(struct intel_ntb_dev *ndev, int idx,
+				 void __iomem *mmio)
 {
-	if (idx >= ndev->limits.max_spads)
-		return -EINVAL;
+	WARN_ON_ONCE(ndev_is_unsafe(ndev, NTB_UNSAFE_SPAD));
 
-	dev_dbg(&ndev->pdev->dev, "Writing %x to remote scratch pad index %d\n",
-		val, idx);
-	writel(val, ndev->reg_ofs.spad_write + idx * 4);
+	if (idx < 0 || idx >= ndev->spad_count)
+		return 0;
 
-	return 0;
+	return ioread32(mmio + (idx << 2));
 }
 
-/**
- * ntb_read_remote_spad() - read from the primary scratchpad register
- * @ndev: pointer to ntb_device instance
- * @idx: index to scratchpad register, 0 based
- * @val: pointer to 32bit integer for storing the register value
- *
- * This function allows reading of the 32bit scratchpad register on
- * the primary (internal) side.  This alloows the local system to read the data
- * it wrote to be mirrored on the remote system.
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-int ntb_read_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 *val)
+static inline int ndev_spad_write(struct intel_ntb_dev *ndev, int idx, u32 val,
+				  void __iomem *mmio)
 {
-	if (idx >= ndev->limits.max_spads)
+	WARN_ON_ONCE(ndev_is_unsafe(ndev, NTB_UNSAFE_SPAD));
+
+	if (idx < 0 || idx >= ndev->spad_count)
 		return -EINVAL;
 
-	*val = readl(ndev->reg_ofs.spad_read + idx * 4);
-	dev_dbg(&ndev->pdev->dev,
-		"Reading %x from remote scratch pad index %d\n", *val, idx);
+	iowrite32(val, mmio + (idx << 2));
 
 	return 0;
 }
 
-/**
- * ntb_get_mw_base() - get addr for the NTB memory window
- * @ndev: pointer to ntb_device instance
- * @mw: memory window number
- *
- * This function provides the base address of the memory window specified.
- *
- * RETURNS: address, or NULL on error.
- */
-resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned int mw)
+static irqreturn_t ndev_interrupt(struct intel_ntb_dev *ndev, int vec)
 {
-	if (mw >= ntb_max_mw(ndev))
-		return 0;
+	u64 vec_mask;
+
+	vec_mask = ndev_vec_mask(ndev, vec);
+
+	dev_dbg(ndev_dev(ndev), "vec %d vec_mask %llx\n", vec, vec_mask);
 
-	return pci_resource_start(ndev->pdev, MW_TO_BAR(mw));
+	ndev->last_ts = jiffies;
+
+	if (vec_mask & ndev->db_link_mask) {
+		if (ndev->reg->poll_link(ndev))
+			ntb_link_event(&ndev->ntb);
+	}
+
+	if (vec_mask & ndev->db_valid_mask)
+		ntb_db_event(&ndev->ntb, vec);
+
+	return IRQ_HANDLED;
 }
 
-/**
- * ntb_get_mw_vbase() - get virtual addr for the NTB memory window
- * @ndev: pointer to ntb_device instance
- * @mw: memory window number
- *
- * This function provides the base virtual address of the memory window
- * specified.
- *
- * RETURNS: pointer to virtual address, or NULL on error.
- */
-void __iomem *ntb_get_mw_vbase(struct ntb_device *ndev, unsigned int mw)
+static irqreturn_t ndev_vec_isr(int irq, void *dev)
 {
-	if (mw >= ntb_max_mw(ndev))
-		return NULL;
+	struct intel_ntb_vec *nvec = dev;
 
-	return ndev->mw[mw].vbase;
+	return ndev_interrupt(nvec->ndev, nvec->num);
 }
 
-/**
- * ntb_get_mw_size() - return size of NTB memory window
- * @ndev: pointer to ntb_device instance
- * @mw: memory window number
- *
- * This function provides the physical size of the memory window specified
- *
- * RETURNS: the size of the memory window or zero on error
- */
-u64 ntb_get_mw_size(struct ntb_device *ndev, unsigned int mw)
+static irqreturn_t ndev_irq_isr(int irq, void *dev)
 {
-	if (mw >= ntb_max_mw(ndev))
-		return 0;
+	struct intel_ntb_dev *ndev = dev;
 
-	return ndev->mw[mw].bar_sz;
+	return ndev_interrupt(ndev, irq - ndev_pdev(ndev)->irq);
 }
 
-/**
- * ntb_set_mw_addr - set the memory window address
- * @ndev: pointer to ntb_device instance
- * @mw: memory window number
- * @addr: base address for data
- *
- * This function sets the base physical address of the memory window.  This
- * memory address is where data from the remote system will be transfered into
- * or out of depending on how the transport is configured.
- */
-void ntb_set_mw_addr(struct ntb_device *ndev, unsigned int mw, u64 addr)
+static int ndev_init_isr(struct intel_ntb_dev *ndev,
+			 int msix_min, int msix_max,
+			 int msix_shift, int total_shift)
 {
-	if (mw >= ntb_max_mw(ndev))
-		return;
+	struct pci_dev *pdev;
+	int rc, i, msix_count;
 
-	dev_dbg(&ndev->pdev->dev, "Writing addr %Lx to BAR %d\n", addr,
-		MW_TO_BAR(mw));
+	pdev = ndev_pdev(ndev);
 
-	ndev->mw[mw].phys_addr = addr;
+	/* Mask all doorbell interrupts */
+	ndev->db_mask = ndev->db_valid_mask;
+	ndev->reg->db_iowrite(ndev->db_mask,
+			      ndev->self_mmio +
+			      ndev->self_reg->db_mask);
 
-	switch (MW_TO_BAR(mw)) {
-	case NTB_BAR_23:
-		writeq(addr, ndev->reg_ofs.bar2_xlat);
-		break;
-	case NTB_BAR_4:
-		if (ndev->split_bar)
-			writel(addr, ndev->reg_ofs.bar4_xlat);
-		else
-			writeq(addr, ndev->reg_ofs.bar4_xlat);
-		break;
-	case NTB_BAR_5:
-		writel(addr, ndev->reg_ofs.bar5_xlat);
-		break;
+	/* Try to set up msix irq */
+
+	ndev->vec = kcalloc(msix_max, sizeof(*ndev->vec), GFP_KERNEL);
+	if (!ndev->vec)
+		goto err_msix_vec_alloc;
+
+	ndev->msix = kcalloc(msix_max, sizeof(*ndev->msix), GFP_KERNEL);
+	if (!ndev->msix)
+		goto err_msix_alloc;
+
+	for (i = 0; i < msix_max; ++i)
+		ndev->msix[i].entry = i;
+
+	msix_count = pci_enable_msix_range(pdev, ndev->msix,
+					   msix_min, msix_max);
+	if (msix_count < 0)
+		goto err_msix_enable;
+
+	for (i = 0; i < msix_count; ++i) {
+		ndev->vec[i].ndev = ndev;
+		ndev->vec[i].num = i;
+		rc = request_irq(ndev->msix[i].vector, ndev_vec_isr, 0,
+				 "ndev_vec_isr", &ndev->vec[i]);
+		if (rc)
+			goto err_msix_request;
 	}
-}
 
-/**
- * ntb_ring_doorbell() - Set the doorbell on the secondary/external side
- * @ndev: pointer to ntb_device instance
- * @db: doorbell to ring
- *
- * This function allows triggering of a doorbell on the secondary/external
- * side that will initiate an interrupt on the remote host
- *
- * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
- */
-void ntb_ring_doorbell(struct ntb_device *ndev, unsigned int db)
-{
-	dev_dbg(&ndev->pdev->dev, "%s: ringing doorbell %d\n", __func__, db);
+	dev_dbg(ndev_dev(ndev), "Using msix interrupts\n");
+	ndev->db_vec_count = msix_count;
+	ndev->db_vec_shift = msix_shift;
+	return 0;
 
-	if (ndev->hw_type == BWD_HW)
-		writeq((u64) 1 << db, ndev->reg_ofs.rdb);
-	else
-		writew(((1 << ndev->bits_per_vector) - 1) <<
-		       (db * ndev->bits_per_vector), ndev->reg_ofs.rdb);
-}
+err_msix_request:
+	while (i-- > 0)
+		free_irq(ndev->msix[i].vector, ndev);
+	pci_disable_msix(pdev);
+err_msix_enable:
+	kfree(ndev->msix);
+err_msix_alloc:
+	kfree(ndev->vec);
+err_msix_vec_alloc:
+	ndev->msix = NULL;
+	ndev->vec = NULL;
 
-static void bwd_recover_link(struct ntb_device *ndev)
-{
-	u32 status;
+	/* Try to set up msi irq */
 
-	/* Driver resets the NTB ModPhy lanes - magic! */
-	writeb(0xe0, ndev->reg_base + BWD_MODPHY_PCSREG6);
-	writeb(0x40, ndev->reg_base + BWD_MODPHY_PCSREG4);
-	writeb(0x60, ndev->reg_base + BWD_MODPHY_PCSREG4);
-	writeb(0x60, ndev->reg_base + BWD_MODPHY_PCSREG6);
+	rc = pci_enable_msi(pdev);
+	if (rc)
+		goto err_msi_enable;
 
-	/* Driver waits 100ms to allow the NTB ModPhy to settle */
-	msleep(100);
+	rc = request_irq(pdev->irq, ndev_irq_isr, 0,
+			 "ndev_irq_isr", ndev);
+	if (rc)
+		goto err_msi_request;
 
-	/* Clear AER Errors, write to clear */
-	status = readl(ndev->reg_base + BWD_ERRCORSTS_OFFSET);
-	dev_dbg(&ndev->pdev->dev, "ERRCORSTS = %x\n", status);
-	status &= PCI_ERR_COR_REP_ROLL;
-	writel(status, ndev->reg_base + BWD_ERRCORSTS_OFFSET);
+	dev_dbg(ndev_dev(ndev), "Using msi interrupts\n");
+	ndev->db_vec_count = 1;
+	ndev->db_vec_shift = total_shift;
+	return 0;
 
-	/* Clear unexpected electrical idle event in LTSSM, write to clear */
-	status = readl(ndev->reg_base + BWD_LTSSMERRSTS0_OFFSET);
-	dev_dbg(&ndev->pdev->dev, "LTSSMERRSTS0 = %x\n", status);
-	status |= BWD_LTSSMERRSTS0_UNEXPECTEDEI;
-	writel(status, ndev->reg_base + BWD_LTSSMERRSTS0_OFFSET);
+err_msi_request:
+	pci_disable_msi(pdev);
+err_msi_enable:
 
-	/* Clear DeSkew Buffer error, write to clear */
-	status = readl(ndev->reg_base + BWD_DESKEWSTS_OFFSET);
-	dev_dbg(&ndev->pdev->dev, "DESKEWSTS = %x\n", status);
-	status |= BWD_DESKEWSTS_DBERR;
-	writel(status, ndev->reg_base + BWD_DESKEWSTS_OFFSET);
+	/* Try to set up intx irq */
 
-	status = readl(ndev->reg_base + BWD_IBSTERRRCRVSTS0_OFFSET);
-	dev_dbg(&ndev->pdev->dev, "IBSTERRRCRVSTS0 = %x\n", status);
-	status &= BWD_IBIST_ERR_OFLOW;
-	writel(status, ndev->reg_base + BWD_IBSTERRRCRVSTS0_OFFSET);
+	pci_intx(pdev, 1);
 
-	/* Releases the NTB state machine to allow the link to retrain */
-	status = readl(ndev->reg_base + BWD_LTSSMSTATEJMP_OFFSET);
-	dev_dbg(&ndev->pdev->dev, "LTSSMSTATEJMP = %x\n", status);
-	status &= ~BWD_LTSSMSTATEJMP_FORCEDETECT;
-	writel(status, ndev->reg_base + BWD_LTSSMSTATEJMP_OFFSET);
+	rc = request_irq(pdev->irq, ndev_irq_isr, IRQF_SHARED,
+			 "ndev_irq_isr", ndev);
+	if (rc)
+		goto err_intx_request;
+
+	dev_dbg(ndev_dev(ndev), "Using intx interrupts\n");
+	ndev->db_vec_count = 1;
+	ndev->db_vec_shift = total_shift;
+	return 0;
+
+err_intx_request:
+	return rc;
 }
 
-static void ntb_link_event(struct ntb_device *ndev, int link_state)
+static void ndev_deinit_isr(struct intel_ntb_dev *ndev)
 {
-	unsigned int event;
+	struct pci_dev *pdev;
+	int i;
 
-	if (ndev->link_status == link_state)
-		return;
+	pdev = ndev_pdev(ndev);
 
-	if (link_state == NTB_LINK_UP) {
-		u16 status;
-
-		dev_info(&ndev->pdev->dev, "Link Up\n");
-		ndev->link_status = NTB_LINK_UP;
-		event = NTB_EVENT_HW_LINK_UP;
-
-		if (is_ntb_atom(ndev) ||
-		    ndev->conn_type == NTB_CONN_TRANSPARENT)
-			status = readw(ndev->reg_ofs.lnk_stat);
-		else {
-			int rc = pci_read_config_word(ndev->pdev,
-						      SNB_LINK_STATUS_OFFSET,
-						      &status);
-			if (rc)
-				return;
-		}
+	/* Mask all doorbell interrupts */
+	ndev->db_mask = ndev->db_valid_mask;
+	ndev->reg->db_iowrite(ndev->db_mask,
+			      ndev->self_mmio +
+			      ndev->self_reg->db_mask);
 
-		ndev->link_width = (status & NTB_LINK_WIDTH_MASK) >> 4;
-		ndev->link_speed = (status & NTB_LINK_SPEED_MASK);
-		dev_info(&ndev->pdev->dev, "Link Width %d, Link Speed %d\n",
-			 ndev->link_width, ndev->link_speed);
+	if (ndev->msix) {
+		i = ndev->db_vec_count;
+		while (i--)
+			free_irq(ndev->msix[i].vector, &ndev->vec[i]);
+		pci_disable_msix(pdev);
+		kfree(ndev->msix);
+		kfree(ndev->vec);
 	} else {
-		dev_info(&ndev->pdev->dev, "Link Down\n");
-		ndev->link_status = NTB_LINK_DOWN;
-		event = NTB_EVENT_HW_LINK_DOWN;
-		/* Don't modify link width/speed, we need it in link recovery */
+		free_irq(pdev->irq, ndev);
+		if (pci_dev_msi_enabled(pdev))
+			pci_disable_msi(pdev);
 	}
-
-	/* notify the upper layer if we have an event change */
-	if (ndev->event_cb)
-		ndev->event_cb(ndev->ntb_transport, event);
 }
 
-static int ntb_link_status(struct ntb_device *ndev)
+static ssize_t ndev_debugfs_read(struct file *filp, char __user *ubuf,
+				 size_t count, loff_t *offp)
 {
-	int link_state;
+	struct intel_ntb_dev *ndev;
+	void __iomem *mmio;
+	char *buf;
+	size_t buf_size;
+	ssize_t ret, off;
+	union { u64 v64; u32 v32; u16 v16; } u;
 
-	if (is_ntb_atom(ndev)) {
-		u32 ntb_cntl;
+	ndev = filp->private_data;
+	mmio = ndev->self_mmio;
 
-		ntb_cntl = readl(ndev->reg_ofs.lnk_cntl);
-		if (ntb_cntl & BWD_CNTL_LINK_DOWN)
-			link_state = NTB_LINK_DOWN;
-		else
-			link_state = NTB_LINK_UP;
-	} else {
-		u16 status;
-		int rc;
+	buf_size = min(count, 0x800ul);
 
-		rc = pci_read_config_word(ndev->pdev, SNB_LINK_STATUS_OFFSET,
-					  &status);
-		if (rc)
-			return rc;
+	buf = kmalloc(buf_size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
 
-		if (status & NTB_LINK_STATUS_ACTIVE)
-			link_state = NTB_LINK_UP;
-		else
-			link_state = NTB_LINK_DOWN;
-	}
+	off = 0;
 
-	ntb_link_event(ndev, link_state);
+	off += scnprintf(buf + off, buf_size - off,
+			 "NTB Device Information:\n");
 
-	return 0;
-}
+	off += scnprintf(buf + off, buf_size - off,
+			 "Connection Topology -\t%s\n",
+			 ntb_topo_string(ndev->ntb.topo));
 
-static void bwd_link_recovery(struct work_struct *work)
-{
-	struct ntb_device *ndev = container_of(work, struct ntb_device,
-					       lr_timer.work);
-	u32 status32;
+	off += scnprintf(buf + off, buf_size - off,
+			 "B2B Offset -\t\t%#lx\n", ndev->b2b_off);
+	off += scnprintf(buf + off, buf_size - off,
+			 "B2B MW Idx -\t\t%d\n", ndev->b2b_idx);
+	off += scnprintf(buf + off, buf_size - off,
+			 "BAR4 Split -\t\t%s\n",
+			 ndev->bar4_split ? "yes" : "no");
 
-	bwd_recover_link(ndev);
-	/* There is a potential race between the 2 NTB devices recovering at the
-	 * same time.  If the times are the same, the link will not recover and
-	 * the driver will be stuck in this loop forever.  Add a random interval
-	 * to the recovery time to prevent this race.
-	 */
-	msleep(BWD_LINK_RECOVERY_TIME + prandom_u32() % BWD_LINK_RECOVERY_TIME);
-
-	status32 = readl(ndev->reg_base + BWD_LTSSMSTATEJMP_OFFSET);
-	if (status32 & BWD_LTSSMSTATEJMP_FORCEDETECT)
-		goto retry;
-
-	status32 = readl(ndev->reg_base + BWD_IBSTERRRCRVSTS0_OFFSET);
-	if (status32 & BWD_IBIST_ERR_OFLOW)
-		goto retry;
-
-	status32 = readl(ndev->reg_ofs.lnk_cntl);
-	if (!(status32 & BWD_CNTL_LINK_DOWN)) {
-		unsigned char speed, width;
-		u16 status16;
-
-		status16 = readw(ndev->reg_ofs.lnk_stat);
-		width = (status16 & NTB_LINK_WIDTH_MASK) >> 4;
-		speed = (status16 & NTB_LINK_SPEED_MASK);
-		if (ndev->link_width != width || ndev->link_speed != speed)
-			goto retry;
+	off += scnprintf(buf + off, buf_size - off,
+			 "NTB CTL -\t\t%#06x\n", ndev->ntb_ctl);
+	off += scnprintf(buf + off, buf_size - off,
+			 "LNK STA -\t\t%#06x\n", ndev->lnk_sta);
+
+	if (!ndev->reg->link_is_up(ndev)) {
+		off += scnprintf(buf + off, buf_size - off,
+				 "Link Status -\t\tDown\n");
+	} else {
+		off += scnprintf(buf + off, buf_size - off,
+				 "Link Status -\t\tUp\n");
+		off += scnprintf(buf + off, buf_size - off,
+				 "Link Speed -\t\tPCI-E Gen %u\n",
+				 NTB_LNK_STA_SPEED(ndev->lnk_sta));
+		off += scnprintf(buf + off, buf_size - off,
+				 "Link Width -\t\tx%u\n",
+				 NTB_LNK_STA_WIDTH(ndev->lnk_sta));
 	}
 
-	schedule_delayed_work(&ndev->hb_timer, NTB_HB_TIMEOUT);
-	return;
+	off += scnprintf(buf + off, buf_size - off,
+			 "Memory Window Count -\t%u\n", ndev->mw_count);
+	off += scnprintf(buf + off, buf_size - off,
+			 "Scratchpad Count -\t%u\n", ndev->spad_count);
+	off += scnprintf(buf + off, buf_size - off,
+			 "Doorbell Count -\t%u\n", ndev->db_count);
+	off += scnprintf(buf + off, buf_size - off,
+			 "Doorbell Vector Count -\t%u\n", ndev->db_vec_count);
+	off += scnprintf(buf + off, buf_size - off,
+			 "Doorbell Vector Shift -\t%u\n", ndev->db_vec_shift);
+
+	off += scnprintf(buf + off, buf_size - off,
+			 "Doorbell Valid Mask -\t%#llx\n", ndev->db_valid_mask);
+	off += scnprintf(buf + off, buf_size - off,
+			 "Doorbell Link Mask -\t%#llx\n", ndev->db_link_mask);
+	off += scnprintf(buf + off, buf_size - off,
+			 "Doorbell Mask Cached -\t%#llx\n", ndev->db_mask);
+
+	u.v64 = ndev_db_read(ndev, mmio + ndev->self_reg->db_mask);
+	off += scnprintf(buf + off, buf_size - off,
+			 "Doorbell Mask -\t\t%#llx\n", u.v64);
+
+	u.v64 = ndev_db_read(ndev, mmio + ndev->self_reg->db_bell);
+	off += scnprintf(buf + off, buf_size - off,
+			 "Doorbell Bell -\t\t%#llx\n", u.v64);
+
+	off += scnprintf(buf + off, buf_size - off,
+			 "\nNTB Incoming XLAT:\n");
+
+	u.v64 = ioread64(mmio + bar2_off(ndev->xlat_reg->bar2_xlat, 2));
+	off += scnprintf(buf + off, buf_size - off,
+			 "XLAT23 -\t\t%#018llx\n", u.v64);
+
+	u.v64 = ioread64(mmio + bar2_off(ndev->xlat_reg->bar2_xlat, 4));
+	off += scnprintf(buf + off, buf_size - off,
+			 "XLAT45 -\t\t%#018llx\n", u.v64);
+
+	u.v64 = ioread64(mmio + bar2_off(ndev->xlat_reg->bar2_limit, 2));
+	off += scnprintf(buf + off, buf_size - off,
+			 "LMT23 -\t\t\t%#018llx\n", u.v64);
+
+	u.v64 = ioread64(mmio + bar2_off(ndev->xlat_reg->bar2_limit, 4));
+	off += scnprintf(buf + off, buf_size - off,
+			 "LMT45 -\t\t\t%#018llx\n", u.v64);
+
+	if (pdev_is_snb(ndev->ntb.pdev)) {
+		if (ntb_topo_is_b2b(ndev->ntb.topo)) {
+			off += scnprintf(buf + off, buf_size - off,
+					 "\nNTB Outgoing B2B XLAT:\n");
+
+			u.v64 = ioread64(mmio + SNB_PBAR23XLAT_OFFSET);
+			off += scnprintf(buf + off, buf_size - off,
+					 "B2B XLAT23 -\t\t%#018llx\n", u.v64);
+
+			u.v64 = ioread64(mmio + SNB_PBAR45XLAT_OFFSET);
+			off += scnprintf(buf + off, buf_size - off,
+					 "B2B XLAT45 -\t\t%#018llx\n", u.v64);
+
+			u.v64 = ioread64(mmio + SNB_PBAR23LMT_OFFSET);
+			off += scnprintf(buf + off, buf_size - off,
+					 "B2B LMT23 -\t\t%#018llx\n", u.v64);
+
+			u.v64 = ioread64(mmio + SNB_PBAR45LMT_OFFSET);
+			off += scnprintf(buf + off, buf_size - off,
+					 "B2B LMT45 -\t\t%#018llx\n", u.v64);
+
+			off += scnprintf(buf + off, buf_size - off,
+					 "\nNTB Secondary BAR:\n");
+
+			u.v64 = ioread64(mmio + SNB_SBAR0BASE_OFFSET);
+			off += scnprintf(buf + off, buf_size - off,
+					 "SBAR01 -\t\t%#018llx\n", u.v64);
+
+			u.v64 = ioread64(mmio + SNB_SBAR23BASE_OFFSET);
+			off += scnprintf(buf + off, buf_size - off,
+					 "SBAR23 -\t\t%#018llx\n", u.v64);
+
+			u.v64 = ioread64(mmio + SNB_SBAR45BASE_OFFSET);
+			off += scnprintf(buf + off, buf_size - off,
+					 "SBAR45 -\t\t%#018llx\n", u.v64);
+		}
+
+		off += scnprintf(buf + off, buf_size - off,
+				 "\nSNB NTB Statistics:\n");
+
+		u.v16 = ioread16(mmio + SNB_USMEMMISS_OFFSET);
+		off += scnprintf(buf + off, buf_size - off,
+				 "Upstream Memory Miss -\t%u\n", u.v16);
+
+		off += scnprintf(buf + off, buf_size - off,
+				 "\nSNB NTB Hardware Errors:\n");
+
+		if (!pci_read_config_word(ndev->ntb.pdev,
+					  SNB_DEVSTS_OFFSET, &u.v16))
+			off += scnprintf(buf + off, buf_size - off,
+					 "DEVSTS -\t\t%#06x\n", u.v16);
+
+		if (!pci_read_config_word(ndev->ntb.pdev,
+					  SNB_LINK_STATUS_OFFSET, &u.v16))
+			off += scnprintf(buf + off, buf_size - off,
+					 "LNKSTS -\t\t%#06x\n", u.v16);
 
-retry:
-	schedule_delayed_work(&ndev->lr_timer, NTB_HB_TIMEOUT);
+		if (!pci_read_config_dword(ndev->ntb.pdev,
+					   SNB_UNCERRSTS_OFFSET, &u.v32))
+			off += scnprintf(buf + off, buf_size - off,
+					 "UNCERRSTS -\t\t%#06x\n", u.v32);
+
+		if (!pci_read_config_dword(ndev->ntb.pdev,
+					   SNB_CORERRSTS_OFFSET, &u.v32))
+			off += scnprintf(buf + off, buf_size - off,
+					 "CORERRSTS -\t\t%#06x\n", u.v32);
+	}
+
+	ret = simple_read_from_buffer(ubuf, count, offp, buf, off);
+	kfree(buf);
+	return ret;
 }
 
-/* BWD doesn't have link status interrupt, poll on that platform */
-static void bwd_link_poll(struct work_struct *work)
+static void ndev_init_debugfs(struct intel_ntb_dev *ndev)
 {
-	struct ntb_device *ndev = container_of(work, struct ntb_device,
-					       hb_timer.work);
-	unsigned long ts = jiffies;
-
-	/* If we haven't gotten an interrupt in a while, check the BWD link
-	 * status bit
-	 */
-	if (ts > ndev->last_ts + NTB_HB_TIMEOUT) {
-		int rc = ntb_link_status(ndev);
-		if (rc)
-			dev_err(&ndev->pdev->dev,
-				"Error determining link status\n");
-
-		/* Check to see if a link error is the cause of the link down */
-		if (ndev->link_status == NTB_LINK_DOWN) {
-			u32 status32 = readl(ndev->reg_base +
-					     BWD_LTSSMSTATEJMP_OFFSET);
-			if (status32 & BWD_LTSSMSTATEJMP_FORCEDETECT) {
-				schedule_delayed_work(&ndev->lr_timer, 0);
-				return;
-			}
-		}
+	if (!debugfs_dir) {
+		ndev->debugfs_dir = NULL;
+		ndev->debugfs_info = NULL;
+	} else {
+		ndev->debugfs_dir =
+			debugfs_create_dir(ndev_name(ndev), debugfs_dir);
+		if (!ndev->debugfs_dir)
+			ndev->debugfs_info = NULL;
+		else
+			ndev->debugfs_info =
+				debugfs_create_file("info", S_IRUSR,
+						    ndev->debugfs_dir, ndev,
+						    &intel_ntb_debugfs_info);
 	}
+}
 
-	schedule_delayed_work(&ndev->hb_timer, NTB_HB_TIMEOUT);
+static void ndev_deinit_debugfs(struct intel_ntb_dev *ndev)
+{
+	debugfs_remove_recursive(ndev->debugfs_dir);
 }
 
-static int ntb_xeon_setup(struct ntb_device *ndev)
+static int intel_ntb_mw_count(struct ntb_dev *ntb)
 {
-	switch (ndev->conn_type) {
-	case NTB_CONN_B2B:
-		ndev->reg_ofs.ldb = ndev->reg_base + SNB_PDOORBELL_OFFSET;
-		ndev->reg_ofs.ldb_mask = ndev->reg_base + SNB_PDBMSK_OFFSET;
-		ndev->reg_ofs.spad_read = ndev->reg_base + SNB_SPAD_OFFSET;
-		ndev->reg_ofs.bar2_xlat = ndev->reg_base + SNB_SBAR2XLAT_OFFSET;
-		ndev->reg_ofs.bar4_xlat = ndev->reg_base + SNB_SBAR4XLAT_OFFSET;
-		if (ndev->split_bar)
-			ndev->reg_ofs.bar5_xlat =
-				ndev->reg_base + SNB_SBAR5XLAT_OFFSET;
-		ndev->limits.max_spads = SNB_MAX_B2B_SPADS;
+	return ntb_ndev(ntb)->mw_count;
+}
 
-		/* There is a Xeon hardware errata related to writes to
-		 * SDOORBELL or B2BDOORBELL in conjunction with inbound access
-		 * to NTB MMIO Space, which may hang the system.  To workaround
-		 * this use the second memory window to access the interrupt and
-		 * scratch pad registers on the remote system.
-		 */
-		if (ndev->wa_flags & WA_SNB_ERR) {
-			if (!ndev->mw[ndev->limits.max_mw - 1].bar_sz)
-				return -EINVAL;
-
-			ndev->limits.max_db_bits = SNB_MAX_DB_BITS;
-			ndev->reg_ofs.spad_write =
-				ndev->mw[ndev->limits.max_mw - 1].vbase +
-				SNB_SPAD_OFFSET;
-			ndev->reg_ofs.rdb =
-				ndev->mw[ndev->limits.max_mw - 1].vbase +
-				SNB_PDOORBELL_OFFSET;
-
-			/* Set the Limit register to 4k, the minimum size, to
-			 * prevent an illegal access
-			 */
-			writeq(ndev->mw[1].bar_sz + 0x1000, ndev->reg_base +
-			       SNB_PBAR4LMT_OFFSET);
-			/* HW errata on the Limit registers.  They can only be
-			 * written when the base register is 4GB aligned and
-			 * < 32bit.  This should already be the case based on
-			 * the driver defaults, but write the Limit registers
-			 * first just in case.
-			 */
-
-			ndev->limits.max_mw = SNB_ERRATA_MAX_MW;
-		} else {
-			/* HW Errata on bit 14 of b2bdoorbell register.  Writes
-			 * will not be mirrored to the remote system.  Shrink
-			 * the number of bits by one, since bit 14 is the last
-			 * bit.
-			 */
-			ndev->limits.max_db_bits = SNB_MAX_DB_BITS - 1;
-			ndev->reg_ofs.spad_write = ndev->reg_base +
-						   SNB_B2B_SPAD_OFFSET;
-			ndev->reg_ofs.rdb = ndev->reg_base +
-					    SNB_B2B_DOORBELL_OFFSET;
-
-			/* Disable the Limit register, just incase it is set to
-			 * something silly. A 64bit write should handle it
-			 * regardless of whether it has a split BAR or not.
-			 */
-			writeq(0, ndev->reg_base + SNB_PBAR4LMT_OFFSET);
-			/* HW errata on the Limit registers.  They can only be
-			 * written when the base register is 4GB aligned and
-			 * < 32bit.  This should already be the case based on
-			 * the driver defaults, but write the Limit registers
-			 * first just in case.
-			 */
-			if (ndev->split_bar)
-				ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
-			else
-				ndev->limits.max_mw = SNB_MAX_MW;
-		}
+static int intel_ntb_mw_get_range(struct ntb_dev *ntb, int idx,
+				  phys_addr_t *base,
+				  resource_size_t *size,
+				  resource_size_t *align,
+				  resource_size_t *align_size)
+{
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
+	int bar;
 
-		/* The Xeon errata workaround requires setting SBAR Base
-		 * addresses to known values, so that the PBAR XLAT can be
-		 * pointed at SBAR0 of the remote system.
-		 */
-		if (ndev->dev_type == NTB_DEV_USD) {
-			writeq(SNB_MBAR23_DSD_ADDR, ndev->reg_base +
-			       SNB_PBAR2XLAT_OFFSET);
-			if (ndev->wa_flags & WA_SNB_ERR)
-				writeq(SNB_MBAR01_DSD_ADDR, ndev->reg_base +
-				       SNB_PBAR4XLAT_OFFSET);
-			else {
-				if (ndev->split_bar) {
-					writel(SNB_MBAR4_DSD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR4XLAT_OFFSET);
-					writel(SNB_MBAR5_DSD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR5XLAT_OFFSET);
-				} else
-					writeq(SNB_MBAR4_DSD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR4XLAT_OFFSET);
-
-				/* B2B_XLAT_OFFSET is a 64bit register, but can
-				 * only take 32bit writes
-				 */
-				writel(SNB_MBAR01_DSD_ADDR & 0xffffffff,
-				       ndev->reg_base + SNB_B2B_XLAT_OFFSETL);
-				writel(SNB_MBAR01_DSD_ADDR >> 32,
-				       ndev->reg_base + SNB_B2B_XLAT_OFFSETU);
-			}
-
-			writeq(SNB_MBAR01_USD_ADDR, ndev->reg_base +
-			       SNB_SBAR0BASE_OFFSET);
-			writeq(SNB_MBAR23_USD_ADDR, ndev->reg_base +
-			       SNB_SBAR2BASE_OFFSET);
-			if (ndev->split_bar) {
-				writel(SNB_MBAR4_USD_ADDR, ndev->reg_base +
-				       SNB_SBAR4BASE_OFFSET);
-				writel(SNB_MBAR5_USD_ADDR, ndev->reg_base +
-				       SNB_SBAR5BASE_OFFSET);
-			} else
-				writeq(SNB_MBAR4_USD_ADDR, ndev->reg_base +
-				       SNB_SBAR4BASE_OFFSET);
-		} else {
-			writeq(SNB_MBAR23_USD_ADDR, ndev->reg_base +
-			       SNB_PBAR2XLAT_OFFSET);
-			if (ndev->wa_flags & WA_SNB_ERR)
-				writeq(SNB_MBAR01_USD_ADDR, ndev->reg_base +
-				       SNB_PBAR4XLAT_OFFSET);
-			else {
-				if (ndev->split_bar) {
-					writel(SNB_MBAR4_USD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR4XLAT_OFFSET);
-					writel(SNB_MBAR5_USD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR5XLAT_OFFSET);
-				} else
-					writeq(SNB_MBAR4_USD_ADDR,
-					       ndev->reg_base +
-					       SNB_PBAR4XLAT_OFFSET);
-
-				/*
-				 * B2B_XLAT_OFFSET is a 64bit register, but can
-				 * only take 32bit writes
-				 */
-				writel(SNB_MBAR01_USD_ADDR & 0xffffffff,
-				       ndev->reg_base + SNB_B2B_XLAT_OFFSETL);
-				writel(SNB_MBAR01_USD_ADDR >> 32,
-				       ndev->reg_base + SNB_B2B_XLAT_OFFSETU);
-			}
-			writeq(SNB_MBAR01_DSD_ADDR, ndev->reg_base +
-			       SNB_SBAR0BASE_OFFSET);
-			writeq(SNB_MBAR23_DSD_ADDR, ndev->reg_base +
-			       SNB_SBAR2BASE_OFFSET);
-			if (ndev->split_bar) {
-				writel(SNB_MBAR4_DSD_ADDR, ndev->reg_base +
-				       SNB_SBAR4BASE_OFFSET);
-				writel(SNB_MBAR5_DSD_ADDR, ndev->reg_base +
-				       SNB_SBAR5BASE_OFFSET);
-			} else
-				writeq(SNB_MBAR4_DSD_ADDR, ndev->reg_base +
-				       SNB_SBAR4BASE_OFFSET);
+	if (idx >= ndev->b2b_idx && !ndev->b2b_off)
+		idx += 1;
 
-		}
-		break;
-	case NTB_CONN_RP:
-		if (ndev->wa_flags & WA_SNB_ERR) {
-			dev_err(&ndev->pdev->dev,
-				"NTB-RP disabled due to hardware errata.\n");
-			return -EINVAL;
-		}
+	bar = ndev_mw_to_bar(ndev, idx);
+	if (bar < 0)
+		return bar;
 
-		/* Scratch pads need to have exclusive access from the primary
-		 * or secondary side.  Halve the num spads so that each side can
-		 * have an equal amount.
-		 */
-		ndev->limits.max_spads = SNB_MAX_COMPAT_SPADS / 2;
-		ndev->limits.max_db_bits = SNB_MAX_DB_BITS;
-		/* Note: The SDOORBELL is the cause of the errata.  You REALLY
-		 * don't want to touch it.
-		 */
-		ndev->reg_ofs.rdb = ndev->reg_base + SNB_SDOORBELL_OFFSET;
-		ndev->reg_ofs.ldb = ndev->reg_base + SNB_PDOORBELL_OFFSET;
-		ndev->reg_ofs.ldb_mask = ndev->reg_base + SNB_PDBMSK_OFFSET;
-		/* Offset the start of the spads to correspond to whether it is
-		 * primary or secondary
-		 */
-		ndev->reg_ofs.spad_write = ndev->reg_base + SNB_SPAD_OFFSET +
-					   ndev->limits.max_spads * 4;
-		ndev->reg_ofs.spad_read = ndev->reg_base + SNB_SPAD_OFFSET;
-		ndev->reg_ofs.bar2_xlat = ndev->reg_base + SNB_SBAR2XLAT_OFFSET;
-		ndev->reg_ofs.bar4_xlat = ndev->reg_base + SNB_SBAR4XLAT_OFFSET;
-		if (ndev->split_bar) {
-			ndev->reg_ofs.bar5_xlat =
-				ndev->reg_base + SNB_SBAR5XLAT_OFFSET;
-			ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
-		} else
-			ndev->limits.max_mw = SNB_MAX_MW;
-		break;
-	case NTB_CONN_TRANSPARENT:
-		if (ndev->wa_flags & WA_SNB_ERR) {
-			dev_err(&ndev->pdev->dev,
-				"NTB-TRANSPARENT disabled due to hardware errata.\n");
-			return -EINVAL;
-		}
+	if (base)
+		*base = pci_resource_start(ndev->ntb.pdev, bar) +
+			(idx == ndev->b2b_idx ? ndev->b2b_off : 0);
 
-		/* Scratch pads need to have exclusive access from the primary
-		 * or secondary side.  Halve the num spads so that each side can
-		 * have an equal amount.
-		 */
-		ndev->limits.max_spads = SNB_MAX_COMPAT_SPADS / 2;
-		ndev->limits.max_db_bits = SNB_MAX_DB_BITS;
-		ndev->reg_ofs.rdb = ndev->reg_base + SNB_PDOORBELL_OFFSET;
-		ndev->reg_ofs.ldb = ndev->reg_base + SNB_SDOORBELL_OFFSET;
-		ndev->reg_ofs.ldb_mask = ndev->reg_base + SNB_SDBMSK_OFFSET;
-		ndev->reg_ofs.spad_write = ndev->reg_base + SNB_SPAD_OFFSET;
-		/* Offset the start of the spads to correspond to whether it is
-		 * primary or secondary
-		 */
-		ndev->reg_ofs.spad_read = ndev->reg_base + SNB_SPAD_OFFSET +
-					  ndev->limits.max_spads * 4;
-		ndev->reg_ofs.bar2_xlat = ndev->reg_base + SNB_PBAR2XLAT_OFFSET;
-		ndev->reg_ofs.bar4_xlat = ndev->reg_base + SNB_PBAR4XLAT_OFFSET;
-
-		if (ndev->split_bar) {
-			ndev->reg_ofs.bar5_xlat =
-				ndev->reg_base + SNB_PBAR5XLAT_OFFSET;
-			ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
-		} else
-			ndev->limits.max_mw = SNB_MAX_MW;
-		break;
-	default:
-		/*
-		 * we should never hit this. the detect function should've
-		 * take cared of everything.
-		 */
-		return -EINVAL;
-	}
+	if (size)
+		*size = pci_resource_len(ndev->ntb.pdev, bar) -
+			(idx == ndev->b2b_idx ? ndev->b2b_off : 0);
 
-	ndev->reg_ofs.lnk_cntl = ndev->reg_base + SNB_NTBCNTL_OFFSET;
-	ndev->reg_ofs.lnk_stat = ndev->reg_base + SNB_SLINK_STATUS_OFFSET;
-	ndev->reg_ofs.spci_cmd = ndev->reg_base + SNB_PCICMD_OFFSET;
+	if (align)
+		*align = pci_resource_len(ndev->ntb.pdev, bar);
 
-	ndev->limits.msix_cnt = SNB_MSIX_CNT;
-	ndev->bits_per_vector = SNB_DB_BITS_PER_VEC;
+	if (align_size)
+		*align_size = 1;
 
 	return 0;
 }
 
-static int ntb_bwd_setup(struct ntb_device *ndev)
+static int intel_ntb_mw_set_trans(struct ntb_dev *ntb, int idx,
+				  dma_addr_t addr, resource_size_t size)
 {
-	int rc;
-	u32 val;
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
+	unsigned long base_reg, xlat_reg, limit_reg;
+	resource_size_t bar_size, mw_size;
+	void __iomem *mmio;
+	u64 base, limit, reg_val;
+	int bar;
 
-	ndev->hw_type = BWD_HW;
+	if (idx >= ndev->b2b_idx && !ndev->b2b_off)
+		idx += 1;
 
-	rc = pci_read_config_dword(ndev->pdev, NTB_PPD_OFFSET, &val);
-	if (rc)
-		return rc;
+	bar = ndev_mw_to_bar(ndev, idx);
+	if (bar < 0)
+		return bar;
 
-	switch ((val & BWD_PPD_CONN_TYPE) >> 8) {
-	case NTB_CONN_B2B:
-		ndev->conn_type = NTB_CONN_B2B;
-		break;
-	case NTB_CONN_RP:
-	default:
-		dev_err(&ndev->pdev->dev, "Unsupported NTB configuration\n");
+	bar_size = pci_resource_len(ndev->ntb.pdev, bar);
+
+	if (idx == ndev->b2b_idx)
+		mw_size = bar_size - ndev->b2b_off;
+	else
+		mw_size = bar_size;
+
+	/* hardware requires that addr is aligned to bar size */
+	if (addr & (bar_size - 1))
 		return -EINVAL;
+
+	/* make sure the range fits in the usable mw size */
+	if (size > mw_size)
+		return -EINVAL;
+
+	mmio = ndev->self_mmio;
+	base_reg = bar0_off(ndev->xlat_reg->bar0_base, bar);
+	xlat_reg = bar2_off(ndev->xlat_reg->bar2_xlat, bar);
+	limit_reg = bar2_off(ndev->xlat_reg->bar2_limit, bar);
+
+	if (bar < 4 || !ndev->bar4_split) {
+		base = ioread64(mmio + base_reg);
+
+		/* Set the limit if supported, if size is not mw_size */
+		if (limit_reg && size != mw_size)
+			limit = base + size;
+		else
+			limit = 0;
+
+		/* set and verify setting the translation address */
+		iowrite64(addr, mmio + xlat_reg);
+		reg_val = ioread64(mmio + xlat_reg);
+		if (reg_val != addr) {
+			iowrite64(0, mmio + xlat_reg);
+			return -EIO;
+		}
+
+		/* set and verify setting the limit */
+		iowrite64(limit, mmio + limit_reg);
+		reg_val = ioread64(mmio + limit_reg);
+		if (reg_val != limit) {
+			iowrite64(base, mmio + limit_reg);
+			iowrite64(0, mmio + xlat_reg);
+			return -EIO;
+		}
+	} else {
+		/* split bar addr range must all be 32 bit */
+		if (addr & (~0ull << 32))
+			return -EINVAL;
+		if ((addr + size) & (~0ull << 32))
+			return -EINVAL;
+
+		base = ioread32(mmio + base_reg);
+
+		/* Set the limit if supported, if size is not mw_size */
+		if (limit_reg && size != mw_size)
+			limit = base + size;
+		else
+			limit = 0;
+
+		/* set and verify setting the translation address */
+		iowrite32(addr, mmio + xlat_reg);
+		reg_val = ioread32(mmio + xlat_reg);
+		if (reg_val != addr) {
+			iowrite32(0, mmio + xlat_reg);
+			return -EIO;
+		}
+
+		/* set and verify setting the limit */
+		iowrite32(limit, mmio + limit_reg);
+		reg_val = ioread32(mmio + limit_reg);
+		if (reg_val != limit) {
+			iowrite32(base, mmio + limit_reg);
+			iowrite32(0, mmio + xlat_reg);
+			return -EIO;
+		}
 	}
 
-	if (val & BWD_PPD_DEV_TYPE)
-		ndev->dev_type = NTB_DEV_DSD;
-	else
-		ndev->dev_type = NTB_DEV_USD;
+	return 0;
+}
 
-	/* Initiate PCI-E link training */
-	rc = pci_write_config_dword(ndev->pdev, NTB_PPD_OFFSET,
-				    val | BWD_PPD_INIT_LINK);
-	if (rc)
-		return rc;
+static int intel_ntb_link_is_up(struct ntb_dev *ntb,
+				enum ntb_speed *speed,
+				enum ntb_width *width)
+{
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	ndev->reg_ofs.ldb = ndev->reg_base + BWD_PDOORBELL_OFFSET;
-	ndev->reg_ofs.ldb_mask = ndev->reg_base + BWD_PDBMSK_OFFSET;
-	ndev->reg_ofs.rdb = ndev->reg_base + BWD_B2B_DOORBELL_OFFSET;
-	ndev->reg_ofs.bar2_xlat = ndev->reg_base + BWD_SBAR2XLAT_OFFSET;
-	ndev->reg_ofs.bar4_xlat = ndev->reg_base + BWD_SBAR4XLAT_OFFSET;
-	ndev->reg_ofs.lnk_cntl = ndev->reg_base + BWD_NTBCNTL_OFFSET;
-	ndev->reg_ofs.lnk_stat = ndev->reg_base + BWD_LINK_STATUS_OFFSET;
-	ndev->reg_ofs.spad_read = ndev->reg_base + BWD_SPAD_OFFSET;
-	ndev->reg_ofs.spad_write = ndev->reg_base + BWD_B2B_SPAD_OFFSET;
-	ndev->reg_ofs.spci_cmd = ndev->reg_base + BWD_PCICMD_OFFSET;
-	ndev->limits.max_mw = BWD_MAX_MW;
-	ndev->limits.max_spads = BWD_MAX_SPADS;
-	ndev->limits.max_db_bits = BWD_MAX_DB_BITS;
-	ndev->limits.msix_cnt = BWD_MSIX_CNT;
-	ndev->bits_per_vector = BWD_DB_BITS_PER_VEC;
-
-	/* Since bwd doesn't have a link interrupt, setup a poll timer */
-	INIT_DELAYED_WORK(&ndev->hb_timer, bwd_link_poll);
-	INIT_DELAYED_WORK(&ndev->lr_timer, bwd_link_recovery);
-	schedule_delayed_work(&ndev->hb_timer, NTB_HB_TIMEOUT);
+	if (ndev->reg->link_is_up(ndev)) {
+		if (speed)
+			*speed = NTB_LNK_STA_SPEED(ndev->lnk_sta);
+		if (width)
+			*width = NTB_LNK_STA_WIDTH(ndev->lnk_sta);
+		return 1;
+	} else {
+		/* TODO MAYBE: is it possible to observe the link speed and
+		 * width while link is training? */
+		if (speed)
+			*speed = NTB_SPEED_NONE;
+		if (width)
+			*width = NTB_WIDTH_NONE;
+		return 0;
+	}
+}
+
+static int intel_ntb_link_enable(struct ntb_dev *ntb,
+				 enum ntb_speed max_speed,
+				 enum ntb_width max_width)
+{
+	struct intel_ntb_dev *ndev;
+	u32 ntb_ctl;
+
+	ndev = container_of(ntb, struct intel_ntb_dev, ntb);
+
+	if (ndev->ntb.topo == NTB_TOPO_SEC)
+		return -EINVAL;
+
+	dev_dbg(ndev_dev(ndev),
+		"Enabling link with max_speed %d max_width %d\n",
+		max_speed, max_width);
+	if (max_speed != NTB_SPEED_AUTO)
+		dev_dbg(ndev_dev(ndev), "ignoring max_speed %d\n", max_speed);
+	if (max_width != NTB_WIDTH_AUTO)
+		dev_dbg(ndev_dev(ndev), "ignoring max_width %d\n", max_width);
+
+	ntb_ctl = ioread32(ndev->self_mmio + ndev->reg->ntb_ctl);
+	ntb_ctl &= ~(NTB_CTL_DISABLE | NTB_CTL_CFG_LOCK);
+	ntb_ctl |= NTB_CTL_P2S_BAR2_SNOOP | NTB_CTL_S2P_BAR2_SNOOP;
+	ntb_ctl |= NTB_CTL_P2S_BAR4_SNOOP | NTB_CTL_S2P_BAR4_SNOOP;
+	if (ndev->bar4_split)
+		ntb_ctl |= NTB_CTL_P2S_BAR5_SNOOP | NTB_CTL_S2P_BAR5_SNOOP;
+	iowrite32(ntb_ctl, ndev->self_mmio + ndev->reg->ntb_ctl);
 
 	return 0;
 }
 
-static int ntb_device_setup(struct ntb_device *ndev)
+static int intel_ntb_link_disable(struct ntb_dev *ntb)
 {
-	int rc;
+	struct intel_ntb_dev *ndev;
+	u32 ntb_cntl;
 
-	if (is_ntb_xeon(ndev))
-		rc = ntb_xeon_setup(ndev);
-	else if (is_ntb_atom(ndev))
-		rc = ntb_bwd_setup(ndev);
-	else
-		rc = -ENODEV;
+	ndev = container_of(ntb, struct intel_ntb_dev, ntb);
 
-	if (rc)
-		return rc;
+	if (ndev->ntb.topo == NTB_TOPO_SEC)
+		return -EINVAL;
 
-	if (ndev->conn_type == NTB_CONN_B2B)
-		/* Enable Bus Master and Memory Space on the secondary side */
-		writew(PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER,
-		       ndev->reg_ofs.spci_cmd);
+	dev_dbg(ndev_dev(ndev), "Disabling link\n");
+
+	/* Bring NTB link down */
+	ntb_cntl = ioread32(ndev->self_mmio + ndev->reg->ntb_ctl);
+	ntb_cntl &= ~(NTB_CTL_P2S_BAR2_SNOOP | NTB_CTL_S2P_BAR2_SNOOP);
+	ntb_cntl &= ~(NTB_CTL_P2S_BAR4_SNOOP | NTB_CTL_S2P_BAR4_SNOOP);
+	if (ndev->bar4_split)
+		ntb_cntl &= ~(NTB_CTL_P2S_BAR5_SNOOP | NTB_CTL_S2P_BAR5_SNOOP);
+	ntb_cntl |= NTB_CTL_DISABLE | NTB_CTL_CFG_LOCK;
+	iowrite32(ntb_cntl, ndev->self_mmio + ndev->reg->ntb_ctl);
 
 	return 0;
 }
 
-static void ntb_device_free(struct ntb_device *ndev)
+static int intel_ntb_db_is_unsafe(struct ntb_dev *ntb)
 {
-	if (is_ntb_atom(ndev)) {
-		cancel_delayed_work_sync(&ndev->hb_timer);
-		cancel_delayed_work_sync(&ndev->lr_timer);
-	}
+	return ndev_ignore_unsafe(ntb_ndev(ntb), NTB_UNSAFE_DB);
 }
 
-static irqreturn_t bwd_callback_msix_irq(int irq, void *data)
+static u64 intel_ntb_db_valid_mask(struct ntb_dev *ntb)
 {
-	struct ntb_db_cb *db_cb = data;
-	struct ntb_device *ndev = db_cb->ndev;
-	unsigned long mask;
+	return ntb_ndev(ntb)->db_valid_mask;
+}
 
-	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for DB %d\n", irq,
-		db_cb->db_num);
+static int intel_ntb_db_vector_count(struct ntb_dev *ntb)
+{
+	struct intel_ntb_dev *ndev;
 
-	mask = readw(ndev->reg_ofs.ldb_mask);
-	set_bit(db_cb->db_num * ndev->bits_per_vector, &mask);
-	writew(mask, ndev->reg_ofs.ldb_mask);
+	ndev = container_of(ntb, struct intel_ntb_dev, ntb);
 
-	tasklet_schedule(&db_cb->irq_work);
+	return ndev->db_vec_count;
+}
 
-	/* No need to check for the specific HB irq, any interrupt means
-	 * we're connected.
-	 */
-	ndev->last_ts = jiffies;
+static u64 intel_ntb_db_vector_mask(struct ntb_dev *ntb, int db_vector)
+{
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	writeq((u64) 1 << db_cb->db_num, ndev->reg_ofs.ldb);
+	if (db_vector < 0 || db_vector > ndev->db_vec_count)
+		return 0;
 
-	return IRQ_HANDLED;
+	return ndev->db_valid_mask & ndev_vec_mask(ndev, db_vector);
 }
 
-static irqreturn_t xeon_callback_msix_irq(int irq, void *data)
+static u64 intel_ntb_db_read(struct ntb_dev *ntb)
 {
-	struct ntb_db_cb *db_cb = data;
-	struct ntb_device *ndev = db_cb->ndev;
-	unsigned long mask;
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for DB %d\n", irq,
-		db_cb->db_num);
+	return ndev_db_read(ndev,
+			    ndev->self_mmio +
+			    ndev->self_reg->db_bell);
+}
 
-	mask = readw(ndev->reg_ofs.ldb_mask);
-	set_bit(db_cb->db_num * ndev->bits_per_vector, &mask);
-	writew(mask, ndev->reg_ofs.ldb_mask);
+static int intel_ntb_db_clear(struct ntb_dev *ntb, u64 db_bits)
+{
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	tasklet_schedule(&db_cb->irq_work);
+	return ndev_db_write(ndev, db_bits,
+			     ndev->self_mmio +
+			     ndev->self_reg->db_bell);
+}
 
-	/* On Sandybridge, there are 16 bits in the interrupt register
-	 * but only 4 vectors.  So, 5 bits are assigned to the first 3
-	 * vectors, with the 4th having a single bit for link
-	 * interrupts.
-	 */
-	writew(((1 << ndev->bits_per_vector) - 1) <<
-	       (db_cb->db_num * ndev->bits_per_vector), ndev->reg_ofs.ldb);
+static int intel_ntb_db_set_mask(struct ntb_dev *ntb, u64 db_bits)
+{
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	return IRQ_HANDLED;
+	return ndev_db_set_mask(ndev, db_bits,
+				ndev->self_mmio +
+				ndev->self_reg->db_mask);
 }
 
-/* Since we do not have a HW doorbell in BWD, this is only used in JF/JT */
-static irqreturn_t xeon_event_msix_irq(int irq, void *dev)
+static int intel_ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits)
 {
-	struct ntb_device *ndev = dev;
-	int rc;
-
-	dev_dbg(&ndev->pdev->dev, "MSI-X irq %d received for Events\n", irq);
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	rc = ntb_link_status(ndev);
-	if (rc)
-		dev_err(&ndev->pdev->dev, "Error determining link status\n");
+	return ndev_db_clear_mask(ndev, db_bits,
+				  ndev->self_mmio +
+				  ndev->self_reg->db_mask);
+}
 
-	/* bit 15 is always the link bit */
-	writew(1 << SNB_LINK_DB, ndev->reg_ofs.ldb);
+static int intel_ntb_peer_db_addr(struct ntb_dev *ntb,
+				  phys_addr_t *db_addr,
+				  resource_size_t *db_size)
+{
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	return IRQ_HANDLED;
+	return ndev_db_addr(ndev, db_addr, db_size, ndev->peer_addr,
+			    ndev->peer_reg->db_bell);
 }
 
-static irqreturn_t ntb_interrupt(int irq, void *dev)
+static int intel_ntb_peer_db_set(struct ntb_dev *ntb, u64 db_bits)
 {
-	struct ntb_device *ndev = dev;
-	unsigned int i = 0;
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	if (is_ntb_atom(ndev)) {
-		u64 ldb = readq(ndev->reg_ofs.ldb);
+	return ndev_db_write(ndev, db_bits,
+			     ndev->peer_mmio +
+			     ndev->peer_reg->db_bell);
+}
 
-		dev_dbg(&ndev->pdev->dev, "irq %d - ldb = %Lx\n", irq, ldb);
+static int intel_ntb_spad_is_unsafe(struct ntb_dev *ntb)
+{
+	return ndev_ignore_unsafe(ntb_ndev(ntb), NTB_UNSAFE_SPAD);
+}
 
-		while (ldb) {
-			i = __ffs(ldb);
-			ldb &= ldb - 1;
-			bwd_callback_msix_irq(irq, &ndev->db_cb[i]);
-		}
-	} else {
-		u16 ldb = readw(ndev->reg_ofs.ldb);
+static int intel_ntb_spad_count(struct ntb_dev *ntb)
+{
+	struct intel_ntb_dev *ndev;
 
-		dev_dbg(&ndev->pdev->dev, "irq %d - ldb = %x\n", irq, ldb);
+	ndev = container_of(ntb, struct intel_ntb_dev, ntb);
 
-		if (ldb & SNB_DB_HW_LINK) {
-			xeon_event_msix_irq(irq, dev);
-			ldb &= ~SNB_DB_HW_LINK;
-		}
+	return ndev->spad_count;
+}
 
-		while (ldb) {
-			i = __ffs(ldb);
-			ldb &= ldb - 1;
-			xeon_callback_msix_irq(irq, &ndev->db_cb[i]);
-		}
-	}
+static u32 intel_ntb_spad_read(struct ntb_dev *ntb, int idx)
+{
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	return IRQ_HANDLED;
+	return ndev_spad_read(ndev, idx,
+			      ndev->self_mmio +
+			      ndev->self_reg->spad);
 }
 
-static int ntb_setup_snb_msix(struct ntb_device *ndev, int msix_entries)
+static int intel_ntb_spad_write(struct ntb_dev *ntb,
+				int idx, u32 val)
 {
-	struct pci_dev *pdev = ndev->pdev;
-	struct msix_entry *msix;
-	int rc, i;
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	if (msix_entries < ndev->limits.msix_cnt)
-		return -ENOSPC;
+	return ndev_spad_write(ndev, idx, val,
+			       ndev->self_mmio +
+			       ndev->self_reg->spad);
+}
 
-	rc = pci_enable_msix_exact(pdev, ndev->msix_entries, msix_entries);
-	if (rc < 0)
-		return rc;
+static int intel_ntb_peer_spad_addr(struct ntb_dev *ntb, int idx,
+				    phys_addr_t *spad_addr)
+{
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	for (i = 0; i < msix_entries; i++) {
-		msix = &ndev->msix_entries[i];
-		WARN_ON(!msix->vector);
+	return ndev_spad_addr(ndev, idx, spad_addr, ndev->peer_addr,
+			      ndev->peer_reg->spad);
+}
 
-		if (i == msix_entries - 1) {
-			rc = request_irq(msix->vector,
-					 xeon_event_msix_irq, 0,
-					 "ntb-event-msix", ndev);
-			if (rc)
-				goto err;
-		} else {
-			rc = request_irq(msix->vector,
-					 xeon_callback_msix_irq, 0,
-					 "ntb-callback-msix",
-					 &ndev->db_cb[i]);
-			if (rc)
-				goto err;
-		}
-	}
+static u32 intel_ntb_peer_spad_read(struct ntb_dev *ntb, int idx)
+{
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-	ndev->num_msix = msix_entries;
-	ndev->max_cbs = msix_entries - 1;
+	return ndev_spad_read(ndev, idx,
+			      ndev->peer_mmio +
+			      ndev->peer_reg->spad);
+}
 
-	return 0;
+static int intel_ntb_peer_spad_write(struct ntb_dev *ntb,
+				     int idx, u32 val)
+{
+	struct intel_ntb_dev *ndev = ntb_ndev(ntb);
 
-err:
-	while (--i >= 0) {
-		/* Code never reaches here for entry nr 'ndev->num_msix - 1' */
-		msix = &ndev->msix_entries[i];
-		free_irq(msix->vector, &ndev->db_cb[i]);
-	}
+	return ndev_spad_write(ndev, idx, val,
+			       ndev->peer_mmio +
+			       ndev->peer_reg->spad);
+}
 
-	pci_disable_msix(pdev);
-	ndev->num_msix = 0;
+/* BWD */
 
-	return rc;
+static u64 bwd_db_ioread(void __iomem *mmio)
+{
+	return ioread64(mmio);
+}
+
+static void bwd_db_iowrite(u64 bits, void __iomem *mmio)
+{
+	iowrite64(bits, mmio);
 }
 
-static int ntb_setup_bwd_msix(struct ntb_device *ndev, int msix_entries)
+static int bwd_poll_link(struct intel_ntb_dev *ndev)
 {
-	struct pci_dev *pdev = ndev->pdev;
-	struct msix_entry *msix;
-	int rc, i;
+	u32 ntb_ctl;
 
-	msix_entries = pci_enable_msix_range(pdev, ndev->msix_entries,
-					     1, msix_entries);
-	if (msix_entries < 0)
-		return msix_entries;
+	ntb_ctl = ioread32(ndev->self_mmio + BWD_NTBCNTL_OFFSET);
 
-	for (i = 0; i < msix_entries; i++) {
-		msix = &ndev->msix_entries[i];
-		WARN_ON(!msix->vector);
+	if (ntb_ctl == ndev->ntb_ctl)
+		return 0;
 
-		rc = request_irq(msix->vector, bwd_callback_msix_irq, 0,
-				 "ntb-callback-msix", &ndev->db_cb[i]);
-		if (rc)
-			goto err;
-	}
+	ndev->ntb_ctl = ntb_ctl;
 
-	ndev->num_msix = msix_entries;
-	ndev->max_cbs = msix_entries;
+	ndev->lnk_sta = ioread32(ndev->self_mmio + BWD_LINK_STATUS_OFFSET);
 
-	return 0;
+	return 1;
+}
 
-err:
-	while (--i >= 0)
-		free_irq(msix->vector, &ndev->db_cb[i]);
+static int bwd_link_is_up(struct intel_ntb_dev *ndev)
+{
+	return BWD_NTB_CTL_ACTIVE(ndev->ntb_ctl);
+}
 
-	pci_disable_msix(pdev);
-	ndev->num_msix = 0;
+static int bwd_link_is_err(struct intel_ntb_dev *ndev)
+{
+	if (ioread32(ndev->self_mmio + BWD_LTSSMSTATEJMP_OFFSET)
+	    & BWD_LTSSMSTATEJMP_FORCEDETECT)
+		return 1;
 
-	return rc;
+	if (ioread32(ndev->self_mmio + BWD_IBSTERRRCRVSTS0_OFFSET)
+	    & BWD_IBIST_ERR_OFLOW)
+		return 1;
+
+	return 0;
 }
 
-static int ntb_setup_msix(struct ntb_device *ndev)
+static inline enum ntb_topo bwd_ppd_topo(struct intel_ntb_dev *ndev, u32 ppd)
 {
-	struct pci_dev *pdev = ndev->pdev;
-	int msix_entries;
-	int rc, i;
+	switch (ppd & BWD_PPD_TOPO_MASK) {
+	case BWD_PPD_TOPO_B2B_USD:
+		dev_dbg(ndev_dev(ndev), "PPD %d B2B USD\n", ppd);
+		return NTB_TOPO_B2B_USD;
+
+	case BWD_PPD_TOPO_B2B_DSD:
+		dev_dbg(ndev_dev(ndev), "PPD %d B2B DSD\n", ppd);
+		return NTB_TOPO_B2B_DSD;
+
+	case BWD_PPD_TOPO_PRI_USD:
+	case BWD_PPD_TOPO_PRI_DSD: /* accept bogus PRI_DSD */
+	case BWD_PPD_TOPO_SEC_USD:
+	case BWD_PPD_TOPO_SEC_DSD: /* accept bogus SEC_DSD */
+		dev_dbg(ndev_dev(ndev), "PPD %d non B2B disabled\n", ppd);
+		return NTB_TOPO_NONE;
+	}
 
-	msix_entries = pci_msix_vec_count(pdev);
-	if (msix_entries < 0) {
-		rc = msix_entries;
-		goto err;
-	} else if (msix_entries > ndev->limits.msix_cnt) {
-		rc = -EINVAL;
-		goto err;
+	dev_dbg(ndev_dev(ndev), "PPD %d invalid\n", ppd);
+	return NTB_TOPO_NONE;
+}
+
+static void bwd_link_hb(struct work_struct *work)
+{
+	struct intel_ntb_dev *ndev = hb_ndev(work);
+	unsigned long poll_ts;
+	void __iomem *mmio;
+	u32 status32;
+
+	poll_ts = ndev->last_ts + BWD_LINK_HB_TIMEOUT;
+
+	/* Delay polling the link status if an interrupt was received,
+	 * unless the cached link status says the link is down.
+	 */
+	if (time_after(poll_ts, jiffies) && bwd_link_is_up(ndev)) {
+		schedule_delayed_work(&ndev->hb_timer, poll_ts - jiffies);
+		return;
 	}
 
-	ndev->msix_entries = kmalloc(sizeof(struct msix_entry) * msix_entries,
-				     GFP_KERNEL);
-	if (!ndev->msix_entries) {
-		rc = -ENOMEM;
-		goto err;
+	if (bwd_poll_link(ndev))
+		ntb_link_event(&ndev->ntb);
+
+	if (bwd_link_is_up(ndev) || !bwd_link_is_err(ndev)) {
+		schedule_delayed_work(&ndev->hb_timer, BWD_LINK_HB_TIMEOUT);
+		return;
 	}
 
-	for (i = 0; i < msix_entries; i++)
-		ndev->msix_entries[i].entry = i;
+	/* Link is down with error: recover the link! */
 
-	if (is_ntb_atom(ndev))
-		rc = ntb_setup_bwd_msix(ndev, msix_entries);
-	else
-		rc = ntb_setup_snb_msix(ndev, msix_entries);
-	if (rc)
-		goto err1;
+	mmio = ndev->self_mmio;
 
-	return 0;
+	/* Driver resets the NTB ModPhy lanes - magic! */
+	iowrite8(0xe0, mmio + BWD_MODPHY_PCSREG6);
+	iowrite8(0x40, mmio + BWD_MODPHY_PCSREG4);
+	iowrite8(0x60, mmio + BWD_MODPHY_PCSREG4);
+	iowrite8(0x60, mmio + BWD_MODPHY_PCSREG6);
 
-err1:
-	kfree(ndev->msix_entries);
-err:
-	dev_err(&pdev->dev, "Error allocating MSI-X interrupt\n");
-	return rc;
+	/* Driver waits 100ms to allow the NTB ModPhy to settle */
+	msleep(100);
+
+	/* Clear AER Errors, write to clear */
+	status32 = ioread32(mmio + BWD_ERRCORSTS_OFFSET);
+	dev_dbg(ndev_dev(ndev), "ERRCORSTS = %x\n", status32);
+	status32 &= PCI_ERR_COR_REP_ROLL;
+	iowrite32(status32, mmio + BWD_ERRCORSTS_OFFSET);
+
+	/* Clear unexpected electrical idle event in LTSSM, write to clear */
+	status32 = ioread32(mmio + BWD_LTSSMERRSTS0_OFFSET);
+	dev_dbg(ndev_dev(ndev), "LTSSMERRSTS0 = %x\n", status32);
+	status32 |= BWD_LTSSMERRSTS0_UNEXPECTEDEI;
+	iowrite32(status32, mmio + BWD_LTSSMERRSTS0_OFFSET);
+
+	/* Clear DeSkew Buffer error, write to clear */
+	status32 = ioread32(mmio + BWD_DESKEWSTS_OFFSET);
+	dev_dbg(ndev_dev(ndev), "DESKEWSTS = %x\n", status32);
+	status32 |= BWD_DESKEWSTS_DBERR;
+	iowrite32(status32, mmio + BWD_DESKEWSTS_OFFSET);
+
+	status32 = ioread32(mmio + BWD_IBSTERRRCRVSTS0_OFFSET);
+	dev_dbg(ndev_dev(ndev), "IBSTERRRCRVSTS0 = %x\n", status32);
+	status32 &= BWD_IBIST_ERR_OFLOW;
+	iowrite32(status32, mmio + BWD_IBSTERRRCRVSTS0_OFFSET);
+
+	/* Releases the NTB state machine to allow the link to retrain */
+	status32 = ioread32(mmio + BWD_LTSSMSTATEJMP_OFFSET);
+	dev_dbg(ndev_dev(ndev), "LTSSMSTATEJMP = %x\n", status32);
+	status32 &= ~BWD_LTSSMSTATEJMP_FORCEDETECT;
+	iowrite32(status32, mmio + BWD_LTSSMSTATEJMP_OFFSET);
+
+	/* There is a potential race between the 2 NTB devices recovering at the
+	 * same time.  If the times are the same, the link will not recover and
+	 * the driver will be stuck in this loop forever.  Add a random interval
+	 * to the recovery time to prevent this race.
+	 */
+	schedule_delayed_work(&ndev->hb_timer, BWD_LINK_RECOVERY_TIME
+			      + prandom_u32() % BWD_LINK_RECOVERY_TIME);
 }
 
-static int ntb_setup_msi(struct ntb_device *ndev)
+static int bwd_init_isr(struct intel_ntb_dev *ndev)
 {
-	struct pci_dev *pdev = ndev->pdev;
 	int rc;
 
-	rc = pci_enable_msi(pdev);
+	rc = ndev_init_isr(ndev, 1, BWD_DB_MSIX_VECTOR_COUNT,
+			   BWD_DB_MSIX_VECTOR_SHIFT, BWD_DB_TOTAL_SHIFT);
 	if (rc)
 		return rc;
 
-	rc = request_irq(pdev->irq, ntb_interrupt, 0, "ntb-msi", ndev);
-	if (rc) {
-		pci_disable_msi(pdev);
-		dev_err(&pdev->dev, "Error allocating MSI interrupt\n");
-		return rc;
-	}
+	/* BWD doesn't have link status interrupt, poll on that platform */
+	ndev->last_ts = jiffies;
+	INIT_DELAYED_WORK(&ndev->hb_timer, bwd_link_hb);
+	schedule_delayed_work(&ndev->hb_timer, BWD_LINK_HB_TIMEOUT);
 
 	return 0;
 }
 
-static int ntb_setup_intx(struct ntb_device *ndev)
+static void bwd_deinit_isr(struct intel_ntb_dev *ndev)
 {
-	struct pci_dev *pdev = ndev->pdev;
-	int rc;
+	cancel_delayed_work_sync(&ndev->hb_timer);
+	ndev_deinit_isr(ndev);
+}
 
-	/* Verify intx is enabled */
-	pci_intx(pdev, 1);
+static int bwd_init_ntb(struct intel_ntb_dev *ndev)
+{
+	ndev->mw_count = BWD_MW_COUNT;
+	ndev->spad_count = BWD_SPAD_COUNT;
+	ndev->db_count = BWD_DB_COUNT;
 
-	rc = request_irq(pdev->irq, ntb_interrupt, IRQF_SHARED, "ntb-intx",
-			 ndev);
-	if (rc)
-		return rc;
+	switch (ndev->ntb.topo) {
+	case NTB_TOPO_B2B_USD:
+	case NTB_TOPO_B2B_DSD:
+		ndev->self_reg = &bwd_pri_reg;
+		ndev->peer_reg = &bwd_b2b_reg;
+		ndev->xlat_reg = &bwd_sec_xlat;
+
+		/* Enable Bus Master and Memory Space on the secondary side */
+		iowrite16(PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER,
+			  ndev->self_mmio + BWD_SPCICMD_OFFSET);
+
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	ndev->db_valid_mask = BIT_ULL(ndev->db_count) - 1;
 
 	return 0;
 }
 
-static int ntb_setup_interrupts(struct ntb_device *ndev)
+static int bwd_init_dev(struct intel_ntb_dev *ndev)
 {
+	u32 ppd;
 	int rc;
 
-	/* On BWD, disable all interrupts.  On SNB, disable all but Link
-	 * Interrupt.  The rest will be unmasked as callbacks are registered.
-	 */
-	if (is_ntb_atom(ndev))
-		writeq(~0, ndev->reg_ofs.ldb_mask);
-	else {
-		u16 var = 1 << SNB_LINK_DB;
-		writew(~var, ndev->reg_ofs.ldb_mask);
-	}
-
-	rc = ntb_setup_msix(ndev);
-	if (!rc)
-		goto done;
+	rc = pci_read_config_dword(ndev->ntb.pdev, BWD_PPD_OFFSET, &ppd);
+	if (rc)
+		return -EIO;
 
-	ndev->bits_per_vector = 1;
-	ndev->max_cbs = ndev->limits.max_db_bits;
+	ndev->ntb.topo = bwd_ppd_topo(ndev, ppd);
+	if (ndev->ntb.topo == NTB_TOPO_NONE)
+		return -EINVAL;
 
-	rc = ntb_setup_msi(ndev);
-	if (!rc)
-		goto done;
+	rc = bwd_init_ntb(ndev);
+	if (rc)
+		return rc;
 
-	rc = ntb_setup_intx(ndev);
-	if (rc) {
-		dev_err(&ndev->pdev->dev, "no usable interrupts\n");
+	rc = bwd_init_isr(ndev);
+	if (rc)
 		return rc;
+
+	if (ndev->ntb.topo != NTB_TOPO_SEC) {
+		/* Initiate PCI-E link training */
+		rc = pci_write_config_dword(ndev->ntb.pdev, BWD_PPD_OFFSET,
+					    ppd | BWD_PPD_INIT_LINK);
+		if (rc)
+			return rc;
 	}
 
-done:
 	return 0;
 }
 
-static void ntb_free_interrupts(struct ntb_device *ndev)
+static void bwd_deinit_dev(struct intel_ntb_dev *ndev)
 {
-	struct pci_dev *pdev = ndev->pdev;
+	bwd_deinit_isr(ndev);
+}
 
-	/* mask interrupts */
-	if (is_ntb_atom(ndev))
-		writeq(~0, ndev->reg_ofs.ldb_mask);
-	else
-		writew(~0, ndev->reg_ofs.ldb_mask);
+/* SNB */
 
-	if (ndev->num_msix) {
-		struct msix_entry *msix;
-		u32 i;
+static u64 snb_db_ioread(void __iomem *mmio)
+{
+	return (u64)ioread16(mmio);
+}
 
-		for (i = 0; i < ndev->num_msix; i++) {
-			msix = &ndev->msix_entries[i];
-			if (is_ntb_xeon(ndev) && i == ndev->num_msix - 1)
-				free_irq(msix->vector, ndev);
-			else
-				free_irq(msix->vector, &ndev->db_cb[i]);
-		}
-		pci_disable_msix(pdev);
-		kfree(ndev->msix_entries);
-	} else {
-		free_irq(pdev->irq, ndev);
+static void snb_db_iowrite(u64 bits, void __iomem *mmio)
+{
+	iowrite16((u16)bits, mmio);
+}
 
-		if (pci_dev_msi_enabled(pdev))
-			pci_disable_msi(pdev);
-	}
+static int snb_poll_link(struct intel_ntb_dev *ndev)
+{
+	u16 reg_val;
+	int rc;
+
+	ndev->reg->db_iowrite(ndev->db_link_mask,
+			      ndev->self_mmio +
+			      ndev->self_reg->db_bell);
+
+	rc = pci_read_config_word(ndev->ntb.pdev,
+				  SNB_LINK_STATUS_OFFSET, &reg_val);
+	if (rc)
+		return 0;
+
+	if (reg_val == ndev->lnk_sta)
+		return 0;
+
+	ndev->lnk_sta = reg_val;
+
+	return 1;
 }
 
-static int ntb_create_callbacks(struct ntb_device *ndev)
+static int snb_link_is_up(struct intel_ntb_dev *ndev)
 {
-	int i;
+	return NTB_LNK_STA_ACTIVE(ndev->lnk_sta);
+}
 
-	/* Chicken-egg issue.  We won't know how many callbacks are necessary
-	 * until we see how many MSI-X vectors we get, but these pointers need
-	 * to be passed into the MSI-X register function.  So, we allocate the
-	 * max, knowing that they might not all be used, to work around this.
-	 */
-	ndev->db_cb = kcalloc(ndev->limits.max_db_bits,
-			      sizeof(struct ntb_db_cb),
-			      GFP_KERNEL);
-	if (!ndev->db_cb)
-		return -ENOMEM;
+static inline enum ntb_topo snb_ppd_topo(struct intel_ntb_dev *ndev, u8 ppd)
+{
+	switch (ppd & SNB_PPD_TOPO_MASK) {
+	case SNB_PPD_TOPO_B2B_USD:
+		return NTB_TOPO_B2B_USD;
+
+	case SNB_PPD_TOPO_B2B_DSD:
+		return NTB_TOPO_B2B_DSD;
+
+	case SNB_PPD_TOPO_PRI_USD:
+	case SNB_PPD_TOPO_PRI_DSD: /* accept bogus PRI_DSD */
+		return NTB_TOPO_PRI;
 
-	for (i = 0; i < ndev->limits.max_db_bits; i++) {
-		ndev->db_cb[i].db_num = i;
-		ndev->db_cb[i].ndev = ndev;
+	case SNB_PPD_TOPO_SEC_USD:
+	case SNB_PPD_TOPO_SEC_DSD: /* accept bogus SEC_DSD */
+		return NTB_TOPO_SEC;
 	}
 
-	return 0;
+	return NTB_TOPO_NONE;
 }
 
-static void ntb_free_callbacks(struct ntb_device *ndev)
+static inline int snb_ppd_bar4_split(struct intel_ntb_dev *ndev, u8 ppd)
 {
-	int i;
+	if (ppd & SNB_PPD_SPLIT_BAR_MASK) {
+		dev_dbg(ndev_dev(ndev), "PPD %d split bar\n", ppd);
+		return 1;
+	}
+	return 0;
+}
 
-	for (i = 0; i < ndev->limits.max_db_bits; i++)
-		ntb_unregister_db_callback(ndev, i);
+static int snb_init_isr(struct intel_ntb_dev *ndev)
+{
+	return ndev_init_isr(ndev, SNB_DB_MSIX_VECTOR_COUNT,
+			     SNB_DB_MSIX_VECTOR_COUNT,
+			     SNB_DB_MSIX_VECTOR_SHIFT,
+			     SNB_DB_TOTAL_SHIFT);
+}
 
-	kfree(ndev->db_cb);
+static void snb_deinit_isr(struct intel_ntb_dev *ndev)
+{
+	ndev_deinit_isr(ndev);
 }
 
-static ssize_t ntb_debugfs_read(struct file *filp, char __user *ubuf,
-				size_t count, loff_t *offp)
+static int snb_setup_b2b_mw(struct intel_ntb_dev *ndev,
+			    const struct intel_b2b_addr *addr,
+			    const struct intel_b2b_addr *peer_addr)
 {
-	struct ntb_device *ndev;
-	char *buf;
-	ssize_t ret, offset, out_count;
+	struct pci_dev *pdev;
+	void __iomem *mmio;
+	resource_size_t bar_size;
+	phys_addr_t bar_addr;
+	int b2b_bar;
+	u8 bar_sz;
+
+	pdev = ndev_pdev(ndev);
+	mmio = ndev->self_mmio;
+
+	if (ndev->b2b_idx >= ndev->mw_count) {
+		dev_dbg(ndev_dev(ndev), "not using b2b mw\n");
+		b2b_bar = 0;
+		ndev->b2b_off = 0;
+	} else {
+		b2b_bar = ndev_mw_to_bar(ndev, ndev->b2b_idx);
+		if (b2b_bar < 0)
+			return -EIO;
 
-	out_count = 500;
+		dev_dbg(ndev_dev(ndev), "using b2b mw bar %d\n", b2b_bar);
 
-	buf = kmalloc(out_count, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
+		bar_size = pci_resource_len(ndev->ntb.pdev, b2b_bar);
 
-	ndev = filp->private_data;
-	offset = 0;
-	offset += snprintf(buf + offset, out_count - offset,
-			   "NTB Device Information:\n");
-	offset += snprintf(buf + offset, out_count - offset,
-			   "Connection Type - \t\t%s\n",
-			   ndev->conn_type == NTB_CONN_TRANSPARENT ?
-			   "Transparent" : (ndev->conn_type == NTB_CONN_B2B) ?
-			   "Back to back" : "Root Port");
-	offset += snprintf(buf + offset, out_count - offset,
-			   "Device Type - \t\t\t%s\n",
-			   ndev->dev_type == NTB_DEV_USD ?
-			   "DSD/USP" : "USD/DSP");
-	offset += snprintf(buf + offset, out_count - offset,
-			   "Max Number of Callbacks - \t%u\n",
-			   ntb_max_cbs(ndev));
-	offset += snprintf(buf + offset, out_count - offset,
-			   "Link Status - \t\t\t%s\n",
-			   ntb_hw_link_status(ndev) ? "Up" : "Down");
-	if (ntb_hw_link_status(ndev)) {
-		offset += snprintf(buf + offset, out_count - offset,
-				   "Link Speed - \t\t\tPCI-E Gen %u\n",
-				   ndev->link_speed);
-		offset += snprintf(buf + offset, out_count - offset,
-				   "Link Width - \t\t\tx%u\n",
-				   ndev->link_width);
-	}
+		dev_dbg(ndev_dev(ndev), "b2b bar size %#llx\n", bar_size);
 
-	if (is_ntb_xeon(ndev)) {
-		u32 status32;
-		u16 status16;
-		int rc;
-
-		offset += snprintf(buf + offset, out_count - offset,
-				   "\nNTB Device Statistics:\n");
-		offset += snprintf(buf + offset, out_count - offset,
-				   "Upstream Memory Miss - \t%u\n",
-				   readw(ndev->reg_base +
-					 SNB_USMEMMISS_OFFSET));
-
-		offset += snprintf(buf + offset, out_count - offset,
-				   "\nNTB Hardware Errors:\n");
-
-		rc = pci_read_config_word(ndev->pdev, SNB_DEVSTS_OFFSET,
-					  &status16);
-		if (!rc)
-			offset += snprintf(buf + offset, out_count - offset,
-					   "DEVSTS - \t%#06x\n", status16);
-
-		rc = pci_read_config_word(ndev->pdev, SNB_LINK_STATUS_OFFSET,
-					  &status16);
-		if (!rc)
-			offset += snprintf(buf + offset, out_count - offset,
-					   "LNKSTS - \t%#06x\n", status16);
-
-		rc = pci_read_config_dword(ndev->pdev, SNB_UNCERRSTS_OFFSET,
-					   &status32);
-		if (!rc)
-			offset += snprintf(buf + offset, out_count - offset,
-					   "UNCERRSTS - \t%#010x\n", status32);
-
-		rc = pci_read_config_dword(ndev->pdev, SNB_CORERRSTS_OFFSET,
-					   &status32);
-		if (!rc)
-			offset += snprintf(buf + offset, out_count - offset,
-					   "CORERRSTS - \t%#010x\n", status32);
+		if (b2b_mw_share && SNB_B2B_MIN_SIZE <= bar_size >> 1) {
+			dev_dbg(ndev_dev(ndev),
+				"b2b using first half of bar\n");
+			ndev->b2b_off = bar_size >> 1;
+		} else if (SNB_B2B_MIN_SIZE <= bar_size) {
+			dev_dbg(ndev_dev(ndev),
+				"b2b using whole bar\n");
+			ndev->b2b_off = 0;
+			--ndev->mw_count;
+		} else {
+			dev_dbg(ndev_dev(ndev),
+				"b2b bar size is too small\n");
+			return -EIO;
+		}
 	}
 
-	if (offset > out_count)
-		offset = out_count;
+	/* Reset the secondary bar sizes to match the primary bar sizes,
+	 * except disable or halve the size of the b2b secondary bar.
+	 *
+	 * Note: code for each specific bar size register, because the register
+	 * offsets are not in a consistent order (bar5sz comes after ppd, odd).
+	 */
+	pci_read_config_byte(pdev, SNB_PBAR23SZ_OFFSET, &bar_sz);
+	dev_dbg(ndev_dev(ndev), "PBAR23SZ %#x\n", bar_sz);
+	if (b2b_bar == 2) {
+		if (ndev->b2b_off)
+			bar_sz -= 1;
+		else
+			bar_sz = 0;
+	}
+	pci_write_config_byte(pdev, SNB_SBAR23SZ_OFFSET, bar_sz);
+	pci_read_config_byte(pdev, SNB_SBAR23SZ_OFFSET, &bar_sz);
+	dev_dbg(ndev_dev(ndev), "SBAR23SZ %#x\n", bar_sz);
+
+	if (!ndev->bar4_split) {
+		pci_read_config_byte(pdev, SNB_PBAR45SZ_OFFSET, &bar_sz);
+		dev_dbg(ndev_dev(ndev), "PBAR45SZ %#x\n", bar_sz);
+		if (b2b_bar == 4) {
+			if (ndev->b2b_off)
+				bar_sz -= 1;
+			else
+				bar_sz = 0;
+		}
+		pci_write_config_byte(pdev, SNB_SBAR45SZ_OFFSET, bar_sz);
+		pci_read_config_byte(pdev, SNB_SBAR45SZ_OFFSET, &bar_sz);
+		dev_dbg(ndev_dev(ndev), "SBAR45SZ %#x\n", bar_sz);
+	} else {
+		pci_read_config_byte(pdev, SNB_PBAR4SZ_OFFSET, &bar_sz);
+		dev_dbg(ndev_dev(ndev), "PBAR4SZ %#x\n", bar_sz);
+		if (b2b_bar == 4) {
+			if (ndev->b2b_off)
+				bar_sz -= 1;
+			else
+				bar_sz = 0;
+		}
+		pci_write_config_byte(pdev, SNB_SBAR4SZ_OFFSET, bar_sz);
+		pci_read_config_byte(pdev, SNB_SBAR4SZ_OFFSET, &bar_sz);
+		dev_dbg(ndev_dev(ndev), "SBAR4SZ %#x\n", bar_sz);
+
+		pci_read_config_byte(pdev, SNB_PBAR5SZ_OFFSET, &bar_sz);
+		dev_dbg(ndev_dev(ndev), "PBAR5SZ %#x\n", bar_sz);
+		if (b2b_bar == 5) {
+			if (ndev->b2b_off)
+				bar_sz -= 1;
+			else
+				bar_sz = 0;
+		}
+		pci_write_config_byte(pdev, SNB_SBAR5SZ_OFFSET, bar_sz);
+		pci_read_config_byte(pdev, SNB_SBAR5SZ_OFFSET, &bar_sz);
+		dev_dbg(ndev_dev(ndev), "SBAR5SZ %#x\n", bar_sz);
+	}
 
-	ret = simple_read_from_buffer(ubuf, count, offp, buf, offset);
-	kfree(buf);
-	return ret;
-}
+	/* SBAR01 hit by first part of the b2b bar */
+	if (b2b_bar == 0)
+		bar_addr = addr->bar0_addr;
+	else if (b2b_bar == 2)
+		bar_addr = addr->bar2_addr64;
+	else if (b2b_bar == 4 && !ndev->bar4_split)
+		bar_addr = addr->bar4_addr64;
+	else if (b2b_bar == 4)
+		bar_addr = addr->bar4_addr32;
+	else if (b2b_bar == 5)
+		bar_addr = addr->bar5_addr32;
+	else
+		return -EIO;
 
-static const struct file_operations ntb_debugfs_info = {
-	.owner = THIS_MODULE,
-	.open = simple_open,
-	.read = ntb_debugfs_read,
-};
+	dev_dbg(ndev_dev(ndev), "SBAR01 %#018llx\n", bar_addr);
+	iowrite64(bar_addr, mmio + SNB_SBAR0BASE_OFFSET);
 
-static void ntb_setup_debugfs(struct ntb_device *ndev)
-{
-	if (!debugfs_initialized())
-		return;
+	/* Other SBAR are normally hit by the PBAR xlat, except for b2b bar.
+	 * The b2b bar is either disabled above, or configured half-size, and
+	 * it starts at the PBAR xlat + offset.
+	 */
 
-	if (!debugfs_dir)
-		debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
+	bar_addr = addr->bar2_addr64 + (b2b_bar == 2 ? ndev->b2b_off : 0);
+	iowrite64(bar_addr, mmio + SNB_SBAR23BASE_OFFSET);
+	bar_addr = ioread64(mmio + SNB_SBAR23BASE_OFFSET);
+	dev_dbg(ndev_dev(ndev), "SBAR23 %#018llx\n", bar_addr);
+
+	if (!ndev->bar4_split) {
+		bar_addr = addr->bar4_addr64 +
+			(b2b_bar == 4 ? ndev->b2b_off : 0);
+		iowrite64(bar_addr, mmio + SNB_SBAR45BASE_OFFSET);
+		bar_addr = ioread64(mmio + SNB_SBAR45BASE_OFFSET);
+		dev_dbg(ndev_dev(ndev), "SBAR45 %#018llx\n", bar_addr);
+	} else {
+		bar_addr = addr->bar4_addr32 +
+			(b2b_bar == 4 ? ndev->b2b_off : 0);
+		iowrite32(bar_addr, mmio + SNB_SBAR4BASE_OFFSET);
+		bar_addr = ioread32(mmio + SNB_SBAR4BASE_OFFSET);
+		dev_dbg(ndev_dev(ndev), "SBAR4 %#010llx\n", bar_addr);
+
+		bar_addr = addr->bar5_addr32 +
+			(b2b_bar == 5 ? ndev->b2b_off : 0);
+		iowrite32(bar_addr, mmio + SNB_SBAR5BASE_OFFSET);
+		bar_addr = ioread32(mmio + SNB_SBAR5BASE_OFFSET);
+		dev_dbg(ndev_dev(ndev), "SBAR5 %#010llx\n", bar_addr);
+	}
 
-	ndev->debugfs_dir = debugfs_create_dir(pci_name(ndev->pdev),
-					       debugfs_dir);
-	if (ndev->debugfs_dir)
-		ndev->debugfs_info = debugfs_create_file("info", S_IRUSR,
-							 ndev->debugfs_dir,
-							 ndev,
-							 &ntb_debugfs_info);
-}
+	/* setup incoming bar limits == base addrs (zero length windows) */
 
-static void ntb_free_debugfs(struct ntb_device *ndev)
-{
-	debugfs_remove_recursive(ndev->debugfs_dir);
+	bar_addr = addr->bar2_addr64 + (b2b_bar == 2 ? ndev->b2b_off : 0);
+	iowrite64(bar_addr, mmio + SNB_SBAR23LMT_OFFSET);
+	bar_addr = ioread64(mmio + SNB_SBAR23LMT_OFFSET);
+	dev_dbg(ndev_dev(ndev), "SBAR23LMT %#018llx\n", bar_addr);
 
-	if (debugfs_dir && simple_empty(debugfs_dir)) {
-		debugfs_remove_recursive(debugfs_dir);
-		debugfs_dir = NULL;
+	if (!ndev->bar4_split) {
+		bar_addr = addr->bar4_addr64 +
+			(b2b_bar == 4 ? ndev->b2b_off : 0);
+		iowrite64(bar_addr, mmio + SNB_SBAR45LMT_OFFSET);
+		bar_addr = ioread64(mmio + SNB_SBAR45LMT_OFFSET);
+		dev_dbg(ndev_dev(ndev), "SBAR45LMT %#018llx\n", bar_addr);
+	} else {
+		bar_addr = addr->bar4_addr32 +
+			(b2b_bar == 4 ? ndev->b2b_off : 0);
+		iowrite32(bar_addr, mmio + SNB_SBAR4LMT_OFFSET);
+		bar_addr = ioread32(mmio + SNB_SBAR4LMT_OFFSET);
+		dev_dbg(ndev_dev(ndev), "SBAR4LMT %#010llx\n", bar_addr);
+
+		bar_addr = addr->bar5_addr32 +
+			(b2b_bar == 5 ? ndev->b2b_off : 0);
+		iowrite32(bar_addr, mmio + SNB_SBAR5LMT_OFFSET);
+		bar_addr = ioread32(mmio + SNB_SBAR5LMT_OFFSET);
+		dev_dbg(ndev_dev(ndev), "SBAR5LMT %#05llx\n", bar_addr);
 	}
-}
 
-static void ntb_hw_link_up(struct ntb_device *ndev)
-{
-	if (ndev->conn_type == NTB_CONN_TRANSPARENT)
-		ntb_link_event(ndev, NTB_LINK_UP);
-	else {
-		u32 ntb_cntl;
+	/* zero incoming translation addrs */
+	iowrite64(0, mmio + SNB_SBAR23XLAT_OFFSET);
 
-		/* Let's bring the NTB link up */
-		ntb_cntl = readl(ndev->reg_ofs.lnk_cntl);
-		ntb_cntl &= ~(NTB_CNTL_LINK_DISABLE | NTB_CNTL_CFG_LOCK);
-		ntb_cntl |= NTB_CNTL_P2S_BAR23_SNOOP | NTB_CNTL_S2P_BAR23_SNOOP;
-		ntb_cntl |= NTB_CNTL_P2S_BAR4_SNOOP | NTB_CNTL_S2P_BAR4_SNOOP;
-		if (ndev->split_bar)
-			ntb_cntl |= NTB_CNTL_P2S_BAR5_SNOOP |
-				    NTB_CNTL_S2P_BAR5_SNOOP;
+	if (!ndev->bar4_split) {
+		iowrite64(0, mmio + SNB_SBAR45XLAT_OFFSET);
+	} else {
+		iowrite32(0, mmio + SNB_SBAR4XLAT_OFFSET);
+		iowrite32(0, mmio + SNB_SBAR5XLAT_OFFSET);
+	}
 
-		writel(ntb_cntl, ndev->reg_ofs.lnk_cntl);
+	/* zero outgoing translation limits (whole bar size windows) */
+	iowrite64(0, mmio + SNB_PBAR23LMT_OFFSET);
+	if (!ndev->bar4_split) {
+		iowrite64(0, mmio + SNB_PBAR45LMT_OFFSET);
+	} else {
+		iowrite32(0, mmio + SNB_PBAR4LMT_OFFSET);
+		iowrite32(0, mmio + SNB_PBAR5LMT_OFFSET);
 	}
-}
 
-static void ntb_hw_link_down(struct ntb_device *ndev)
-{
-	u32 ntb_cntl;
+	/* set outgoing translation offsets */
+	bar_addr = peer_addr->bar2_addr64;
+	iowrite64(bar_addr, mmio + SNB_PBAR23XLAT_OFFSET);
+	bar_addr = ioread64(mmio + SNB_PBAR23XLAT_OFFSET);
+	dev_dbg(ndev_dev(ndev), "PBAR23XLAT %#018llx\n", bar_addr);
+
+	if (!ndev->bar4_split) {
+		bar_addr = peer_addr->bar4_addr64;
+		iowrite64(bar_addr, mmio + SNB_PBAR45XLAT_OFFSET);
+		bar_addr = ioread64(mmio + SNB_PBAR45XLAT_OFFSET);
+		dev_dbg(ndev_dev(ndev), "PBAR45XLAT %#018llx\n", bar_addr);
+	} else {
+		bar_addr = peer_addr->bar4_addr32;
+		iowrite32(bar_addr, mmio + SNB_PBAR4XLAT_OFFSET);
+		bar_addr = ioread32(mmio + SNB_PBAR4XLAT_OFFSET);
+		dev_dbg(ndev_dev(ndev), "PBAR4XLAT %#010llx\n", bar_addr);
+
+		bar_addr = peer_addr->bar5_addr32;
+		iowrite32(bar_addr, mmio + SNB_PBAR5XLAT_OFFSET);
+		bar_addr = ioread32(mmio + SNB_PBAR5XLAT_OFFSET);
+		dev_dbg(ndev_dev(ndev), "PBAR5XLAT %#010llx\n", bar_addr);
+	}
 
-	if (ndev->conn_type == NTB_CONN_TRANSPARENT) {
-		ntb_link_event(ndev, NTB_LINK_DOWN);
-		return;
+	/* set the translation offset for b2b registers */
+	if (b2b_bar == 0)
+		bar_addr = peer_addr->bar0_addr;
+	else if (b2b_bar == 2)
+		bar_addr = peer_addr->bar2_addr64;
+	else if (b2b_bar == 4 && !ndev->bar4_split)
+		bar_addr = peer_addr->bar4_addr64;
+	else if (b2b_bar == 4)
+		bar_addr = peer_addr->bar4_addr32;
+	else if (b2b_bar == 5)
+		bar_addr = peer_addr->bar5_addr32;
+	else
+		return -EIO;
+
+	/* B2B_XLAT_OFFSET is 64bit, but can only take 32bit writes */
+	dev_dbg(ndev_dev(ndev), "B2BXLAT %#018llx\n", bar_addr);
+	iowrite32(bar_addr, mmio + SNB_B2B_XLAT_OFFSETL);
+	iowrite32(bar_addr >> 32, mmio + SNB_B2B_XLAT_OFFSETU);
+
+	if (b2b_bar) {
+		/* map peer ntb mmio config space registers */
+		ndev->peer_mmio = pci_iomap(pdev, b2b_bar,
+					    SNB_B2B_MIN_SIZE);
+		if (!ndev->peer_mmio)
+			return -EIO;
 	}
 
-	/* Bring NTB link down */
-	ntb_cntl = readl(ndev->reg_ofs.lnk_cntl);
-	ntb_cntl &= ~(NTB_CNTL_P2S_BAR23_SNOOP | NTB_CNTL_S2P_BAR23_SNOOP);
-	ntb_cntl &= ~(NTB_CNTL_P2S_BAR4_SNOOP | NTB_CNTL_S2P_BAR4_SNOOP);
-	if (ndev->split_bar)
-		ntb_cntl &= ~(NTB_CNTL_P2S_BAR5_SNOOP |
-			      NTB_CNTL_S2P_BAR5_SNOOP);
-	ntb_cntl |= NTB_CNTL_LINK_DISABLE | NTB_CNTL_CFG_LOCK;
-	writel(ntb_cntl, ndev->reg_ofs.lnk_cntl);
+	return 0;
 }
 
-static void ntb_max_mw_detect(struct ntb_device *ndev)
+static int snb_init_ntb(struct intel_ntb_dev *ndev)
 {
-	if (ndev->split_bar)
-		ndev->limits.max_mw = HSX_SPLITBAR_MAX_MW;
+	int rc;
+
+	if (ndev->bar4_split)
+		ndev->mw_count = HSX_SPLIT_BAR_MW_COUNT;
 	else
-		ndev->limits.max_mw = SNB_MAX_MW;
-}
+		ndev->mw_count = SNB_MW_COUNT;
 
-static int ntb_xeon_detect(struct ntb_device *ndev)
-{
-	int rc, bars_mask;
-	u32 bars;
-	u8 ppd;
+	ndev->spad_count = SNB_SPAD_COUNT;
+	ndev->db_count = SNB_DB_COUNT;
+	ndev->db_link_mask = SNB_DB_LINK_BIT;
 
-	ndev->hw_type = SNB_HW;
+	switch (ndev->ntb.topo) {
+	case NTB_TOPO_PRI:
+		if (ndev->hwerr_flags & NTB_HWERR_SDOORBELL_LOCKUP) {
+			dev_err(ndev_dev(ndev), "NTB Primary config disabled\n");
+			return -EINVAL;
+		}
+		/* use half the spads for the peer */
+		ndev->spad_count >>= 1;
+		ndev->self_reg = &snb_pri_reg;
+		ndev->peer_reg = &snb_sec_reg;
+		ndev->xlat_reg = &snb_sec_xlat;
+		break;
 
-	rc = pci_read_config_byte(ndev->pdev, NTB_PPD_OFFSET, &ppd);
-	if (rc)
-		return -EIO;
+	case NTB_TOPO_SEC:
+		if (ndev->hwerr_flags & NTB_HWERR_SDOORBELL_LOCKUP) {
+			dev_err(ndev_dev(ndev), "NTB Secondary config disabled\n");
+			return -EINVAL;
+		}
+		/* use half the spads for the peer */
+		ndev->spad_count >>= 1;
+		ndev->self_reg = &snb_sec_reg;
+		ndev->peer_reg = &snb_pri_reg;
+		ndev->xlat_reg = &snb_pri_xlat;
+		break;
 
-	if (ppd & SNB_PPD_DEV_TYPE)
-		ndev->dev_type = NTB_DEV_USD;
-	else
-		ndev->dev_type = NTB_DEV_DSD;
+	case NTB_TOPO_B2B_USD:
+	case NTB_TOPO_B2B_DSD:
+		ndev->self_reg = &snb_pri_reg;
+		ndev->peer_reg = &snb_b2b_reg;
+		ndev->xlat_reg = &snb_sec_xlat;
 
-	ndev->split_bar = (ppd & SNB_PPD_SPLIT_BAR) ? 1 : 0;
+		if (ndev->hwerr_flags & NTB_HWERR_SDOORBELL_LOCKUP) {
+			ndev->peer_reg = &snb_pri_reg;
 
-	switch (ppd & SNB_PPD_CONN_TYPE) {
-	case NTB_CONN_B2B:
-		dev_info(&ndev->pdev->dev, "Conn Type = B2B\n");
-		ndev->conn_type = NTB_CONN_B2B;
-		break;
-	case NTB_CONN_RP:
-		dev_info(&ndev->pdev->dev, "Conn Type = RP\n");
-		ndev->conn_type = NTB_CONN_RP;
-		break;
-	case NTB_CONN_TRANSPARENT:
-		dev_info(&ndev->pdev->dev, "Conn Type = TRANSPARENT\n");
-		ndev->conn_type = NTB_CONN_TRANSPARENT;
-		/*
-		 * This mode is default to USD/DSP. HW does not report
-		 * properly in transparent mode as it has no knowledge of
-		 * NTB. We will just force correct here.
-		 */
-		ndev->dev_type = NTB_DEV_USD;
+			if (b2b_mw_idx < 0)
+				ndev->b2b_idx = b2b_mw_idx + ndev->mw_count;
+			else
+				ndev->b2b_idx = b2b_mw_idx;
 
-		/*
-		 * This is a way for transparent BAR to figure out if we
-		 * are doing split BAR or not. There is no way for the hw
-		 * on the transparent side to know and set the PPD.
-		 */
-		bars_mask = pci_select_bars(ndev->pdev, IORESOURCE_MEM);
-		bars = hweight32(bars_mask);
-		if (bars == (HSX_SPLITBAR_MAX_MW + 1))
-			ndev->split_bar = 1;
+			dev_dbg(ndev_dev(ndev),
+				"setting up b2b mw idx %d means %d\n",
+				b2b_mw_idx, ndev->b2b_idx);
+
+		} else if (ndev->hwerr_flags & NTB_HWERR_B2BDOORBELL_BIT14) {
+			dev_warn(ndev_dev(ndev), "Reduce doorbell count by 1\n");
+			ndev->db_count -= 1;
+		}
+
+		if (ndev->ntb.topo == NTB_TOPO_B2B_USD) {
+			rc = snb_setup_b2b_mw(ndev,
+					      &snb_b2b_dsd_addr,
+					      &snb_b2b_usd_addr);
+		} else {
+			rc = snb_setup_b2b_mw(ndev,
+					      &snb_b2b_usd_addr,
+					      &snb_b2b_dsd_addr);
+		}
+		if (rc)
+			return rc;
+
+		/* Enable Bus Master and Memory Space on the secondary side */
+		iowrite16(PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER,
+			  ndev->self_mmio + SNB_SPCICMD_OFFSET);
 
 		break;
+
 	default:
-		dev_err(&ndev->pdev->dev, "Unknown PPD %x\n", ppd);
-		return -ENODEV;
+		return -EINVAL;
 	}
 
-	ntb_max_mw_detect(ndev);
+	ndev->db_valid_mask = BIT_ULL(ndev->db_count) - 1;
+
+	ndev->reg->db_iowrite(ndev->db_valid_mask,
+			      ndev->self_mmio +
+			      ndev->self_reg->db_mask);
 
 	return 0;
 }
 
-static int ntb_atom_detect(struct ntb_device *ndev)
+static int snb_init_dev(struct intel_ntb_dev *ndev)
 {
-	int rc;
-	u32 ppd;
+	struct pci_dev *pdev;
+	u8 ppd;
+	int rc, mem;
+
+	/* There is a Xeon hardware errata related to writes to SDOORBELL or
+	 * B2BDOORBELL in conjunction with inbound access to NTB MMIO Space,
+	 * which may hang the system.  To workaround this use the second memory
+	 * window to access the interrupt and scratch pad registers on the
+	 * remote system.
+	 */
+	ndev->hwerr_flags |= NTB_HWERR_SDOORBELL_LOCKUP;
 
-	ndev->hw_type = BWD_HW;
-	ndev->limits.max_mw = BWD_MAX_MW;
+	/* There is a hardware errata related to accessing any register in
+	 * SB01BASE in the presence of bidirectional traffic crossing the NTB.
+	 */
+	ndev->hwerr_flags |= NTB_HWERR_SB01BASE_LOCKUP;
+
+	/* HW Errata on bit 14 of b2bdoorbell register.  Writes will not be
+	 * mirrored to the remote system.  Shrink the number of bits by one,
+	 * since bit 14 is the last bit.
+	 */
+	ndev->hwerr_flags |= NTB_HWERR_B2BDOORBELL_BIT14;
 
-	rc = pci_read_config_dword(ndev->pdev, NTB_PPD_OFFSET, &ppd);
+	ndev->reg = &snb_reg;
+
+	pdev = ndev_pdev(ndev);
+
+	rc = pci_read_config_byte(pdev, SNB_PPD_OFFSET, &ppd);
 	if (rc)
-		return rc;
+		return -EIO;
 
-	switch ((ppd & BWD_PPD_CONN_TYPE) >> 8) {
-	case NTB_CONN_B2B:
-		dev_info(&ndev->pdev->dev, "Conn Type = B2B\n");
-		ndev->conn_type = NTB_CONN_B2B;
-		break;
-	case NTB_CONN_RP:
-	default:
-		dev_err(&ndev->pdev->dev, "Unsupported NTB configuration\n");
+	ndev->ntb.topo = snb_ppd_topo(ndev, ppd);
+	dev_dbg(ndev_dev(ndev), "ppd %#x topo %s\n", ppd,
+		ntb_topo_string(ndev->ntb.topo));
+	if (ndev->ntb.topo == NTB_TOPO_NONE)
 		return -EINVAL;
+
+	if (ndev->ntb.topo != NTB_TOPO_SEC) {
+		ndev->bar4_split = snb_ppd_bar4_split(ndev, ppd);
+		dev_dbg(ndev_dev(ndev), "ppd %#x bar4_split %d\n",
+			ppd, ndev->bar4_split);
+	} else {
+		/* This is a way for transparent BAR to figure out if we are
+		 * doing split BAR or not. There is no way for the hw on the
+		 * transparent side to know and set the PPD.
+		 */
+		mem = pci_select_bars(pdev, IORESOURCE_MEM);
+		ndev->bar4_split = hweight32(mem) ==
+			HSX_SPLIT_BAR_MW_COUNT + 1;
+		dev_dbg(ndev_dev(ndev), "mem %#x bar4_split %d\n",
+			mem, ndev->bar4_split);
 	}
 
-	if (ppd & BWD_PPD_DEV_TYPE)
-		ndev->dev_type = NTB_DEV_DSD;
-	else
-		ndev->dev_type = NTB_DEV_USD;
+	rc = snb_init_ntb(ndev);
+	if (rc)
+		return rc;
 
-	return 0;
+	return snb_init_isr(ndev);
+}
+
+static void snb_deinit_dev(struct intel_ntb_dev *ndev)
+{
+	snb_deinit_isr(ndev);
 }
 
-static int ntb_device_detect(struct ntb_device *ndev)
+static int intel_ntb_init_pci(struct intel_ntb_dev *ndev, struct pci_dev *pdev)
 {
 	int rc;
 
-	if (is_ntb_xeon(ndev))
-		rc = ntb_xeon_detect(ndev);
-	else if (is_ntb_atom(ndev))
-		rc = ntb_atom_detect(ndev);
-	else
-		rc = -ENODEV;
+	pci_set_drvdata(pdev, ndev);
+
+	rc = pci_enable_device(pdev);
+	if (rc)
+		goto err_pci_enable;
+
+	rc = pci_request_regions(pdev, NTB_NAME);
+	if (rc)
+		goto err_pci_regions;
+
+	pci_set_master(pdev);
+
+	rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (rc) {
+		rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (rc)
+			goto err_dma_mask;
+		dev_warn(ndev_dev(ndev), "Cannot DMA highmem\n");
+	}
 
-	dev_info(&ndev->pdev->dev, "Device Type = %s\n",
-		 ndev->dev_type == NTB_DEV_USD ? "USD/DSP" : "DSD/USP");
+	rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (rc) {
+		rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (rc)
+			goto err_dma_mask;
+		dev_warn(ndev_dev(ndev), "Cannot DMA consistent highmem\n");
+	}
+
+	ndev->self_mmio = pci_iomap(pdev, 0, 0);
+	if (!ndev->self_mmio) {
+		rc = -EIO;
+		goto err_mmio;
+	}
+	ndev->peer_mmio = ndev->self_mmio;
 
 	return 0;
+
+err_mmio:
+err_dma_mask:
+	pci_clear_master(pdev);
+	pci_release_regions(pdev);
+err_pci_regions:
+	pci_disable_device(pdev);
+err_pci_enable:
+	pci_set_drvdata(pdev, NULL);
+	return rc;
 }
 
-static int ntb_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+static void intel_ntb_deinit_pci(struct intel_ntb_dev *ndev)
 {
-	struct ntb_device *ndev;
-	int rc, i;
+	struct pci_dev *pdev = ndev_pdev(ndev);
 
-	ndev = kzalloc(sizeof(struct ntb_device), GFP_KERNEL);
-	if (!ndev)
-		return -ENOMEM;
+	if (ndev->peer_mmio && ndev->peer_mmio != ndev->self_mmio)
+		pci_iounmap(pdev, ndev->peer_mmio);
+	pci_iounmap(pdev, ndev->self_mmio);
 
-	ndev->pdev = pdev;
+	pci_clear_master(pdev);
+	pci_release_regions(pdev);
+	pci_disable_device(pdev);
+	pci_set_drvdata(pdev, NULL);
+}
 
-	ntb_set_errata_flags(ndev);
+static inline void ndev_init_struct(struct intel_ntb_dev *ndev,
+				    struct pci_dev *pdev)
+{
+	ndev->ntb.pdev = pdev;
+	ndev->ntb.topo = NTB_TOPO_NONE;
+	ndev->ntb.ops = &intel_ntb_ops;
 
-	ndev->link_status = NTB_LINK_DOWN;
-	pci_set_drvdata(pdev, ndev);
-	ntb_setup_debugfs(ndev);
+	ndev->b2b_off = 0;
+	ndev->b2b_idx = INT_MAX;
 
-	rc = pci_enable_device(pdev);
-	if (rc)
-		goto err;
+	ndev->bar4_split = 0;
 
-	pci_set_master(ndev->pdev);
+	ndev->mw_count = 0;
+	ndev->spad_count = 0;
+	ndev->db_count = 0;
+	ndev->db_vec_count = 0;
+	ndev->db_vec_shift = 0;
 
-	rc = ntb_device_detect(ndev);
-	if (rc)
-		goto err;
+	ndev->ntb_ctl = 0;
+	ndev->lnk_sta = 0;
 
-	ndev->mw = kcalloc(ndev->limits.max_mw, sizeof(struct ntb_mw),
-			   GFP_KERNEL);
-	if (!ndev->mw) {
-		rc = -ENOMEM;
-		goto err1;
-	}
+	ndev->db_valid_mask = 0;
+	ndev->db_link_mask = 0;
+	ndev->db_mask = 0;
 
-	if (ndev->split_bar)
-		rc = pci_request_selected_regions(pdev, NTB_SPLITBAR_MASK,
-						  KBUILD_MODNAME);
-	else
-		rc = pci_request_selected_regions(pdev, NTB_BAR_MASK,
-						  KBUILD_MODNAME);
+	spin_lock_init(&ndev->db_mask_lock);
+}
 
-	if (rc)
-		goto err2;
+static int intel_ntb_pci_probe(struct pci_dev *pdev,
+			       const struct pci_device_id *id)
+{
+	struct intel_ntb_dev *ndev;
+	int rc;
 
-	ndev->reg_base = pci_ioremap_bar(pdev, NTB_BAR_MMIO);
-	if (!ndev->reg_base) {
-		dev_warn(&pdev->dev, "Cannot remap BAR 0\n");
-		rc = -EIO;
-		goto err3;
-	}
+	if (pdev_is_bwd(pdev)) {
+		ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
+		if (!ndev) {
+			rc = -ENOMEM;
+			goto err_ndev;
+		}
 
-	for (i = 0; i < ndev->limits.max_mw; i++) {
-		ndev->mw[i].bar_sz = pci_resource_len(pdev, MW_TO_BAR(i));
+		ndev_init_struct(ndev, pdev);
 
-		/*
-		 * with the errata we need to steal last of the memory
-		 * windows for workarounds and they point to MMIO registers.
-		 */
-		if ((ndev->wa_flags & WA_SNB_ERR) &&
-		    (i == (ndev->limits.max_mw - 1))) {
-			ndev->mw[i].vbase =
-				ioremap_nocache(pci_resource_start(pdev,
-							MW_TO_BAR(i)),
-						ndev->mw[i].bar_sz);
-		} else {
-			ndev->mw[i].vbase =
-				ioremap_wc(pci_resource_start(pdev,
-							MW_TO_BAR(i)),
-					   ndev->mw[i].bar_sz);
-		}
+		rc = intel_ntb_init_pci(ndev, pdev);
+		if (rc)
+			goto err_init_pci;
+
+		rc = bwd_init_dev(ndev);
+		if (rc)
+			goto err_init_dev;
 
-		dev_info(&pdev->dev, "MW %d size %llu\n", i,
-			 (unsigned long long) ndev->mw[i].bar_sz);
-		if (!ndev->mw[i].vbase) {
-			dev_warn(&pdev->dev, "Cannot remap BAR %d\n",
-				 MW_TO_BAR(i));
-			rc = -EIO;
-			goto err4;
+	} else if (pdev_is_snb(pdev)) {
+		ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
+		if (!ndev) {
+			rc = -ENOMEM;
+			goto err_ndev;
 		}
-	}
 
-	rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
-	if (rc) {
-		rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
-		if (rc)
-			goto err4;
+		ndev_init_struct(ndev, pdev);
 
-		dev_warn(&pdev->dev, "Cannot DMA highmem\n");
-	}
+		rc = intel_ntb_init_pci(ndev, pdev);
+		if (rc)
+			goto err_init_pci;
 
-	rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
-	if (rc) {
-		rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+		rc = snb_init_dev(ndev);
 		if (rc)
-			goto err4;
+			goto err_init_dev;
 
-		dev_warn(&pdev->dev, "Cannot DMA consistent highmem\n");
+	} else {
+		rc = -EINVAL;
+		goto err_ndev;
 	}
 
-	rc = ntb_device_setup(ndev);
-	if (rc)
-		goto err4;
-
-	rc = ntb_create_callbacks(ndev);
-	if (rc)
-		goto err5;
+	ndev_reset_unsafe_flags(ndev);
 
-	rc = ntb_setup_interrupts(ndev);
-	if (rc)
-		goto err6;
+	ndev->reg->poll_link(ndev);
 
-	/* The scratchpad registers keep the values between rmmod/insmod,
-	 * blast them now
-	 */
-	for (i = 0; i < ndev->limits.max_spads; i++) {
-		ntb_write_local_spad(ndev, i, 0);
-		ntb_write_remote_spad(ndev, i, 0);
-	}
+	ndev_init_debugfs(ndev);
 
-	rc = ntb_transport_init(pdev);
+	rc = ntb_register_device(&ndev->ntb);
 	if (rc)
-		goto err7;
-
-	ntb_hw_link_up(ndev);
+		goto err_register;
 
 	return 0;
 
-err7:
-	ntb_free_interrupts(ndev);
-err6:
-	ntb_free_callbacks(ndev);
-err5:
-	ntb_device_free(ndev);
-err4:
-	for (i--; i >= 0; i--)
-		iounmap(ndev->mw[i].vbase);
-	iounmap(ndev->reg_base);
-err3:
-	if (ndev->split_bar)
-		pci_release_selected_regions(pdev, NTB_SPLITBAR_MASK);
-	else
-		pci_release_selected_regions(pdev, NTB_BAR_MASK);
-err2:
-	kfree(ndev->mw);
-err1:
-	pci_disable_device(pdev);
-err:
-	ntb_free_debugfs(ndev);
+err_register:
+	ndev_deinit_debugfs(ndev);
+	if (pdev_is_bwd(pdev))
+		bwd_deinit_dev(ndev);
+	else if (pdev_is_snb(pdev))
+		snb_deinit_dev(ndev);
+err_init_dev:
+	intel_ntb_deinit_pci(ndev);
+err_init_pci:
 	kfree(ndev);
-
-	dev_err(&pdev->dev, "Error loading %s module\n", KBUILD_MODNAME);
+err_ndev:
 	return rc;
 }
 
-static void ntb_pci_remove(struct pci_dev *pdev)
+static void intel_ntb_pci_remove(struct pci_dev *pdev)
 {
-	struct ntb_device *ndev = pci_get_drvdata(pdev);
-	int i;
+	struct intel_ntb_dev *ndev = pci_get_drvdata(pdev);
+
+	ntb_unregister_device(&ndev->ntb);
+	ndev_deinit_debugfs(ndev);
+	if (pdev_is_bwd(pdev))
+		bwd_deinit_dev(ndev);
+	else if (pdev_is_snb(pdev))
+		snb_deinit_dev(ndev);
+	intel_ntb_deinit_pci(ndev);
+	kfree(ndev);
+}
 
-	ntb_hw_link_down(ndev);
+static const struct intel_ntb_reg bwd_reg = {
+	.poll_link		= bwd_poll_link,
+	.link_is_up		= bwd_link_is_up,
+	.db_ioread		= bwd_db_ioread,
+	.db_iowrite		= bwd_db_iowrite,
+	.db_size		= sizeof(u64),
+	.ntb_ctl		= BWD_NTBCNTL_OFFSET,
+	.mw_bar			= {2, 4},
+};
 
-	ntb_transport_free(ndev->ntb_transport);
+static const struct intel_ntb_alt_reg bwd_pri_reg = {
+	.db_bell		= BWD_PDOORBELL_OFFSET,
+	.db_mask		= BWD_PDBMSK_OFFSET,
+	.spad			= BWD_SPAD_OFFSET,
+};
 
-	ntb_free_interrupts(ndev);
-	ntb_free_callbacks(ndev);
-	ntb_device_free(ndev);
+static const struct intel_ntb_alt_reg bwd_b2b_reg = {
+	.db_bell		= BWD_B2B_DOORBELL_OFFSET,
+	.spad			= BWD_B2B_SPAD_OFFSET,
+};
 
-	/* need to reset max_mw limits so we can unmap properly */
-	if (ndev->hw_type == SNB_HW)
-		ntb_max_mw_detect(ndev);
+static const struct intel_ntb_xlat_reg bwd_sec_xlat = {
+	/* FIXME : .bar0_base	= BWD_SBAR0BASE_OFFSET, */
+	/* FIXME : .bar2_limit	= BWD_SBAR2LMT_OFFSET, */
+	.bar2_xlat		= BWD_SBAR2XLAT_OFFSET,
+};
 
-	for (i = 0; i < ndev->limits.max_mw; i++)
-		iounmap(ndev->mw[i].vbase);
+static const struct intel_ntb_reg snb_reg = {
+	.poll_link		= snb_poll_link,
+	.link_is_up		= snb_link_is_up,
+	.db_ioread		= snb_db_ioread,
+	.db_iowrite		= snb_db_iowrite,
+	.db_size		= sizeof(u32),
+	.ntb_ctl		= SNB_NTBCNTL_OFFSET,
+	.mw_bar			= {2, 4, 5},
+};
 
-	kfree(ndev->mw);
-	iounmap(ndev->reg_base);
-	if (ndev->split_bar)
-		pci_release_selected_regions(pdev, NTB_SPLITBAR_MASK);
-	else
-		pci_release_selected_regions(pdev, NTB_BAR_MASK);
-	pci_disable_device(pdev);
-	ntb_free_debugfs(ndev);
-	kfree(ndev);
-}
+static const struct intel_ntb_alt_reg snb_pri_reg = {
+	.db_bell		= SNB_PDOORBELL_OFFSET,
+	.db_mask		= SNB_PDBMSK_OFFSET,
+	.spad			= SNB_SPAD_OFFSET,
+};
+
+static const struct intel_ntb_alt_reg snb_sec_reg = {
+	.db_bell		= SNB_SDOORBELL_OFFSET,
+	.db_mask		= SNB_SDBMSK_OFFSET,
+	/* second half of the scratchpads */
+	.spad			= SNB_SPAD_OFFSET + (SNB_SPAD_COUNT << 1),
+};
 
-static struct pci_driver ntb_pci_driver = {
+static const struct intel_ntb_alt_reg snb_b2b_reg = {
+	.db_bell		= SNB_B2B_DOORBELL_OFFSET,
+	.spad			= SNB_B2B_SPAD_OFFSET,
+};
+
+static const struct intel_ntb_xlat_reg snb_pri_xlat = {
+	/* Note: no primary .bar0_base visible to the secondary side.
+	 *
+	 * The secondary side cannot get the base address stored in primary
+	 * bars.  The base address is necessary to set the limit register to
+	 * any value other than zero, or unlimited.
+	 *
+	 * WITHOUT THE BASE ADDRESS, THE SECONDARY SIDE CANNOT DISABLE the
+	 * window by setting the limit equal to base, nor can it limit the size
+	 * of the memory window by setting the limit to base + size.
+	 */
+	.bar2_limit		= SNB_PBAR23LMT_OFFSET,
+	.bar2_xlat		= SNB_PBAR23XLAT_OFFSET,
+};
+
+static const struct intel_ntb_xlat_reg snb_sec_xlat = {
+	.bar0_base		= SNB_SBAR0BASE_OFFSET,
+	.bar2_limit		= SNB_SBAR23LMT_OFFSET,
+	.bar2_xlat		= SNB_SBAR23XLAT_OFFSET,
+};
+
+static const struct intel_b2b_addr snb_b2b_usd_addr = {
+	.bar2_addr64		= SNB_B2B_BAR2_USD_ADDR64,
+	.bar4_addr64		= SNB_B2B_BAR4_USD_ADDR64,
+	.bar4_addr32		= SNB_B2B_BAR4_USD_ADDR32,
+	.bar5_addr32		= SNB_B2B_BAR5_USD_ADDR32,
+};
+
+static const struct intel_b2b_addr snb_b2b_dsd_addr = {
+	.bar2_addr64		= SNB_B2B_BAR2_DSD_ADDR64,
+	.bar4_addr64		= SNB_B2B_BAR4_DSD_ADDR64,
+	.bar4_addr32		= SNB_B2B_BAR4_DSD_ADDR32,
+	.bar5_addr32		= SNB_B2B_BAR5_DSD_ADDR32,
+};
+
+/* operations for primary side of local ntb */
+static const struct ntb_dev_ops intel_ntb_ops = {
+	.mw_count		= intel_ntb_mw_count,
+	.mw_get_range		= intel_ntb_mw_get_range,
+	.mw_set_trans		= intel_ntb_mw_set_trans,
+	.link_is_up		= intel_ntb_link_is_up,
+	.link_enable		= intel_ntb_link_enable,
+	.link_disable		= intel_ntb_link_disable,
+	.db_is_unsafe		= intel_ntb_db_is_unsafe,
+	.db_valid_mask		= intel_ntb_db_valid_mask,
+	.db_vector_count	= intel_ntb_db_vector_count,
+	.db_vector_mask		= intel_ntb_db_vector_mask,
+	.db_read		= intel_ntb_db_read,
+	.db_clear		= intel_ntb_db_clear,
+	.db_set_mask		= intel_ntb_db_set_mask,
+	.db_clear_mask		= intel_ntb_db_clear_mask,
+	.peer_db_addr		= intel_ntb_peer_db_addr,
+	.peer_db_set		= intel_ntb_peer_db_set,
+	.spad_is_unsafe		= intel_ntb_spad_is_unsafe,
+	.spad_count		= intel_ntb_spad_count,
+	.spad_read		= intel_ntb_spad_read,
+	.spad_write		= intel_ntb_spad_write,
+	.peer_spad_addr		= intel_ntb_peer_spad_addr,
+	.peer_spad_read		= intel_ntb_peer_spad_read,
+	.peer_spad_write	= intel_ntb_peer_spad_write,
+};
+
+static const struct file_operations intel_ntb_debugfs_info = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.read = ndev_debugfs_read,
+};
+
+static const struct pci_device_id intel_ntb_pci_tbl[] = {
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_BWD)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_JSF)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_SNB)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_IVT)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_HSX)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_JSF)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_SNB)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_IVT)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_PS_HSX)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_JSF)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_SNB)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_IVT)},
+	{PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_SS_HSX)},
+	{0}
+};
+MODULE_DEVICE_TABLE(pci, intel_ntb_pci_tbl);
+
+static struct pci_driver intel_ntb_pci_driver = {
 	.name = KBUILD_MODNAME,
-	.id_table = ntb_pci_tbl,
-	.probe = ntb_pci_probe,
-	.remove = ntb_pci_remove,
+	.id_table = intel_ntb_pci_tbl,
+	.probe = intel_ntb_pci_probe,
+	.remove = intel_ntb_pci_remove,
 };
 
-module_pci_driver(ntb_pci_driver);
+static int __init intel_ntb_pci_driver_init(void)
+{
+	if (debugfs_initialized())
+		debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
+
+	return pci_register_driver(&intel_ntb_pci_driver);
+}
+module_init(intel_ntb_pci_driver_init);
+
+static void __exit intel_ntb_pci_driver_exit(void)
+{
+	pci_unregister_driver(&intel_ntb_pci_driver);
+
+	debugfs_remove_recursive(debugfs_dir);
+}
+module_exit(intel_ntb_pci_driver_exit);
+
diff --git a/drivers/ntb/hw/intel/ntb_hw_intel.h b/drivers/ntb/hw/intel/ntb_hw_intel.h
index 935610454f70..fec689dc95cf 100644
--- a/drivers/ntb/hw/intel/ntb_hw_intel.h
+++ b/drivers/ntb/hw/intel/ntb_hw_intel.h
@@ -5,6 +5,7 @@
  *   GPL LICENSE SUMMARY
  *
  *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
  *
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of version 2 of the GNU General Public License as
@@ -13,6 +14,7 @@
  *   BSD LICENSE
  *
  *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
  *
  *   Redistribution and use in source and binary forms, with or without
  *   modification, are permitted provided that the following conditions
@@ -45,341 +47,296 @@
  * Contact Information:
  * Jon Mason <jon.mason@intel.com>
  */
-#include <linux/ntb_transport.h>
-
-#define NTB_LINK_STATUS_ACTIVE	0x2000
-#define NTB_LINK_SPEED_MASK	0x000f
-#define NTB_LINK_WIDTH_MASK	0x03f0
-
-#define SNB_MSIX_CNT		4
-#define SNB_MAX_B2B_SPADS	16
-#define SNB_MAX_COMPAT_SPADS	16
-/* Reserve the uppermost bit for link interrupt */
-#define SNB_MAX_DB_BITS		15
-#define SNB_LINK_DB		15
-#define SNB_DB_BITS_PER_VEC	5
-#define HSX_SPLITBAR_MAX_MW	3
-#define SNB_MAX_MW		2
-#define SNB_ERRATA_MAX_MW	1
-
-#define SNB_DB_HW_LINK		0x8000
-
-#define SNB_UNCERRSTS_OFFSET	0x014C
-#define SNB_CORERRSTS_OFFSET	0x0158
-#define SNB_LINK_STATUS_OFFSET	0x01A2
-#define SNB_PCICMD_OFFSET	0x0504
-#define SNB_DEVCTRL_OFFSET	0x0598
-#define SNB_DEVSTS_OFFSET	0x059A
-#define SNB_SLINK_STATUS_OFFSET	0x05A2
-
-#define SNB_PBAR2LMT_OFFSET	0x0000
-#define SNB_PBAR4LMT_OFFSET	0x0008
-#define SNB_PBAR5LMT_OFFSET	0x000C
-#define SNB_PBAR2XLAT_OFFSET	0x0010
-#define SNB_PBAR4XLAT_OFFSET	0x0018
-#define SNB_PBAR5XLAT_OFFSET	0x001C
-#define SNB_SBAR2LMT_OFFSET	0x0020
-#define SNB_SBAR4LMT_OFFSET	0x0028
-#define SNB_SBAR5LMT_OFFSET	0x002C
-#define SNB_SBAR2XLAT_OFFSET	0x0030
-#define SNB_SBAR4XLAT_OFFSET	0x0038
-#define SNB_SBAR5XLAT_OFFSET	0x003C
-#define SNB_SBAR0BASE_OFFSET	0x0040
-#define SNB_SBAR2BASE_OFFSET	0x0048
-#define SNB_SBAR4BASE_OFFSET	0x0050
-#define SNB_SBAR5BASE_OFFSET	0x0054
-#define SNB_NTBCNTL_OFFSET	0x0058
-#define SNB_SBDF_OFFSET		0x005C
-#define SNB_PDOORBELL_OFFSET	0x0060
-#define SNB_PDBMSK_OFFSET	0x0062
-#define SNB_SDOORBELL_OFFSET	0x0064
-#define SNB_SDBMSK_OFFSET	0x0066
-#define SNB_USMEMMISS_OFFSET	0x0070
-#define SNB_SPAD_OFFSET		0x0080
-#define SNB_SPADSEMA4_OFFSET	0x00c0
-#define SNB_WCCNTRL_OFFSET	0x00e0
-#define SNB_B2B_SPAD_OFFSET	0x0100
-#define SNB_B2B_DOORBELL_OFFSET	0x0140
-#define SNB_B2B_XLAT_OFFSETL	0x0144
-#define SNB_B2B_XLAT_OFFSETU	0x0148
 
-/*
- * The addresses are setup so the 32bit BARs can function. Thus
- * the addresses are all in 32bit space
- */
-#define SNB_MBAR01_USD_ADDR	0x000000002100000CULL
-#define SNB_MBAR23_USD_ADDR	0x000000004100000CULL
-#define SNB_MBAR4_USD_ADDR	0x000000008100000CULL
-#define SNB_MBAR5_USD_ADDR	0x00000000A100000CULL
-#define SNB_MBAR01_DSD_ADDR	0x000000002000000CULL
-#define SNB_MBAR23_DSD_ADDR	0x000000004000000CULL
-#define SNB_MBAR4_DSD_ADDR	0x000000008000000CULL
-#define SNB_MBAR5_DSD_ADDR	0x00000000A000000CULL
-
-#define BWD_MSIX_CNT		34
-#define BWD_MAX_SPADS		16
-#define BWD_MAX_DB_BITS		34
-#define BWD_DB_BITS_PER_VEC	1
-#define BWD_MAX_MW		2
-
-#define BWD_PCICMD_OFFSET	0xb004
-#define BWD_MBAR23_OFFSET	0xb018
-#define BWD_MBAR45_OFFSET	0xb020
-#define BWD_DEVCTRL_OFFSET	0xb048
-#define BWD_LINK_STATUS_OFFSET	0xb052
-#define BWD_ERRCORSTS_OFFSET	0xb110
-
-#define BWD_SBAR2XLAT_OFFSET	0x0008
-#define BWD_SBAR4XLAT_OFFSET	0x0010
-#define BWD_PDOORBELL_OFFSET	0x0020
-#define BWD_PDBMSK_OFFSET	0x0028
-#define BWD_NTBCNTL_OFFSET	0x0060
-#define BWD_EBDF_OFFSET		0x0064
-#define BWD_SPAD_OFFSET		0x0080
-#define BWD_SPADSEMA_OFFSET	0x00c0
-#define BWD_STKYSPAD_OFFSET	0x00c4
-#define BWD_PBAR2XLAT_OFFSET	0x8008
-#define BWD_PBAR4XLAT_OFFSET	0x8010
-#define BWD_B2B_DOORBELL_OFFSET	0x8020
-#define BWD_B2B_SPAD_OFFSET	0x8080
-#define BWD_B2B_SPADSEMA_OFFSET	0x80c0
-#define BWD_B2B_STKYSPAD_OFFSET	0x80c4
-
-#define BWD_MODPHY_PCSREG4	0x1c004
-#define BWD_MODPHY_PCSREG6	0x1c006
-
-#define BWD_IP_BASE		0xC000
-#define BWD_DESKEWSTS_OFFSET	(BWD_IP_BASE + 0x3024)
-#define BWD_LTSSMERRSTS0_OFFSET (BWD_IP_BASE + 0x3180)
+#ifndef NTB_HW_INTEL_H
+#define NTB_HW_INTEL_H
+
+#include <linux/ntb.h>
+#include <linux/pci.h>
+
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_JSF	0x3725
+#define PCI_DEVICE_ID_INTEL_NTB_PS_JSF	0x3726
+#define PCI_DEVICE_ID_INTEL_NTB_SS_JSF	0x3727
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_SNB	0x3C0D
+#define PCI_DEVICE_ID_INTEL_NTB_PS_SNB	0x3C0E
+#define PCI_DEVICE_ID_INTEL_NTB_SS_SNB	0x3C0F
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_IVT	0x0E0D
+#define PCI_DEVICE_ID_INTEL_NTB_PS_IVT	0x0E0E
+#define PCI_DEVICE_ID_INTEL_NTB_SS_IVT	0x0E0F
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_HSX	0x2F0D
+#define PCI_DEVICE_ID_INTEL_NTB_PS_HSX	0x2F0E
+#define PCI_DEVICE_ID_INTEL_NTB_SS_HSX	0x2F0F
+#define PCI_DEVICE_ID_INTEL_NTB_B2B_BWD	0x0C4E
+
+/* SNB hardware (and JSF, IVT, HSX) */
+
+#define SNB_PBAR23LMT_OFFSET		0x0000
+#define SNB_PBAR45LMT_OFFSET		0x0008
+#define SNB_PBAR4LMT_OFFSET		0x0008
+#define SNB_PBAR5LMT_OFFSET		0x000c
+#define SNB_PBAR23XLAT_OFFSET		0x0010
+#define SNB_PBAR45XLAT_OFFSET		0x0018
+#define SNB_PBAR4XLAT_OFFSET		0x0018
+#define SNB_PBAR5XLAT_OFFSET		0x001c
+#define SNB_SBAR23LMT_OFFSET		0x0020
+#define SNB_SBAR45LMT_OFFSET		0x0028
+#define SNB_SBAR4LMT_OFFSET		0x0028
+#define SNB_SBAR5LMT_OFFSET		0x002c
+#define SNB_SBAR23XLAT_OFFSET		0x0030
+#define SNB_SBAR45XLAT_OFFSET		0x0038
+#define SNB_SBAR4XLAT_OFFSET		0x0038
+#define SNB_SBAR5XLAT_OFFSET		0x003c
+#define SNB_SBAR0BASE_OFFSET		0x0040
+#define SNB_SBAR23BASE_OFFSET		0x0048
+#define SNB_SBAR45BASE_OFFSET		0x0050
+#define SNB_SBAR4BASE_OFFSET		0x0050
+#define SNB_SBAR5BASE_OFFSET		0x0054
+#define SNB_SBDF_OFFSET			0x005c
+#define SNB_NTBCNTL_OFFSET		0x0058
+#define SNB_PDOORBELL_OFFSET		0x0060
+#define SNB_PDBMSK_OFFSET		0x0062
+#define SNB_SDOORBELL_OFFSET		0x0064
+#define SNB_SDBMSK_OFFSET		0x0066
+#define SNB_USMEMMISS_OFFSET		0x0070
+#define SNB_SPAD_OFFSET			0x0080
+#define SNB_PBAR23SZ_OFFSET		0x00d0
+#define SNB_PBAR45SZ_OFFSET		0x00d1
+#define SNB_PBAR4SZ_OFFSET		0x00d1
+#define SNB_SBAR23SZ_OFFSET		0x00d2
+#define SNB_SBAR45SZ_OFFSET		0x00d3
+#define SNB_SBAR4SZ_OFFSET		0x00d3
+#define SNB_PPD_OFFSET			0x00d4
+#define SNB_PBAR5SZ_OFFSET		0x00d5
+#define SNB_SBAR5SZ_OFFSET		0x00d6
+#define SNB_WCCNTRL_OFFSET		0x00e0
+#define SNB_UNCERRSTS_OFFSET		0x014c
+#define SNB_CORERRSTS_OFFSET		0x0158
+#define SNB_LINK_STATUS_OFFSET		0x01a2
+#define SNB_SPCICMD_OFFSET		0x0504
+#define SNB_DEVCTRL_OFFSET		0x0598
+#define SNB_DEVSTS_OFFSET		0x059a
+#define SNB_SLINK_STATUS_OFFSET		0x05a2
+#define SNB_B2B_SPAD_OFFSET		0x0100
+#define SNB_B2B_DOORBELL_OFFSET		0x0140
+#define SNB_B2B_XLAT_OFFSETL		0x0144
+#define SNB_B2B_XLAT_OFFSETU		0x0148
+#define SNB_PPD_CONN_MASK		0x03
+#define SNB_PPD_CONN_TRANSPARENT	0x00
+#define SNB_PPD_CONN_B2B		0x01
+#define SNB_PPD_CONN_RP			0x02
+#define SNB_PPD_DEV_MASK		0x10
+#define SNB_PPD_DEV_USD			0x00
+#define SNB_PPD_DEV_DSD			0x10
+#define SNB_PPD_SPLIT_BAR_MASK		0x40
+
+#define SNB_PPD_TOPO_MASK	(SNB_PPD_CONN_MASK | SNB_PPD_DEV_MASK)
+#define SNB_PPD_TOPO_PRI_USD	(SNB_PPD_CONN_RP | SNB_PPD_DEV_USD)
+#define SNB_PPD_TOPO_PRI_DSD	(SNB_PPD_CONN_RP | SNB_PPD_DEV_DSD)
+#define SNB_PPD_TOPO_SEC_USD	(SNB_PPD_CONN_TRANSPARENT | SNB_PPD_DEV_USD)
+#define SNB_PPD_TOPO_SEC_DSD	(SNB_PPD_CONN_TRANSPARENT | SNB_PPD_DEV_DSD)
+#define SNB_PPD_TOPO_B2B_USD	(SNB_PPD_CONN_B2B | SNB_PPD_DEV_USD)
+#define SNB_PPD_TOPO_B2B_DSD	(SNB_PPD_CONN_B2B | SNB_PPD_DEV_DSD)
+
+#define SNB_MW_COUNT			2
+#define HSX_SPLIT_BAR_MW_COUNT		3
+#define SNB_DB_COUNT			15
+#define SNB_DB_LINK			15
+#define SNB_DB_LINK_BIT			BIT_ULL(SNB_DB_LINK)
+#define SNB_DB_MSIX_VECTOR_COUNT	4
+#define SNB_DB_MSIX_VECTOR_SHIFT	5
+#define SNB_DB_TOTAL_SHIFT		16
+#define SNB_SPAD_COUNT			16
+
+/* BWD hardware */
+
+#define BWD_SBAR2XLAT_OFFSET		0x0008
+#define BWD_PDOORBELL_OFFSET		0x0020
+#define BWD_PDBMSK_OFFSET		0x0028
+#define BWD_NTBCNTL_OFFSET		0x0060
+#define BWD_SPAD_OFFSET			0x0080
+#define BWD_PPD_OFFSET			0x00d4
+#define BWD_PBAR2XLAT_OFFSET		0x8008
+#define BWD_B2B_DOORBELL_OFFSET		0x8020
+#define BWD_B2B_SPAD_OFFSET		0x8080
+#define BWD_SPCICMD_OFFSET		0xb004
+#define BWD_LINK_STATUS_OFFSET		0xb052
+#define BWD_ERRCORSTS_OFFSET		0xb110
+#define BWD_IP_BASE			0xc000
+#define BWD_DESKEWSTS_OFFSET		(BWD_IP_BASE + 0x3024)
+#define BWD_LTSSMERRSTS0_OFFSET		(BWD_IP_BASE + 0x3180)
 #define BWD_LTSSMSTATEJMP_OFFSET	(BWD_IP_BASE + 0x3040)
 #define BWD_IBSTERRRCRVSTS0_OFFSET	(BWD_IP_BASE + 0x3324)
+#define BWD_MODPHY_PCSREG4		0x1c004
+#define BWD_MODPHY_PCSREG6		0x1c006
+
+#define BWD_PPD_INIT_LINK		0x0008
+#define BWD_PPD_CONN_MASK		0x0300
+#define BWD_PPD_CONN_TRANSPARENT	0x0000
+#define BWD_PPD_CONN_B2B		0x0100
+#define BWD_PPD_CONN_RP			0x0200
+#define BWD_PPD_DEV_MASK		0x1000
+#define BWD_PPD_DEV_USD			0x0000
+#define BWD_PPD_DEV_DSD			0x1000
+#define BWD_PPD_TOPO_MASK	(BWD_PPD_CONN_MASK | BWD_PPD_DEV_MASK)
+#define BWD_PPD_TOPO_PRI_USD	(BWD_PPD_CONN_TRANSPARENT | BWD_PPD_DEV_USD)
+#define BWD_PPD_TOPO_PRI_DSD	(BWD_PPD_CONN_TRANSPARENT | BWD_PPD_DEV_DSD)
+#define BWD_PPD_TOPO_SEC_USD	(BWD_PPD_CONN_RP | BWD_PPD_DEV_USD)
+#define BWD_PPD_TOPO_SEC_DSD	(BWD_PPD_CONN_RP | BWD_PPD_DEV_DSD)
+#define BWD_PPD_TOPO_B2B_USD	(BWD_PPD_CONN_B2B | BWD_PPD_DEV_USD)
+#define BWD_PPD_TOPO_B2B_DSD	(BWD_PPD_CONN_B2B | BWD_PPD_DEV_DSD)
+
+#define BWD_MW_COUNT			2
+#define BWD_DB_COUNT			34
+#define BWD_DB_VALID_MASK		(BIT_ULL(BWD_DB_COUNT) - 1)
+#define BWD_DB_MSIX_VECTOR_COUNT	34
+#define BWD_DB_MSIX_VECTOR_SHIFT	1
+#define BWD_DB_TOTAL_SHIFT		34
+#define BWD_SPAD_COUNT			16
+
+#define BWD_NTB_CTL_DOWN_BIT		BIT(16)
+#define BWD_NTB_CTL_ACTIVE(x)		!(x & BWD_NTB_CTL_DOWN_BIT)
+
+#define BWD_DESKEWSTS_DBERR		BIT(15)
+#define BWD_LTSSMERRSTS0_UNEXPECTEDEI	BIT(20)
+#define BWD_LTSSMSTATEJMP_FORCEDETECT	BIT(2)
+#define BWD_IBIST_ERR_OFLOW		0x7FFF7FFF
+
+#define BWD_LINK_HB_TIMEOUT		msecs_to_jiffies(1000)
+#define BWD_LINK_RECOVERY_TIME		msecs_to_jiffies(500)
+
+/* Ntb control and link status */
+
+#define NTB_CTL_CFG_LOCK		BIT(0)
+#define NTB_CTL_DISABLE			BIT(1)
+#define NTB_CTL_S2P_BAR2_SNOOP		BIT(2)
+#define NTB_CTL_P2S_BAR2_SNOOP		BIT(4)
+#define NTB_CTL_S2P_BAR4_SNOOP		BIT(6)
+#define NTB_CTL_P2S_BAR4_SNOOP		BIT(8)
+#define NTB_CTL_S2P_BAR5_SNOOP		BIT(12)
+#define NTB_CTL_P2S_BAR5_SNOOP		BIT(14)
+
+#define NTB_LNK_STA_ACTIVE_BIT		0x2000
+#define NTB_LNK_STA_SPEED_MASK		0x000f
+#define NTB_LNK_STA_WIDTH_MASK		0x03f0
+#define NTB_LNK_STA_ACTIVE(x)		(!!((x) & NTB_LNK_STA_ACTIVE_BIT))
+#define NTB_LNK_STA_SPEED(x)		((x) & NTB_LNK_STA_SPEED_MASK)
+#define NTB_LNK_STA_WIDTH(x)		(((x) & NTB_LNK_STA_WIDTH_MASK) >> 4)
+
+/* Use the following addresses for translation between b2b ntb devices in case
+ * the hardware default values are not reliable. */
+#define SNB_B2B_BAR0_USD_ADDR		0x1000000000000000ull
+#define SNB_B2B_BAR2_USD_ADDR64		0x2000000000000000ull
+#define SNB_B2B_BAR4_USD_ADDR64		0x4000000000000000ull
+#define SNB_B2B_BAR4_USD_ADDR32		0x20000000u
+#define SNB_B2B_BAR5_USD_ADDR32		0x40000000u
+#define SNB_B2B_BAR0_DSD_ADDR		0x9000000000000000ull
+#define SNB_B2B_BAR2_DSD_ADDR64		0xa000000000000000ull
+#define SNB_B2B_BAR4_DSD_ADDR64		0xc000000000000000ull
+#define SNB_B2B_BAR4_DSD_ADDR32		0xa0000000u
+#define SNB_B2B_BAR5_DSD_ADDR32		0xc0000000u
+
+/* The peer ntb secondary config space is 32KB fixed size */
+#define SNB_B2B_MIN_SIZE		0x8000
+
+/* flags to indicate hardware errata */
+#define NTB_HWERR_SDOORBELL_LOCKUP	BIT_ULL(0)
+#define NTB_HWERR_SB01BASE_LOCKUP	BIT_ULL(1)
+#define NTB_HWERR_B2BDOORBELL_BIT14	BIT_ULL(2)
+
+/* flags to indicate unsafe api */
+#define NTB_UNSAFE_DB			BIT_ULL(0)
+#define NTB_UNSAFE_SPAD			BIT_ULL(1)
+
+struct intel_ntb_dev;
+
+struct intel_ntb_reg {
+	int (*poll_link)(struct intel_ntb_dev *ndev);
+	int (*link_is_up)(struct intel_ntb_dev *ndev);
+	u64 (*db_ioread)(void __iomem *mmio);
+	void (*db_iowrite)(u64 db_bits, void __iomem *mmio);
+	unsigned long			ntb_ctl;
+	resource_size_t			db_size;
+	int				mw_bar[];
+};
 
-#define BWD_DESKEWSTS_DBERR	(1 << 15)
-#define BWD_LTSSMERRSTS0_UNEXPECTEDEI	(1 << 20)
-#define BWD_LTSSMSTATEJMP_FORCEDETECT	(1 << 2)
-#define BWD_IBIST_ERR_OFLOW	0x7FFF7FFF
-
-#define NTB_CNTL_CFG_LOCK		(1 << 0)
-#define NTB_CNTL_LINK_DISABLE		(1 << 1)
-#define NTB_CNTL_S2P_BAR23_SNOOP	(1 << 2)
-#define NTB_CNTL_P2S_BAR23_SNOOP	(1 << 4)
-#define NTB_CNTL_S2P_BAR4_SNOOP	(1 << 6)
-#define NTB_CNTL_P2S_BAR4_SNOOP	(1 << 8)
-#define NTB_CNTL_S2P_BAR5_SNOOP	(1 << 12)
-#define NTB_CNTL_P2S_BAR5_SNOOP	(1 << 14)
-#define BWD_CNTL_LINK_DOWN		(1 << 16)
-
-#define NTB_PPD_OFFSET		0x00D4
-#define SNB_PPD_CONN_TYPE	0x0003
-#define SNB_PPD_DEV_TYPE	0x0010
-#define SNB_PPD_SPLIT_BAR	(1 << 6)
-#define BWD_PPD_INIT_LINK	0x0008
-#define BWD_PPD_CONN_TYPE	0x0300
-#define BWD_PPD_DEV_TYPE	0x1000
-#define PCI_DEVICE_ID_INTEL_NTB_B2B_JSF		0x3725
-#define PCI_DEVICE_ID_INTEL_NTB_PS_JSF		0x3726
-#define PCI_DEVICE_ID_INTEL_NTB_SS_JSF		0x3727
-#define PCI_DEVICE_ID_INTEL_NTB_B2B_SNB		0x3C0D
-#define PCI_DEVICE_ID_INTEL_NTB_PS_SNB		0x3C0E
-#define PCI_DEVICE_ID_INTEL_NTB_SS_SNB		0x3C0F
-#define PCI_DEVICE_ID_INTEL_NTB_B2B_IVT		0x0E0D
-#define PCI_DEVICE_ID_INTEL_NTB_PS_IVT		0x0E0E
-#define PCI_DEVICE_ID_INTEL_NTB_SS_IVT		0x0E0F
-#define PCI_DEVICE_ID_INTEL_NTB_B2B_HSX		0x2F0D
-#define PCI_DEVICE_ID_INTEL_NTB_PS_HSX		0x2F0E
-#define PCI_DEVICE_ID_INTEL_NTB_SS_HSX		0x2F0F
-#define PCI_DEVICE_ID_INTEL_NTB_B2B_BWD		0x0C4E
-
-#ifndef readq
-static inline u64 readq(void __iomem *addr)
-{
-	return readl(addr) | (((u64) readl(addr + 4)) << 32LL);
-}
-#endif
-
-#ifndef writeq
-static inline void writeq(u64 val, void __iomem *addr)
-{
-	writel(val & 0xffffffff, addr);
-	writel(val >> 32, addr + 4);
-}
-#endif
+struct intel_ntb_alt_reg {
+	unsigned long			db_bell;
+	unsigned long			db_mask;
+	unsigned long			spad;
+};
 
-#define NTB_BAR_MMIO		0
-#define NTB_BAR_23		2
-#define NTB_BAR_4		4
-#define NTB_BAR_5		5
-
-#define NTB_BAR_MASK		((1 << NTB_BAR_MMIO) | (1 << NTB_BAR_23) |\
-				 (1 << NTB_BAR_4))
-#define NTB_SPLITBAR_MASK	((1 << NTB_BAR_MMIO) | (1 << NTB_BAR_23) |\
-				 (1 << NTB_BAR_4) | (1 << NTB_BAR_5))
-
-#define NTB_HB_TIMEOUT		msecs_to_jiffies(1000)
-
-enum ntb_hw_event {
-	NTB_EVENT_SW_EVENT0 = 0,
-	NTB_EVENT_SW_EVENT1,
-	NTB_EVENT_SW_EVENT2,
-	NTB_EVENT_HW_ERROR,
-	NTB_EVENT_HW_LINK_UP,
-	NTB_EVENT_HW_LINK_DOWN,
+struct intel_ntb_xlat_reg {
+	unsigned long			bar0_base;
+	unsigned long			bar2_xlat;
+	unsigned long			bar2_limit;
 };
 
-struct ntb_mw {
-	dma_addr_t phys_addr;
-	void __iomem *vbase;
-	resource_size_t bar_sz;
+struct intel_b2b_addr {
+	phys_addr_t			bar0_addr;
+	phys_addr_t			bar2_addr64;
+	phys_addr_t			bar4_addr64;
+	phys_addr_t			bar4_addr32;
+	phys_addr_t			bar5_addr32;
 };
 
-struct ntb_db_cb {
-	int (*callback)(void *data, int db_num);
-	unsigned int db_num;
-	void *data;
-	struct ntb_device *ndev;
-	struct tasklet_struct irq_work;
+struct intel_ntb_vec {
+	struct intel_ntb_dev		*ndev;
+	int				num;
 };
 
-#define WA_SNB_ERR	0x00000001
-
-struct ntb_device {
-	struct pci_dev *pdev;
-	struct msix_entry *msix_entries;
-	void __iomem *reg_base;
-	struct ntb_mw *mw;
-	struct {
-		unsigned char max_mw;
-		unsigned char max_spads;
-		unsigned char max_db_bits;
-		unsigned char msix_cnt;
-	} limits;
-	struct {
-		void __iomem *ldb;
-		void __iomem *ldb_mask;
-		void __iomem *rdb;
-		void __iomem *bar2_xlat;
-		void __iomem *bar4_xlat;
-		void __iomem *bar5_xlat;
-		void __iomem *spad_write;
-		void __iomem *spad_read;
-		void __iomem *lnk_cntl;
-		void __iomem *lnk_stat;
-		void __iomem *spci_cmd;
-	} reg_ofs;
-	struct ntb_transport *ntb_transport;
-	void (*event_cb)(void *handle, enum ntb_hw_event event);
-
-	struct ntb_db_cb *db_cb;
-	unsigned char hw_type;
-	unsigned char conn_type;
-	unsigned char dev_type;
-	unsigned char num_msix;
-	unsigned char bits_per_vector;
-	unsigned char max_cbs;
-	unsigned char link_width;
-	unsigned char link_speed;
-	unsigned char link_status;
-	unsigned char split_bar;
-
-	struct delayed_work hb_timer;
-	unsigned long last_ts;
-
-	struct delayed_work lr_timer;
-
-	struct dentry *debugfs_dir;
-	struct dentry *debugfs_info;
-
-	unsigned int wa_flags;
+struct intel_ntb_dev {
+	struct ntb_dev			ntb;
+
+	/* offset of peer bar0 in b2b bar */
+	unsigned long			b2b_off;
+	/* mw idx used to access peer bar0 */
+	unsigned int			b2b_idx;
+
+	/* BAR45 is split into BAR4 and BAR5 */
+	bool				bar4_split;
+
+	u32				ntb_ctl;
+	u32				lnk_sta;
+
+	unsigned char			mw_count;
+	unsigned char			spad_count;
+	unsigned char			db_count;
+	unsigned char			db_vec_count;
+	unsigned char			db_vec_shift;
+
+	u64				db_valid_mask;
+	u64				db_link_mask;
+	u64				db_mask;
+
+	/* synchronize rmw access of db_mask and hw reg */
+	spinlock_t			db_mask_lock;
+
+	struct msix_entry		*msix;
+	struct intel_ntb_vec		*vec;
+
+	const struct intel_ntb_reg	*reg;
+	const struct intel_ntb_alt_reg	*self_reg;
+	const struct intel_ntb_alt_reg	*peer_reg;
+	const struct intel_ntb_xlat_reg	*xlat_reg;
+	void				__iomem *self_mmio;
+	void				__iomem *peer_mmio;
+	phys_addr_t			peer_addr;
+
+	unsigned long			last_ts;
+	struct delayed_work		hb_timer;
+
+	unsigned long			hwerr_flags;
+	unsigned long			unsafe_flags;
+	unsigned long			unsafe_flags_ignore;
+
+	struct dentry			*debugfs_dir;
+	struct dentry			*debugfs_info;
 };
 
-/**
- * ntb_max_cbs() - return the max callbacks
- * @ndev: pointer to ntb_device instance
- *
- * Given the ntb pointer, return the maximum number of callbacks
- *
- * RETURNS: the maximum number of callbacks
- */
-static inline unsigned char ntb_max_cbs(struct ntb_device *ndev)
-{
-	return ndev->max_cbs;
-}
-
-/**
- * ntb_max_mw() - return the max number of memory windows
- * @ndev: pointer to ntb_device instance
- *
- * Given the ntb pointer, return the maximum number of memory windows
- *
- * RETURNS: the maximum number of memory windows
- */
-static inline unsigned char ntb_max_mw(struct ntb_device *ndev)
-{
-	return ndev->limits.max_mw;
-}
-
-/**
- * ntb_hw_link_status() - return the hardware link status
- * @ndev: pointer to ntb_device instance
- *
- * Returns true if the hardware is connected to the remote system
- *
- * RETURNS: true or false based on the hardware link state
- */
-static inline bool ntb_hw_link_status(struct ntb_device *ndev)
-{
-	return ndev->link_status == NTB_LINK_UP;
-}
-
-/**
- * ntb_query_pdev() - return the pci_dev pointer
- * @ndev: pointer to ntb_device instance
- *
- * Given the ntb pointer, return the pci_dev pointer for the NTB hardware device
- *
- * RETURNS: a pointer to the ntb pci_dev
- */
-static inline struct pci_dev *ntb_query_pdev(struct ntb_device *ndev)
-{
-	return ndev->pdev;
-}
-
-/**
- * ntb_query_debugfs() - return the debugfs pointer
- * @ndev: pointer to ntb_device instance
- *
- * Given the ntb pointer, return the debugfs directory pointer for the NTB
- * hardware device
- *
- * RETURNS: a pointer to the debugfs directory
- */
-static inline struct dentry *ntb_query_debugfs(struct ntb_device *ndev)
-{
-	return ndev->debugfs_dir;
-}
-
-struct ntb_device *ntb_register_transport(struct pci_dev *pdev,
-					  void *transport);
-void ntb_unregister_transport(struct ntb_device *ndev);
-void ntb_set_mw_addr(struct ntb_device *ndev, unsigned int mw, u64 addr);
-int ntb_register_db_callback(struct ntb_device *ndev, unsigned int idx,
-			     void *data, int (*db_cb_func)(void *data,
-							   int db_num));
-void ntb_unregister_db_callback(struct ntb_device *ndev, unsigned int idx);
-int ntb_register_event_callback(struct ntb_device *ndev,
-				void (*event_cb_func)(void *handle,
-						      enum ntb_hw_event event));
-void ntb_unregister_event_callback(struct ntb_device *ndev);
-int ntb_get_max_spads(struct ntb_device *ndev);
-int ntb_write_local_spad(struct ntb_device *ndev, unsigned int idx, u32 val);
-int ntb_read_local_spad(struct ntb_device *ndev, unsigned int idx, u32 *val);
-int ntb_write_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 val);
-int ntb_read_remote_spad(struct ntb_device *ndev, unsigned int idx, u32 *val);
-resource_size_t ntb_get_mw_base(struct ntb_device *ndev, unsigned int mw);
-void __iomem *ntb_get_mw_vbase(struct ntb_device *ndev, unsigned int mw);
-u64 ntb_get_mw_size(struct ntb_device *ndev, unsigned int mw);
-void ntb_ring_doorbell(struct ntb_device *ndev, unsigned int idx);
-void *ntb_find_transport(struct pci_dev *pdev);
-
-int ntb_transport_init(struct pci_dev *pdev);
-void ntb_transport_free(void *transport);
+#define ndev_pdev(ndev) ((ndev)->ntb.pdev)
+#define ndev_name(ndev) pci_name(ndev_pdev(ndev))
+#define ndev_dev(ndev) (&ndev_pdev(ndev)->dev)
+#define ntb_ndev(ntb) container_of(ntb, struct intel_ntb_dev, ntb)
+#define hb_ndev(work) container_of(work, struct intel_ntb_dev, hb_timer.work)
+
+#endif
diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
index c5f26cda9f97..9faf1c6029af 100644
--- a/drivers/ntb/ntb_transport.c
+++ b/drivers/ntb/ntb_transport.c
@@ -5,6 +5,7 @@
  *   GPL LICENSE SUMMARY
  *
  *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
  *
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of version 2 of the GNU General Public License as
@@ -13,6 +14,7 @@
  *   BSD LICENSE
  *
  *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
  *
  *   Redistribution and use in source and binary forms, with or without
  *   modification, are permitted provided that the following conditions
@@ -40,7 +42,7 @@
  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * Intel PCIe NTB Linux driver
+ * PCIe NTB Transport Linux driver
  *
  * Contact Information:
  * Jon Mason <jon.mason@intel.com>
@@ -56,9 +58,22 @@
 #include <linux/pci.h>
 #include <linux/slab.h>
 #include <linux/types.h>
-#include "hw/intel/ntb_hw_intel.h"
+#include "linux/ntb.h"
+#include "linux/ntb_transport.h"
 
-#define NTB_TRANSPORT_VERSION	3
+#define NTB_TRANSPORT_VERSION	4
+#define NTB_TRANSPORT_VER	"4"
+#define NTB_TRANSPORT_NAME	"ntb_transport"
+#define NTB_TRANSPORT_DESC	"Software Queue-Pair Transport over NTB"
+
+MODULE_DESCRIPTION(NTB_TRANSPORT_DESC);
+MODULE_VERSION(NTB_TRANSPORT_VER);
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Intel Corporation");
+
+static unsigned long max_mw_size;
+module_param(max_mw_size, ulong, 0644);
+MODULE_PARM_DESC(max_mw_size, "Limit size of large memory windows");
 
 static unsigned int transport_mtu = 0x401E;
 module_param(transport_mtu, uint, 0644);
@@ -72,10 +87,12 @@ static unsigned int copy_bytes = 1024;
 module_param(copy_bytes, uint, 0644);
 MODULE_PARM_DESC(copy_bytes, "Threshold under which NTB will use the CPU to copy instead of DMA");
 
+static struct dentry *nt_debugfs_dir;
+
 struct ntb_queue_entry {
 	/* ntb_queue list reference */
 	struct list_head entry;
-	/* pointers to data to be transfered */
+	/* pointers to data to be transferred */
 	void *cb_data;
 	void *buf;
 	unsigned int len;
@@ -94,14 +111,16 @@ struct ntb_rx_info {
 };
 
 struct ntb_transport_qp {
-	struct ntb_transport *transport;
-	struct ntb_device *ndev;
+	struct ntb_transport_ctx *transport;
+	struct ntb_dev *ndev;
 	void *cb_data;
 	struct dma_chan *dma_chan;
 
 	bool client_ready;
-	bool qp_link;
+	bool link_is_up;
+
 	u8 qp_num;	/* Only 64 QP's are allowed.  0-63 */
+	u64 qp_bit;
 
 	struct ntb_rx_info __iomem *rx_info;
 	struct ntb_rx_info *remote_rx_info;
@@ -127,6 +146,7 @@ struct ntb_transport_qp {
 	unsigned int rx_max_entry;
 	unsigned int rx_max_frame;
 	dma_cookie_t last_cookie;
+	struct tasklet_struct rxc_db_work;
 
 	void (*event_handler)(void *data, int status);
 	struct delayed_work link_work;
@@ -153,33 +173,44 @@ struct ntb_transport_qp {
 };
 
 struct ntb_transport_mw {
-	size_t size;
+	phys_addr_t phys_addr;
+	resource_size_t phys_size;
+	resource_size_t xlat_align;
+	resource_size_t xlat_align_size;
+	void __iomem *vbase;
+	size_t xlat_size;
+	size_t buff_size;
 	void *virt_addr;
 	dma_addr_t dma_addr;
 };
 
 struct ntb_transport_client_dev {
 	struct list_head entry;
+	struct ntb_transport_ctx *nt;
 	struct device dev;
 };
 
-struct ntb_transport {
+struct ntb_transport_ctx {
 	struct list_head entry;
 	struct list_head client_devs;
 
-	struct ntb_device *ndev;
-	struct ntb_transport_mw *mw;
-	struct ntb_transport_qp *qps;
-	unsigned int max_qps;
-	unsigned long qp_bitmap;
-	bool transport_link;
+	struct ntb_dev *ndev;
+
+	struct ntb_transport_mw *mw_vec;
+	struct ntb_transport_qp *qp_vec;
+	unsigned int mw_count;
+	unsigned int qp_count;
+	u64 qp_bitmap;
+	u64 qp_bitmap_free;
+
+	bool link_is_up;
 	struct delayed_work link_work;
 	struct work_struct link_cleanup;
 };
 
 enum {
-	DESC_DONE_FLAG = 1 << 0,
-	LINK_DOWN_FLAG = 1 << 1,
+	DESC_DONE_FLAG = BIT(0),
+	LINK_DOWN_FLAG = BIT(1),
 };
 
 struct ntb_payload_header {
@@ -200,68 +231,69 @@ enum {
 	MAX_SPAD,
 };
 
-#define QP_TO_MW(ndev, qp)	((qp) % ntb_max_mw(ndev))
+#define dev_client_dev(__dev) \
+	container_of((__dev), struct ntb_transport_client_dev, dev)
+
+#define drv_client(__drv) \
+	container_of((__drv), struct ntb_transport_client, driver)
+
+#define QP_TO_MW(nt, qp)	((qp) % nt->mw_count)
 #define NTB_QP_DEF_NUM_ENTRIES	100
 #define NTB_LINK_DOWN_TIMEOUT	10
 
-static int ntb_match_bus(struct device *dev, struct device_driver *drv)
+static void ntb_transport_rxc_db(unsigned long data);
+static const struct ntb_ctx_ops ntb_transport_ops;
+static struct ntb_client ntb_transport_client;
+
+static int ntb_transport_bus_match(struct device *dev,
+				   struct device_driver *drv)
 {
 	return !strncmp(dev_name(dev), drv->name, strlen(drv->name));
 }
 
-static int ntb_client_probe(struct device *dev)
+static int ntb_transport_bus_probe(struct device *dev)
 {
-	const struct ntb_client *drv = container_of(dev->driver,
-						    struct ntb_client, driver);
-	struct pci_dev *pdev = container_of(dev->parent, struct pci_dev, dev);
+	const struct ntb_transport_client *client;
 	int rc = -EINVAL;
 
 	get_device(dev);
-	if (drv && drv->probe)
-		rc = drv->probe(pdev);
+
+	client = drv_client(dev->driver);
+	rc = client->probe(dev);
 	if (rc)
 		put_device(dev);
 
 	return rc;
 }
 
-static int ntb_client_remove(struct device *dev)
+static int ntb_transport_bus_remove(struct device *dev)
 {
-	const struct ntb_client *drv = container_of(dev->driver,
-						    struct ntb_client, driver);
-	struct pci_dev *pdev = container_of(dev->parent, struct pci_dev, dev);
+	const struct ntb_transport_client *client;
 
-	if (drv && drv->remove)
-		drv->remove(pdev);
+	client = drv_client(dev->driver);
+	client->remove(dev);
 
 	put_device(dev);
 
 	return 0;
 }
 
-static struct bus_type ntb_bus_type = {
-	.name = "ntb_bus",
-	.match = ntb_match_bus,
-	.probe = ntb_client_probe,
-	.remove = ntb_client_remove,
+static struct bus_type ntb_transport_bus = {
+	.name = "ntb_transport",
+	.match = ntb_transport_bus_match,
+	.probe = ntb_transport_bus_probe,
+	.remove = ntb_transport_bus_remove,
 };
 
 static LIST_HEAD(ntb_transport_list);
 
-static int ntb_bus_init(struct ntb_transport *nt)
+static int ntb_bus_init(struct ntb_transport_ctx *nt)
 {
-	if (list_empty(&ntb_transport_list)) {
-		int rc = bus_register(&ntb_bus_type);
-		if (rc)
-			return rc;
-	}
-
 	list_add(&nt->entry, &ntb_transport_list);
-
 	return 0;
 }
 
-static void ntb_bus_remove(struct ntb_transport *nt)
+static void ntb_bus_remove(struct ntb_transport_ctx *nt)
 {
 	struct ntb_transport_client_dev *client_dev, *cd;
 
@@ -273,29 +305,26 @@ static void ntb_bus_remove(struct ntb_transport *nt)
 	}
 
 	list_del(&nt->entry);
-
-	if (list_empty(&ntb_transport_list))
-		bus_unregister(&ntb_bus_type);
 }
 
-static void ntb_client_release(struct device *dev)
+static void ntb_transport_client_release(struct device *dev)
 {
 	struct ntb_transport_client_dev *client_dev;
-	client_dev = container_of(dev, struct ntb_transport_client_dev, dev);
 
+	client_dev = dev_client_dev(dev);
 	kfree(client_dev);
 }
 
 /**
- * ntb_unregister_client_dev - Unregister NTB client device
+ * ntb_transport_unregister_client_dev - Unregister NTB client device
  * @device_name: Name of NTB client device
  *
  * Unregister an NTB client device with the NTB transport layer
  */
-void ntb_unregister_client_dev(char *device_name)
+void ntb_transport_unregister_client_dev(char *device_name)
 {
 	struct ntb_transport_client_dev *client, *cd;
-	struct ntb_transport *nt;
+	struct ntb_transport_ctx *nt;
 
 	list_for_each_entry(nt, &ntb_transport_list, entry)
 		list_for_each_entry_safe(client, cd, &nt->client_devs, entry)
@@ -305,18 +334,18 @@ void ntb_unregister_client_dev(char *device_name)
 				device_unregister(&client->dev);
 			}
 }
-EXPORT_SYMBOL_GPL(ntb_unregister_client_dev);
+EXPORT_SYMBOL_GPL(ntb_transport_unregister_client_dev);
 
 /**
- * ntb_register_client_dev - Register NTB client device
+ * ntb_transport_register_client_dev - Register NTB client device
  * @device_name: Name of NTB client device
  *
  * Register an NTB client device with the NTB transport layer
  */
-int ntb_register_client_dev(char *device_name)
+int ntb_transport_register_client_dev(char *device_name)
 {
 	struct ntb_transport_client_dev *client_dev;
-	struct ntb_transport *nt;
+	struct ntb_transport_ctx *nt;
 	int rc, i = 0;
 
 	if (list_empty(&ntb_transport_list))
@@ -325,7 +354,7 @@ int ntb_register_client_dev(char *device_name)
 	list_for_each_entry(nt, &ntb_transport_list, entry) {
 		struct device *dev;
 
-		client_dev = kzalloc(sizeof(struct ntb_transport_client_dev),
+		client_dev = kzalloc(sizeof(*client_dev),
 				     GFP_KERNEL);
 		if (!client_dev) {
 			rc = -ENOMEM;
@@ -336,9 +365,9 @@ int ntb_register_client_dev(char *device_name)
 
 		/* setup and register client devices */
 		dev_set_name(dev, "%s%d", device_name, i);
-		dev->bus = &ntb_bus_type;
-		dev->release = ntb_client_release;
-		dev->parent = &ntb_query_pdev(nt->ndev)->dev;
+		dev->bus = &ntb_transport_bus;
+		dev->release = ntb_transport_client_release;
+		dev->parent = &nt->ndev->dev;
 
 		rc = device_register(dev);
 		if (rc) {
@@ -353,11 +382,11 @@ int ntb_register_client_dev(char *device_name)
 	return 0;
 
 err:
-	ntb_unregister_client_dev(device_name);
+	ntb_transport_unregister_client_dev(device_name);
 
 	return rc;
 }
-EXPORT_SYMBOL_GPL(ntb_register_client_dev);
+EXPORT_SYMBOL_GPL(ntb_transport_register_client_dev);
 
 /**
  * ntb_transport_register_client - Register NTB client driver
@@ -367,9 +396,9 @@ EXPORT_SYMBOL_GPL(ntb_register_client_dev);
  *
  * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
  */
-int ntb_transport_register_client(struct ntb_client *drv)
+int ntb_transport_register_client(struct ntb_transport_client *drv)
 {
-	drv->driver.bus = &ntb_bus_type;
+	drv->driver.bus = &ntb_transport_bus;
 
 	if (list_empty(&ntb_transport_list))
 		return -ENODEV;
@@ -386,7 +415,7 @@ EXPORT_SYMBOL_GPL(ntb_transport_register_client);
  *
  * RETURNS: An appropriate -ERRNO error value on error, or zero for success.
  */
-void ntb_transport_unregister_client(struct ntb_client *drv)
+void ntb_transport_unregister_client(struct ntb_transport_client *drv)
 {
 	driver_unregister(&drv->driver);
 }
@@ -452,8 +481,8 @@ static ssize_t debugfs_read(struct file *filp, char __user *ubuf, size_t count,
 			       "tx_max_entry - \t%u\n", qp->tx_max_entry);
 
 	out_offset += snprintf(buf + out_offset, out_count - out_offset,
-			       "\nQP Link %s\n", (qp->qp_link == NTB_LINK_UP) ?
-			       "Up" : "Down");
+			       "\nQP Link %s\n",
+			       qp->link_is_up ? "Up" : "Down");
 	if (out_offset > out_count)
 		out_offset = out_count;
 
@@ -497,26 +526,31 @@ out:
 	return entry;
 }
 
-static void ntb_transport_setup_qp_mw(struct ntb_transport *nt,
-				      unsigned int qp_num)
+static int ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt,
+				     unsigned int qp_num)
 {
-	struct ntb_transport_qp *qp = &nt->qps[qp_num];
+	struct ntb_transport_qp *qp = &nt->qp_vec[qp_num];
+	struct ntb_transport_mw *mw;
 	unsigned int rx_size, num_qps_mw;
-	u8 mw_num, mw_max;
+	unsigned int mw_num, mw_count, qp_count;
 	unsigned int i;
 
-	mw_max = ntb_max_mw(nt->ndev);
-	mw_num = QP_TO_MW(nt->ndev, qp_num);
+	mw_count = nt->mw_count;
+	qp_count = nt->qp_count;
 
-	WARN_ON(nt->mw[mw_num].virt_addr == NULL);
+	mw_num = QP_TO_MW(nt, qp_num);
+	mw = &nt->mw_vec[mw_num];
+
+	if (!mw->virt_addr)
+		return -ENOMEM;
 
-	if (nt->max_qps % mw_max && mw_num + 1 < nt->max_qps / mw_max)
-		num_qps_mw = nt->max_qps / mw_max + 1;
+	if (qp_count % mw_count && mw_num + 1 < qp_count / mw_count)
+		num_qps_mw = qp_count / mw_count + 1;
 	else
-		num_qps_mw = nt->max_qps / mw_max;
+		num_qps_mw = qp_count / mw_count;
 
-	rx_size = (unsigned int) nt->mw[mw_num].size / num_qps_mw;
-	qp->rx_buff = nt->mw[mw_num].virt_addr + qp_num / mw_max * rx_size;
+	rx_size = (unsigned int)mw->xlat_size / num_qps_mw;
+	qp->rx_buff = mw->virt_addr + rx_size * qp_num / mw_count;
 	rx_size -= sizeof(struct ntb_rx_info);
 
 	qp->remote_rx_info = qp->rx_buff + rx_size;
@@ -530,49 +564,63 @@ static void ntb_transport_setup_qp_mw(struct ntb_transport *nt,
 
 	/* setup the hdr offsets with 0's */
 	for (i = 0; i < qp->rx_max_entry; i++) {
-		void *offset = qp->rx_buff + qp->rx_max_frame * (i + 1) -
-			       sizeof(struct ntb_payload_header);
+		void *offset = (qp->rx_buff + qp->rx_max_frame * (i + 1) -
+				sizeof(struct ntb_payload_header));
 		memset(offset, 0, sizeof(struct ntb_payload_header));
 	}
 
 	qp->rx_pkts = 0;
 	qp->tx_pkts = 0;
 	qp->tx_index = 0;
+
+	return 0;
 }
 
-static void ntb_free_mw(struct ntb_transport *nt, int num_mw)
+static void ntb_free_mw(struct ntb_transport_ctx *nt, int num_mw)
 {
-	struct ntb_transport_mw *mw = &nt->mw[num_mw];
-	struct pci_dev *pdev = ntb_query_pdev(nt->ndev);
+	struct ntb_transport_mw *mw = &nt->mw_vec[num_mw];
+	struct pci_dev *pdev = nt->ndev->pdev;
 
 	if (!mw->virt_addr)
 		return;
 
-	dma_free_coherent(&pdev->dev, mw->size, mw->virt_addr, mw->dma_addr);
+	ntb_mw_clear_trans(nt->ndev, num_mw);
+	dma_free_coherent(&pdev->dev, mw->buff_size,
+			  mw->virt_addr, mw->dma_addr);
+	mw->xlat_size = 0;
+	mw->buff_size = 0;
 	mw->virt_addr = NULL;
 }
 
-static int ntb_set_mw(struct ntb_transport *nt, int num_mw, unsigned int size)
+static int ntb_set_mw(struct ntb_transport_ctx *nt, int num_mw,
+		      unsigned int size)
 {
-	struct ntb_transport_mw *mw = &nt->mw[num_mw];
-	struct pci_dev *pdev = ntb_query_pdev(nt->ndev);
+	struct ntb_transport_mw *mw = &nt->mw_vec[num_mw];
+	struct pci_dev *pdev = nt->ndev->pdev;
+	unsigned int xlat_size, buff_size;
+	int rc;
+
+	xlat_size = round_up(size, mw->xlat_align_size);
+	buff_size = round_up(size, mw->xlat_align);
 
 	/* No need to re-setup */
-	if (mw->size == ALIGN(size, 4096))
+	if (mw->xlat_size == xlat_size)
 		return 0;
 
-	if (mw->size != 0)
+	if (mw->buff_size)
 		ntb_free_mw(nt, num_mw);
 
-	/* Alloc memory for receiving data.  Must be 4k aligned */
-	mw->size = ALIGN(size, 4096);
+	/* Alloc memory for receiving data.  Must be aligned */
+	mw->xlat_size = xlat_size;
+	mw->buff_size = buff_size;
 
-	mw->virt_addr = dma_alloc_coherent(&pdev->dev, mw->size, &mw->dma_addr,
-					   GFP_KERNEL);
+	mw->virt_addr = dma_alloc_coherent(&pdev->dev, buff_size,
+					   &mw->dma_addr, GFP_KERNEL);
 	if (!mw->virt_addr) {
-		mw->size = 0;
-		dev_err(&pdev->dev, "Unable to allocate MW buffer of size %d\n",
-		       (int) mw->size);
+		mw->xlat_size = 0;
+		mw->buff_size = 0;
+		dev_err(&pdev->dev, "Unable to alloc MW buff of size %d\n",
+			buff_size);
 		return -ENOMEM;
 	}
 
@@ -582,34 +630,39 @@ static int ntb_set_mw(struct ntb_transport *nt, int num_mw, unsigned int size)
 	 * is a requirement of the hardware. It is recommended to setup CMA
 	 * for BAR sizes equal or greater than 4MB.
 	 */
-	if (!IS_ALIGNED(mw->dma_addr, mw->size)) {
-		dev_err(&pdev->dev, "DMA memory %pad not aligned to BAR size\n",
+	if (!IS_ALIGNED(mw->dma_addr, mw->xlat_align)) {
+		dev_err(&pdev->dev, "DMA memory %pad is not aligned\n",
 			&mw->dma_addr);
 		ntb_free_mw(nt, num_mw);
 		return -ENOMEM;
 	}
 
 	/* Notify HW the memory location of the receive buffer */
-	ntb_set_mw_addr(nt->ndev, num_mw, mw->dma_addr);
+	rc = ntb_mw_set_trans(nt->ndev, num_mw, mw->dma_addr, mw->xlat_size);
+	if (rc) {
+		dev_err(&pdev->dev, "Unable to set mw%d translation", num_mw);
+		ntb_free_mw(nt, num_mw);
+		return -EIO;
+	}
 
 	return 0;
 }
 
 static void ntb_qp_link_cleanup(struct ntb_transport_qp *qp)
 {
-	struct ntb_transport *nt = qp->transport;
-	struct pci_dev *pdev = ntb_query_pdev(nt->ndev);
+	struct ntb_transport_ctx *nt = qp->transport;
+	struct pci_dev *pdev = nt->ndev->pdev;
 
-	if (qp->qp_link == NTB_LINK_DOWN) {
+	if (qp->link_is_up) {
 		cancel_delayed_work_sync(&qp->link_work);
 		return;
 	}
 
-	if (qp->event_handler)
-		qp->event_handler(qp->cb_data, NTB_LINK_DOWN);
-
 	dev_info(&pdev->dev, "qp %d: Link Down\n", qp->qp_num);
-	qp->qp_link = NTB_LINK_DOWN;
+	qp->link_is_up = false;
+
+	if (qp->event_handler)
+		qp->event_handler(qp->cb_data, qp->link_is_up);
 }
 
 static void ntb_qp_link_cleanup_work(struct work_struct *work)
@@ -617,11 +670,11 @@ static void ntb_qp_link_cleanup_work(struct work_struct *work)
 	struct ntb_transport_qp *qp = container_of(work,
 						   struct ntb_transport_qp,
 						   link_cleanup);
-	struct ntb_transport *nt = qp->transport;
+	struct ntb_transport_ctx *nt = qp->transport;
 
 	ntb_qp_link_cleanup(qp);
 
-	if (nt->transport_link == NTB_LINK_UP)
+	if (nt->link_is_up)
 		schedule_delayed_work(&qp->link_work,
 				      msecs_to_jiffies(NTB_LINK_DOWN_TIMEOUT));
 }
@@ -631,180 +684,132 @@ static void ntb_qp_link_down(struct ntb_transport_qp *qp)
 	schedule_work(&qp->link_cleanup);
 }
 
-static void ntb_transport_link_cleanup(struct ntb_transport *nt)
+static void ntb_transport_link_cleanup(struct ntb_transport_ctx *nt)
 {
+	struct ntb_transport_qp *qp;
+	u64 qp_bitmap_alloc;
 	int i;
 
+	qp_bitmap_alloc = nt->qp_bitmap & ~nt->qp_bitmap_free;
+
 	/* Pass along the info to any clients */
-	for (i = 0; i < nt->max_qps; i++)
-		if (!test_bit(i, &nt->qp_bitmap))
-			ntb_qp_link_cleanup(&nt->qps[i]);
+	for (i = 0; i < nt->qp_count; i++)
+		if (qp_bitmap_alloc & BIT_ULL(i)) {
+			qp = &nt->qp_vec[i];
+			ntb_qp_link_cleanup(qp);
+			cancel_work_sync(&qp->link_cleanup);
+			cancel_delayed_work_sync(&qp->link_work);
+		}
 
-	if (nt->transport_link == NTB_LINK_DOWN)
+	if (!nt->link_is_up)
 		cancel_delayed_work_sync(&nt->link_work);
-	else
-		nt->transport_link = NTB_LINK_DOWN;
 
 	/* The scratchpad registers keep the values if the remote side
 	 * goes down, blast them now to give them a sane value the next
 	 * time they are accessed
 	 */
 	for (i = 0; i < MAX_SPAD; i++)
-		ntb_write_local_spad(nt->ndev, i, 0);
+		ntb_spad_write(nt->ndev, i, 0);
 }
 
 static void ntb_transport_link_cleanup_work(struct work_struct *work)
 {
-	struct ntb_transport *nt = container_of(work, struct ntb_transport,
-						link_cleanup);
+	struct ntb_transport_ctx *nt =
+		container_of(work, struct ntb_transport_ctx, link_cleanup);
 
 	ntb_transport_link_cleanup(nt);
 }
 
-static void ntb_transport_event_callback(void *data, enum ntb_hw_event event)
+static void ntb_transport_event_callback(void *data)
 {
-	struct ntb_transport *nt = data;
+	struct ntb_transport_ctx *nt = data;
 
-	switch (event) {
-	case NTB_EVENT_HW_LINK_UP:
+	if (ntb_link_is_up(nt->ndev, NULL, NULL) == 1)
 		schedule_delayed_work(&nt->link_work, 0);
-		break;
-	case NTB_EVENT_HW_LINK_DOWN:
+	else
 		schedule_work(&nt->link_cleanup);
-		break;
-	default:
-		BUG();
-	}
 }
 
 static void ntb_transport_link_work(struct work_struct *work)
 {
-	struct ntb_transport *nt = container_of(work, struct ntb_transport,
-						link_work.work);
-	struct ntb_device *ndev = nt->ndev;
-	struct pci_dev *pdev = ntb_query_pdev(ndev);
+	struct ntb_transport_ctx *nt =
+		container_of(work, struct ntb_transport_ctx, link_work.work);
+	struct ntb_dev *ndev = nt->ndev;
+	struct pci_dev *pdev = ndev->pdev;
+	resource_size_t size;
 	u32 val;
-	int rc, i;
+	int rc, i, spad;
 
 	/* send the local info, in the opposite order of the way we read it */
-	for (i = 0; i < ntb_max_mw(ndev); i++) {
-		rc = ntb_write_remote_spad(ndev, MW0_SZ_HIGH + (i * 2),
-					   ntb_get_mw_size(ndev, i) >> 32);
-		if (rc) {
-			dev_err(&pdev->dev, "Error writing %u to remote spad %d\n",
-				(u32)(ntb_get_mw_size(ndev, i) >> 32),
-				MW0_SZ_HIGH + (i * 2));
-			goto out;
-		}
+	for (i = 0; i < nt->mw_count; i++) {
+		size = nt->mw_vec[i].phys_size;
 
-		rc = ntb_write_remote_spad(ndev, MW0_SZ_LOW + (i * 2),
-					   (u32) ntb_get_mw_size(ndev, i));
-		if (rc) {
-			dev_err(&pdev->dev, "Error writing %u to remote spad %d\n",
-				(u32) ntb_get_mw_size(ndev, i),
-				MW0_SZ_LOW + (i * 2));
-			goto out;
-		}
-	}
+		if (max_mw_size && size > max_mw_size)
+			size = max_mw_size;
 
-	rc = ntb_write_remote_spad(ndev, NUM_MWS, ntb_max_mw(ndev));
-	if (rc) {
-		dev_err(&pdev->dev, "Error writing %x to remote spad %d\n",
-			ntb_max_mw(ndev), NUM_MWS);
-		goto out;
-	}
+		spad = MW0_SZ_HIGH + (i * 2);
+		ntb_peer_spad_write(ndev, spad, (u32)(size >> 32));
 
-	rc = ntb_write_remote_spad(ndev, NUM_QPS, nt->max_qps);
-	if (rc) {
-		dev_err(&pdev->dev, "Error writing %x to remote spad %d\n",
-			nt->max_qps, NUM_QPS);
-		goto out;
+		spad = MW0_SZ_LOW + (i * 2);
+		ntb_peer_spad_write(ndev, spad, (u32)size);
 	}
 
-	rc = ntb_write_remote_spad(ndev, VERSION, NTB_TRANSPORT_VERSION);
-	if (rc) {
-		dev_err(&pdev->dev, "Error writing %x to remote spad %d\n",
-			NTB_TRANSPORT_VERSION, VERSION);
-		goto out;
-	}
+	ntb_peer_spad_write(ndev, NUM_MWS, nt->mw_count);
 
-	/* Query the remote side for its info */
-	rc = ntb_read_remote_spad(ndev, VERSION, &val);
-	if (rc) {
-		dev_err(&pdev->dev, "Error reading remote spad %d\n", VERSION);
-		goto out;
-	}
+	ntb_peer_spad_write(ndev, NUM_QPS, nt->qp_count);
 
-	if (val != NTB_TRANSPORT_VERSION)
-		goto out;
-	dev_dbg(&pdev->dev, "Remote version = %d\n", val);
+	ntb_peer_spad_write(ndev, VERSION, NTB_TRANSPORT_VERSION);
 
-	rc = ntb_read_remote_spad(ndev, NUM_QPS, &val);
-	if (rc) {
-		dev_err(&pdev->dev, "Error reading remote spad %d\n", NUM_QPS);
+	/* Query the remote side for its info */
+	val = ntb_peer_spad_read(ndev, VERSION);
+	dev_dbg(&pdev->dev, "Remote version = %d\n", val);
+	if (val != NTB_TRANSPORT_VERSION)
 		goto out;
-	}
 
-	if (val != nt->max_qps)
-		goto out;
+	val = ntb_peer_spad_read(ndev, NUM_QPS);
 	dev_dbg(&pdev->dev, "Remote max number of qps = %d\n", val);
-
-	rc = ntb_read_remote_spad(ndev, NUM_MWS, &val);
-	if (rc) {
-		dev_err(&pdev->dev, "Error reading remote spad %d\n", NUM_MWS);
+	if (val != nt->qp_count)
 		goto out;
-	}
 
-	if (val != ntb_max_mw(ndev))
-		goto out;
+	val = ntb_peer_spad_read(ndev, NUM_MWS);
 	dev_dbg(&pdev->dev, "Remote number of mws = %d\n", val);
+	if (val != nt->mw_count)
+		goto out;
 
-	for (i = 0; i < ntb_max_mw(ndev); i++) {
+	for (i = 0; i < nt->mw_count; i++) {
 		u64 val64;
 
-		rc = ntb_read_remote_spad(ndev, MW0_SZ_HIGH + (i * 2), &val);
-		if (rc) {
-			dev_err(&pdev->dev, "Error reading remote spad %d\n",
-				MW0_SZ_HIGH + (i * 2));
-			goto out1;
-		}
-
-		val64 = (u64) val << 32;
-
-		rc = ntb_read_remote_spad(ndev, MW0_SZ_LOW + (i * 2), &val);
-		if (rc) {
-			dev_err(&pdev->dev, "Error reading remote spad %d\n",
-				MW0_SZ_LOW + (i * 2));
-			goto out1;
-		}
+		val = ntb_peer_spad_read(ndev, MW0_SZ_HIGH + (i * 2));
+		val64 = (u64)val << 32;
 
+		val = ntb_peer_spad_read(ndev, MW0_SZ_LOW + (i * 2));
 		val64 |= val;
 
-		dev_dbg(&pdev->dev, "Remote MW%d size = %llu\n", i, val64);
+		dev_dbg(&pdev->dev, "Remote MW%d size = %#llx\n", i, val64);
 
 		rc = ntb_set_mw(nt, i, val64);
 		if (rc)
 			goto out1;
 	}
 
-	nt->transport_link = NTB_LINK_UP;
+	nt->link_is_up = true;
 
-	for (i = 0; i < nt->max_qps; i++) {
-		struct ntb_transport_qp *qp = &nt->qps[i];
+	for (i = 0; i < nt->qp_count; i++) {
+		struct ntb_transport_qp *qp = &nt->qp_vec[i];
 
 		ntb_transport_setup_qp_mw(nt, i);
 
-		if (qp->client_ready == NTB_LINK_UP)
+		if (qp->client_ready)
 			schedule_delayed_work(&qp->link_work, 0);
 	}
 
 	return;
 
 out1:
-	for (i = 0; i < ntb_max_mw(ndev); i++)
+	for (i = 0; i < nt->mw_count; i++)
 		ntb_free_mw(nt, i);
 out:
-	if (ntb_hw_link_status(ndev))
+	if (ntb_link_is_up(ndev, NULL, NULL) == 1)
 		schedule_delayed_work(&nt->link_work,
 				      msecs_to_jiffies(NTB_LINK_DOWN_TIMEOUT));
 }
@@ -814,73 +819,73 @@ static void ntb_qp_link_work(struct work_struct *work)
 	struct ntb_transport_qp *qp = container_of(work,
 						   struct ntb_transport_qp,
 						   link_work.work);
-	struct pci_dev *pdev = ntb_query_pdev(qp->ndev);
-	struct ntb_transport *nt = qp->transport;
-	int rc, val;
+	struct pci_dev *pdev = qp->ndev->pdev;
+	struct ntb_transport_ctx *nt = qp->transport;
+	int val;
 
-	WARN_ON(nt->transport_link != NTB_LINK_UP);
+	WARN_ON(!nt->link_is_up);
 
-	rc = ntb_read_local_spad(nt->ndev, QP_LINKS, &val);
-	if (rc) {
-		dev_err(&pdev->dev, "Error reading spad %d\n", QP_LINKS);
-		return;
-	}
+	val = ntb_spad_read(nt->ndev, QP_LINKS);
 
-	rc = ntb_write_remote_spad(nt->ndev, QP_LINKS, val | 1 << qp->qp_num);
-	if (rc)
-		dev_err(&pdev->dev, "Error writing %x to remote spad %d\n",
-			val | 1 << qp->qp_num, QP_LINKS);
+	ntb_peer_spad_write(nt->ndev, QP_LINKS, val | BIT(qp->qp_num));
 
 	/* query remote spad for qp ready bits */
-	rc = ntb_read_remote_spad(nt->ndev, QP_LINKS, &val);
-	if (rc)
-		dev_err(&pdev->dev, "Error reading remote spad %d\n", QP_LINKS);
-
+	ntb_peer_spad_read(nt->ndev, QP_LINKS);
 	dev_dbg(&pdev->dev, "Remote QP link status = %x\n", val);
 
 	/* See if the remote side is up */
-	if (1 << qp->qp_num & val) {
-		qp->qp_link = NTB_LINK_UP;
-
+	if (val & BIT(qp->qp_num)) {
 		dev_info(&pdev->dev, "qp %d: Link Up\n", qp->qp_num);
+		qp->link_is_up = true;
+
 		if (qp->event_handler)
-			qp->event_handler(qp->cb_data, NTB_LINK_UP);
-	} else if (nt->transport_link == NTB_LINK_UP)
+			qp->event_handler(qp->cb_data, qp->link_is_up);
+	} else if (nt->link_is_up)
 		schedule_delayed_work(&qp->link_work,
 				      msecs_to_jiffies(NTB_LINK_DOWN_TIMEOUT));
 }
 
-static int ntb_transport_init_queue(struct ntb_transport *nt,
+static int ntb_transport_init_queue(struct ntb_transport_ctx *nt,
 				    unsigned int qp_num)
 {
 	struct ntb_transport_qp *qp;
+	struct ntb_transport_mw *mw;
+	phys_addr_t mw_base;
+	resource_size_t mw_size;
 	unsigned int num_qps_mw, tx_size;
-	u8 mw_num, mw_max;
+	unsigned int mw_num, mw_count, qp_count;
 	u64 qp_offset;
 
-	mw_max = ntb_max_mw(nt->ndev);
-	mw_num = QP_TO_MW(nt->ndev, qp_num);
+	mw_count = nt->mw_count;
+	qp_count = nt->qp_count;
 
-	qp = &nt->qps[qp_num];
+	mw_num = QP_TO_MW(nt, qp_num);
+	mw = &nt->mw_vec[mw_num];
+
+	qp = &nt->qp_vec[qp_num];
 	qp->qp_num = qp_num;
 	qp->transport = nt;
 	qp->ndev = nt->ndev;
-	qp->qp_link = NTB_LINK_DOWN;
-	qp->client_ready = NTB_LINK_DOWN;
+	qp->link_is_up = false;
+	qp->client_ready = false;
 	qp->event_handler = NULL;
 
-	if (nt->max_qps % mw_max && mw_num + 1 < nt->max_qps / mw_max)
-		num_qps_mw = nt->max_qps / mw_max + 1;
+	if (qp_count % mw_count && mw_num + 1 < qp_count / mw_count)
+		num_qps_mw = qp_count / mw_count + 1;
 	else
-		num_qps_mw = nt->max_qps / mw_max;
+		num_qps_mw = qp_count / mw_count;
+
+	mw_base = nt->mw_vec[mw_num].phys_addr;
+	mw_size = nt->mw_vec[mw_num].phys_size;
 
-	tx_size = (unsigned int) ntb_get_mw_size(qp->ndev, mw_num) / num_qps_mw;
-	qp_offset = qp_num / mw_max * tx_size;
-	qp->tx_mw = ntb_get_mw_vbase(nt->ndev, mw_num) + qp_offset;
+	tx_size = (unsigned int)mw_size / num_qps_mw;
+	qp_offset = tx_size * qp_num / mw_count;
+
+	qp->tx_mw = nt->mw_vec[mw_num].vbase + qp_offset;
 	if (!qp->tx_mw)
 		return -EINVAL;
 
-	qp->tx_mw_phys = ntb_get_mw_base(qp->ndev, mw_num) + qp_offset;
+	qp->tx_mw_phys = mw_base + qp_offset;
 	if (!qp->tx_mw_phys)
 		return -EINVAL;
 
@@ -891,16 +896,19 @@ static int ntb_transport_init_queue(struct ntb_transport *nt,
 	qp->tx_max_frame = min(transport_mtu, tx_size / 2);
 	qp->tx_max_entry = tx_size / qp->tx_max_frame;
 
-	if (ntb_query_debugfs(nt->ndev)) {
+	if (nt_debugfs_dir) {
 		char debugfs_name[4];
 
 		snprintf(debugfs_name, 4, "qp%d", qp_num);
 		qp->debugfs_dir = debugfs_create_dir(debugfs_name,
-						 ntb_query_debugfs(nt->ndev));
+						     nt_debugfs_dir);
 
 		qp->debugfs_stats = debugfs_create_file("stats", S_IRUSR,
 							qp->debugfs_dir, qp,
 							&ntb_qp_debugfs_stats);
+	} else {
+		qp->debugfs_dir = NULL;
+		qp->debugfs_stats = NULL;
 	}
 
 	INIT_DELAYED_WORK(&qp->link_work, ntb_qp_link_work);
@@ -914,46 +922,84 @@ static int ntb_transport_init_queue(struct ntb_transport *nt,
 	INIT_LIST_HEAD(&qp->rx_free_q);
 	INIT_LIST_HEAD(&qp->tx_free_q);
 
+	tasklet_init(&qp->rxc_db_work, ntb_transport_rxc_db,
+		     (unsigned long)qp);
+
 	return 0;
 }
 
-int ntb_transport_init(struct pci_dev *pdev)
+static int ntb_transport_probe(struct ntb_client *self, struct ntb_dev *ndev)
 {
-	struct ntb_transport *nt;
+	struct ntb_transport_ctx *nt;
+	struct ntb_transport_mw *mw;
+	unsigned int mw_count, qp_count;
+	u64 qp_bitmap;
 	int rc, i;
 
-	nt = kzalloc(sizeof(struct ntb_transport), GFP_KERNEL);
+	if (ntb_db_is_unsafe(ndev))
+		dev_dbg(&ndev->dev,
+			"doorbell is unsafe, proceed anyway...\n");
+	if (ntb_spad_is_unsafe(ndev))
+		dev_dbg(&ndev->dev,
+			"scratchpad is unsafe, proceed anyway...\n");
+
+	nt = kzalloc(sizeof(*nt), GFP_KERNEL);
 	if (!nt)
 		return -ENOMEM;
 
-	nt->ndev = ntb_register_transport(pdev, nt);
-	if (!nt->ndev) {
-		rc = -EIO;
+	nt->ndev = ndev;
+
+	mw_count = ntb_mw_count(ndev);
+
+	nt->mw_count = mw_count;
+
+	nt->mw_vec = kcalloc(mw_count, sizeof(*nt->mw_vec), GFP_KERNEL);
+	if (!nt->mw_vec) {
+		rc = -ENOMEM;
 		goto err;
 	}
 
-	nt->mw = kcalloc(ntb_max_mw(nt->ndev), sizeof(struct ntb_transport_mw),
-			 GFP_KERNEL);
-	if (!nt->mw) {
-		rc = -ENOMEM;
-		goto err1;
+	for (i = 0; i < mw_count; i++) {
+		mw = &nt->mw_vec[i];
+
+		rc = ntb_mw_get_range(ndev, i, &mw->phys_addr, &mw->phys_size,
+				      &mw->xlat_align, &mw->xlat_align_size);
+		if (rc)
+			goto err1;
+
+		mw->vbase = ioremap(mw->phys_addr, mw->phys_size);
+		if (!mw->vbase) {
+			rc = -ENOMEM;
+			goto err1;
+		}
+
+		mw->buff_size = 0;
+		mw->xlat_size = 0;
+		mw->virt_addr = NULL;
+		mw->dma_addr = 0;
 	}
 
-	if (max_num_clients)
-		nt->max_qps = min(ntb_max_cbs(nt->ndev), max_num_clients);
-	else
-		nt->max_qps = min(ntb_max_cbs(nt->ndev), ntb_max_mw(nt->ndev));
+	qp_bitmap = ntb_db_valid_mask(ndev);
+
+	qp_count = ilog2(qp_bitmap);
+	if (max_num_clients && max_num_clients < qp_count)
+		qp_count = max_num_clients;
+	else if (mw_count < qp_count)
+		qp_count = mw_count;
+
+	qp_bitmap &= BIT_ULL(qp_count) - 1;
+
+	nt->qp_count = qp_count;
+	nt->qp_bitmap = qp_bitmap;
+	nt->qp_bitmap_free = qp_bitmap;
 
-	nt->qps = kcalloc(nt->max_qps, sizeof(struct ntb_transport_qp),
-			  GFP_KERNEL);
-	if (!nt->qps) {
+	nt->qp_vec = kcalloc(qp_count, sizeof(*nt->qp_vec), GFP_KERNEL);
+	if (!nt->qp_vec) {
 		rc = -ENOMEM;
 		goto err2;
 	}
 
-	nt->qp_bitmap = ((u64) 1 << nt->max_qps) - 1;
-
-	for (i = 0; i < nt->max_qps; i++) {
+	for (i = 0; i < qp_count; i++) {
 		rc = ntb_transport_init_queue(nt, i);
 		if (rc)
 			goto err3;
@@ -962,8 +1008,7 @@ int ntb_transport_init(struct pci_dev *pdev)
 	INIT_DELAYED_WORK(&nt->link_work, ntb_transport_link_work);
 	INIT_WORK(&nt->link_cleanup, ntb_transport_link_cleanup_work);
 
-	rc = ntb_register_event_callback(nt->ndev,
-					 ntb_transport_event_callback);
+	rc = ntb_set_ctx(ndev, nt, &ntb_transport_ops);
 	if (rc)
 		goto err3;
 
@@ -972,51 +1017,61 @@ int ntb_transport_init(struct pci_dev *pdev)
 	if (rc)
 		goto err4;
 
-	if (ntb_hw_link_status(nt->ndev))
-		schedule_delayed_work(&nt->link_work, 0);
+	nt->link_is_up = false;
+	ntb_link_enable(ndev, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
+	ntb_link_event(ndev);
 
 	return 0;
 
 err4:
-	ntb_unregister_event_callback(nt->ndev);
+	ntb_clear_ctx(ndev);
 err3:
-	kfree(nt->qps);
+	kfree(nt->qp_vec);
 err2:
-	kfree(nt->mw);
+	kfree(nt->mw_vec);
 err1:
-	ntb_unregister_transport(nt->ndev);
+	while (i--) {
+		mw = &nt->mw_vec[i];
+		iounmap(mw->vbase);
+	}
 err:
 	kfree(nt);
 	return rc;
 }
 
-void ntb_transport_free(void *transport)
+static void ntb_transport_free(struct ntb_client *self, struct ntb_dev *ndev)
 {
-	struct ntb_transport *nt = transport;
-	struct ntb_device *ndev = nt->ndev;
+	struct ntb_transport_ctx *nt = ndev->ctx;
+	struct ntb_transport_qp *qp;
+	u64 qp_bitmap_alloc;
 	int i;
 
 	ntb_transport_link_cleanup(nt);
+	cancel_work_sync(&nt->link_cleanup);
+	cancel_delayed_work_sync(&nt->link_work);
+
+	qp_bitmap_alloc = nt->qp_bitmap & ~nt->qp_bitmap_free;
 
 	/* verify that all the qp's are freed */
-	for (i = 0; i < nt->max_qps; i++) {
-		if (!test_bit(i, &nt->qp_bitmap))
-			ntb_transport_free_queue(&nt->qps[i]);
-		debugfs_remove_recursive(nt->qps[i].debugfs_dir);
+	for (i = 0; i < nt->qp_count; i++) {
+		qp = &nt->qp_vec[i];
+		if (qp_bitmap_alloc & BIT_ULL(i))
+			ntb_transport_free_queue(qp);
+		debugfs_remove_recursive(qp->debugfs_dir);
 	}
 
-	ntb_bus_remove(nt);
+	ntb_link_disable(ndev);
+	ntb_clear_ctx(ndev);
 
-	cancel_delayed_work_sync(&nt->link_work);
-
-	ntb_unregister_event_callback(ndev);
+	ntb_bus_remove(nt);
 
-	for (i = 0; i < ntb_max_mw(ndev); i++)
+	for (i = nt->mw_count; i--; ) {
 		ntb_free_mw(nt, i);
+		iounmap(nt->mw_vec[i].vbase);
+	}
 
-	kfree(nt->qps);
-	kfree(nt->mw);
-	ntb_unregister_transport(ndev);
+	kfree(nt->qp_vec);
+	kfree(nt->mw_vec);
 	kfree(nt);
 }
 
@@ -1028,15 +1083,13 @@ static void ntb_rx_copy_callback(void *data)
 	unsigned int len = entry->len;
 	struct ntb_payload_header *hdr = entry->rx_hdr;
 
-	/* Ensure that the data is fully copied out before clearing the flag */
-	wmb();
 	hdr->flags = 0;
 
 	iowrite32(entry->index, &qp->rx_info->entry);
 
 	ntb_list_add(&qp->ntb_rx_free_q_lock, &entry->entry, &qp->rx_free_q);
 
-	if (qp->rx_handler && qp->client_ready == NTB_LINK_UP)
+	if (qp->rx_handler && qp->client_ready)
 		qp->rx_handler(qp, qp->cb_data, cb_data, len);
 }
 
@@ -1047,6 +1100,9 @@ static void ntb_memcpy_rx(struct ntb_queue_entry *entry, void *offset)
 
 	memcpy(buf, offset, len);
 
+	/* Ensure that the data is fully copied out before clearing the flag */
+	wmb();
+
 	ntb_rx_copy_callback(entry);
 }
 
@@ -1071,8 +1127,8 @@ static void ntb_async_rx(struct ntb_queue_entry *entry, void *offset,
 		goto err_wait;
 
 	device = chan->device;
-	pay_off = (size_t) offset & ~PAGE_MASK;
-	buff_off = (size_t) buf & ~PAGE_MASK;
+	pay_off = (size_t)offset & ~PAGE_MASK;
+	buff_off = (size_t)buf & ~PAGE_MASK;
 
 	if (!is_dma_copy_aligned(device, pay_off, buff_off, len))
 		goto err_wait;
@@ -1138,86 +1194,104 @@ static int ntb_process_rxc(struct ntb_transport_qp *qp)
 	struct ntb_payload_header *hdr;
 	struct ntb_queue_entry *entry;
 	void *offset;
+	int rc;
 
 	offset = qp->rx_buff + qp->rx_max_frame * qp->rx_index;
 	hdr = offset + qp->rx_max_frame - sizeof(struct ntb_payload_header);
 
-	entry = ntb_list_rm(&qp->ntb_rx_pend_q_lock, &qp->rx_pend_q);
-	if (!entry) {
-		dev_dbg(&ntb_query_pdev(qp->ndev)->dev,
-			"no buffer - HDR ver %u, len %d, flags %x\n",
-			hdr->ver, hdr->len, hdr->flags);
-		qp->rx_err_no_buf++;
-		return -ENOMEM;
-	}
+	dev_dbg(&qp->ndev->pdev->dev, "qp %d: RX ver %u len %d flags %x\n",
+		qp->qp_num, hdr->ver, hdr->len, hdr->flags);
 
 	if (!(hdr->flags & DESC_DONE_FLAG)) {
-		ntb_list_add(&qp->ntb_rx_pend_q_lock, &entry->entry,
-			     &qp->rx_pend_q);
+		dev_dbg(&qp->ndev->pdev->dev, "done flag not set\n");
 		qp->rx_ring_empty++;
 		return -EAGAIN;
 	}
 
-	if (hdr->ver != (u32) qp->rx_pkts) {
-		dev_dbg(&ntb_query_pdev(qp->ndev)->dev,
-			"qp %d: version mismatch, expected %llu - got %u\n",
-			qp->qp_num, qp->rx_pkts, hdr->ver);
-		ntb_list_add(&qp->ntb_rx_pend_q_lock, &entry->entry,
-			     &qp->rx_pend_q);
+	if (hdr->flags & LINK_DOWN_FLAG) {
+		dev_dbg(&qp->ndev->pdev->dev, "link down flag set\n");
+		ntb_qp_link_down(qp);
+		hdr->flags = 0;
+		iowrite32(qp->rx_index, &qp->rx_info->entry);
+		return 0;
+	}
+
+	if (hdr->ver != (u32)qp->rx_pkts) {
+		dev_dbg(&qp->ndev->pdev->dev,
+			"version mismatch, expected %llu - got %u\n",
+			qp->rx_pkts, hdr->ver);
 		qp->rx_err_ver++;
 		return -EIO;
 	}
 
-	if (hdr->flags & LINK_DOWN_FLAG) {
-		ntb_qp_link_down(qp);
+	entry = ntb_list_rm(&qp->ntb_rx_pend_q_lock, &qp->rx_pend_q);
+	if (!entry) {
+		dev_dbg(&qp->ndev->pdev->dev, "no receive buffer\n");
+		qp->rx_err_no_buf++;
 
+		rc = -ENOMEM;
 		goto err;
 	}
 
-	dev_dbg(&ntb_query_pdev(qp->ndev)->dev,
-		"rx offset %u, ver %u - %d payload received, buf size %d\n",
-		qp->rx_index, hdr->ver, hdr->len, entry->len);
-
-	qp->rx_bytes += hdr->len;
-	qp->rx_pkts++;
-
 	if (hdr->len > entry->len) {
-		qp->rx_err_oflow++;
-		dev_dbg(&ntb_query_pdev(qp->ndev)->dev,
-			"RX overflow! Wanted %d got %d\n",
+		dev_dbg(&qp->ndev->pdev->dev,
+			"receive buffer overflow! Wanted %d got %d\n",
 			hdr->len, entry->len);
+		qp->rx_err_oflow++;
 
+		rc = -EIO;
 		goto err;
 	}
 
+	dev_dbg(&qp->ndev->pdev->dev,
+		"RX OK index %u ver %u size %d into buf size %d\n",
+		qp->rx_index, hdr->ver, hdr->len, entry->len);
+
+	qp->rx_bytes += hdr->len;
+	qp->rx_pkts++;
+
 	entry->index = qp->rx_index;
 	entry->rx_hdr = hdr;
 
 	ntb_async_rx(entry, offset, hdr->len);
 
-out:
 	qp->rx_index++;
 	qp->rx_index %= qp->rx_max_entry;
 
 	return 0;
 
 err:
-	ntb_list_add(&qp->ntb_rx_pend_q_lock, &entry->entry, &qp->rx_pend_q);
-	/* Ensure that the data is fully copied out before clearing the flag */
-	wmb();
+	/* FIXME: if this syncrhonous update of the rx_index gets ahead of
+	 * asyncrhonous ntb_rx_copy_callback of previous entry, there are three
+	 * scenarios:
+	 *
+	 * 1) The peer might miss this update, but observe the update
+	 * from the memcpy completion callback.  In this case, the buffer will
+	 * not be freed on the peer to be reused for a different packet.  The
+	 * successful rx of a later packet would clear the condition, but the
+	 * condition could persist if several rx fail in a row.
+	 *
+	 * 2) The peer may observe this update before the asyncrhonous copy of
+	 * prior packets is completed.  The peer may overwrite the buffers of
+	 * the prior packets before they are copied.
+	 *
+	 * 3) Both: the peer may observe the update, and then observe the index
+	 * decrement by the asynchronous completion callback.  Who knows what
+	 * badness that will cause.
+	 */
 	hdr->flags = 0;
 	iowrite32(qp->rx_index, &qp->rx_info->entry);
 
-	goto out;
+	return rc;
 }
 
-static int ntb_transport_rxc_db(void *data, int db_num)
+static void ntb_transport_rxc_db(unsigned long data)
 {
-	struct ntb_transport_qp *qp = data;
+	struct ntb_transport_qp *qp = (void *)data;
 	int rc, i;
 
-	dev_dbg(&ntb_query_pdev(qp->ndev)->dev, "%s: doorbell %d received\n",
-		__func__, db_num);
+	dev_dbg(&qp->ndev->pdev->dev, "%s: doorbell %d received\n",
+		__func__, qp->qp_num);
 
 	/* Limit the number of packets processed in a single interrupt to
 	 * provide fairness to others
@@ -1231,7 +1305,21 @@ static int ntb_transport_rxc_db(void *data, int db_num)
 	if (qp->dma_chan)
 		dma_async_issue_pending(qp->dma_chan);
 
-	return i;
+	if (i == qp->rx_max_entry) {
+		/* there is more work to do */
+		tasklet_schedule(&qp->rxc_db_work);
+	} else if (ntb_db_read(qp->ndev) & BIT_ULL(qp->qp_num)) {
+		/* the doorbell bit is set: clear it */
+		ntb_db_clear(qp->ndev, BIT_ULL(qp->qp_num));
+		/* ntb_db_read ensures ntb_db_clear write is committed */
+		ntb_db_read(qp->ndev);
+
+		/* an interrupt may have arrived between finishing
+		 * ntb_process_rxc and clearing the doorbell bit:
+		 * there might be some more work to do.
+		 */
+		tasklet_schedule(&qp->rxc_db_work);
+	}
 }
 
 static void ntb_tx_copy_callback(void *data)
@@ -1240,11 +1328,9 @@ static void ntb_tx_copy_callback(void *data)
 	struct ntb_transport_qp *qp = entry->qp;
 	struct ntb_payload_header __iomem *hdr = entry->tx_hdr;
 
-	/* Ensure that the data is fully copied out before setting the flags */
-	wmb();
 	iowrite32(entry->flags | DESC_DONE_FLAG, &hdr->flags);
 
-	ntb_ring_doorbell(qp->ndev, qp->qp_num);
+	ntb_peer_db_set(qp->ndev, BIT_ULL(qp->qp_num));
 
 	/* The entry length can only be zero if the packet is intended to be a
 	 * "link down" or similar.  Since no payload is being sent in these
@@ -1265,6 +1351,9 @@ static void ntb_memcpy_tx(struct ntb_queue_entry *entry, void __iomem *offset)
 {
 	memcpy_toio(offset, entry->buf, entry->len);
 
+	/* Ensure that the data is fully copied out before setting the flags */
+	wmb();
+
 	ntb_tx_copy_callback(entry);
 }
 
@@ -1288,7 +1377,7 @@ static void ntb_async_tx(struct ntb_transport_qp *qp,
 	entry->tx_hdr = hdr;
 
 	iowrite32(entry->len, &hdr->len);
-	iowrite32((u32) qp->tx_pkts, &hdr->ver);
+	iowrite32((u32)qp->tx_pkts, &hdr->ver);
 
 	if (!chan)
 		goto err;
@@ -1298,8 +1387,8 @@ static void ntb_async_tx(struct ntb_transport_qp *qp,
 
 	device = chan->device;
 	dest = qp->tx_mw_phys + qp->tx_max_frame * qp->tx_index;
-	buff_off = (size_t) buf & ~PAGE_MASK;
-	dest_off = (size_t) dest & ~PAGE_MASK;
+	buff_off = (size_t)buf & ~PAGE_MASK;
+	dest_off = (size_t)dest & ~PAGE_MASK;
 
 	if (!is_dma_copy_aligned(device, buff_off, dest_off, len))
 		goto err;
@@ -1347,9 +1436,6 @@ err:
 static int ntb_process_tx(struct ntb_transport_qp *qp,
 			  struct ntb_queue_entry *entry)
 {
-	dev_dbg(&ntb_query_pdev(qp->ndev)->dev, "%lld - tx %u, entry len %d flags %x buff %p\n",
-		qp->tx_pkts, qp->tx_index, entry->len, entry->flags,
-		entry->buf);
 	if (qp->tx_index == qp->remote_rx_info->entry) {
 		qp->tx_ring_full++;
 		return -EAGAIN;
@@ -1376,14 +1462,14 @@ static int ntb_process_tx(struct ntb_transport_qp *qp,
 
 static void ntb_send_link_down(struct ntb_transport_qp *qp)
 {
-	struct pci_dev *pdev = ntb_query_pdev(qp->ndev);
+	struct pci_dev *pdev = qp->ndev->pdev;
 	struct ntb_queue_entry *entry;
 	int i, rc;
 
-	if (qp->qp_link == NTB_LINK_DOWN)
+	if (!qp->link_is_up)
 		return;
 
-	qp->qp_link = NTB_LINK_DOWN;
+	qp->link_is_up = false;
 	dev_info(&pdev->dev, "qp %d: Link Down\n", qp->qp_num);
 
 	for (i = 0; i < NTB_LINK_DOWN_TIMEOUT; i++) {
@@ -1422,18 +1508,21 @@ static void ntb_send_link_down(struct ntb_transport_qp *qp)
  * RETURNS: pointer to newly created ntb_queue, NULL on error.
  */
 struct ntb_transport_qp *
-ntb_transport_create_queue(void *data, struct pci_dev *pdev,
+ntb_transport_create_queue(void *data, struct device *client_dev,
 			   const struct ntb_queue_handlers *handlers)
 {
+	struct ntb_dev *ndev;
+	struct pci_dev *pdev;
+	struct ntb_transport_ctx *nt;
 	struct ntb_queue_entry *entry;
 	struct ntb_transport_qp *qp;
-	struct ntb_transport *nt;
+	u64 qp_bit;
 	unsigned int free_queue;
-	int rc, i;
+	int i;
 
-	nt = ntb_find_transport(pdev);
-	if (!nt)
-		goto err;
+	ndev = dev_ntb(client_dev->parent);
+	pdev = ndev->pdev;
+	nt = ndev->ctx;
 
 	free_queue = ffs(nt->qp_bitmap);
 	if (!free_queue)
@@ -1442,9 +1531,11 @@ ntb_transport_create_queue(void *data, struct pci_dev *pdev,
 	/* decrement free_queue to make it zero based */
 	free_queue--;
 
-	clear_bit(free_queue, &nt->qp_bitmap);
+	qp = &nt->qp_vec[free_queue];
+	qp_bit = BIT_ULL(qp->qp_num);
+
+	nt->qp_bitmap_free &= ~qp_bit;
 
-	qp = &nt->qps[free_queue];
 	qp->cb_data = data;
 	qp->rx_handler = handlers->rx_handler;
 	qp->tx_handler = handlers->tx_handler;
@@ -1458,7 +1549,7 @@ ntb_transport_create_queue(void *data, struct pci_dev *pdev,
 	}
 
 	for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) {
-		entry = kzalloc(sizeof(struct ntb_queue_entry), GFP_ATOMIC);
+		entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
 		if (!entry)
 			goto err1;
 
@@ -1468,7 +1559,7 @@ ntb_transport_create_queue(void *data, struct pci_dev *pdev,
 	}
 
 	for (i = 0; i < NTB_QP_DEF_NUM_ENTRIES; i++) {
-		entry = kzalloc(sizeof(struct ntb_queue_entry), GFP_ATOMIC);
+		entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
 		if (!entry)
 			goto err2;
 
@@ -1477,10 +1568,8 @@ ntb_transport_create_queue(void *data, struct pci_dev *pdev,
 			     &qp->tx_free_q);
 	}
 
-	rc = ntb_register_db_callback(qp->ndev, free_queue, qp,
-				      ntb_transport_rxc_db);
-	if (rc)
-		goto err2;
+	ntb_db_clear(qp->ndev, qp_bit);
+	ntb_db_clear_mask(qp->ndev, qp_bit);
 
 	dev_info(&pdev->dev, "NTB Transport QP %d created\n", qp->qp_num);
 
@@ -1494,7 +1583,7 @@ err1:
 		kfree(entry);
 	if (qp->dma_chan)
 		dmaengine_put();
-	set_bit(free_queue, &nt->qp_bitmap);
+	nt->qp_bitmap_free |= qp_bit;
 err:
 	return NULL;
 }
@@ -1508,13 +1597,15 @@ EXPORT_SYMBOL_GPL(ntb_transport_create_queue);
  */
 void ntb_transport_free_queue(struct ntb_transport_qp *qp)
 {
+	struct ntb_transport_ctx *nt = qp->transport;
 	struct pci_dev *pdev;
 	struct ntb_queue_entry *entry;
+	u64 qp_bit;
 
 	if (!qp)
 		return;
 
-	pdev = ntb_query_pdev(qp->ndev);
+	pdev = qp->ndev->pdev;
 
 	if (qp->dma_chan) {
 		struct dma_chan *chan = qp->dma_chan;
@@ -1531,10 +1622,18 @@ void ntb_transport_free_queue(struct ntb_transport_qp *qp)
 		dmaengine_put();
 	}
 
-	ntb_unregister_db_callback(qp->ndev, qp->qp_num);
+	qp_bit = BIT_ULL(qp->qp_num);
+
+	ntb_db_set_mask(qp->ndev, qp_bit);
+	tasklet_disable(&qp->rxc_db_work);
 
 	cancel_delayed_work_sync(&qp->link_work);
 
+	qp->cb_data = NULL;
+	qp->rx_handler = NULL;
+	qp->tx_handler = NULL;
+	qp->event_handler = NULL;
+
 	while ((entry = ntb_list_rm(&qp->ntb_rx_free_q_lock, &qp->rx_free_q)))
 		kfree(entry);
 
@@ -1546,7 +1645,7 @@ void ntb_transport_free_queue(struct ntb_transport_qp *qp)
 	while ((entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q)))
 		kfree(entry);
 
-	set_bit(qp->qp_num, &qp->transport->qp_bitmap);
+	nt->qp_bitmap_free |= qp_bit;
 
 	dev_info(&pdev->dev, "NTB Transport QP %d freed\n", qp->qp_num);
 }
@@ -1567,7 +1666,7 @@ void *ntb_transport_rx_remove(struct ntb_transport_qp *qp, unsigned int *len)
 	struct ntb_queue_entry *entry;
 	void *buf;
 
-	if (!qp || qp->client_ready == NTB_LINK_UP)
+	if (!qp || qp->client_ready)
 		return NULL;
 
 	entry = ntb_list_rm(&qp->ntb_rx_pend_q_lock, &qp->rx_pend_q);
@@ -1636,7 +1735,7 @@ int ntb_transport_tx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
 	struct ntb_queue_entry *entry;
 	int rc;
 
-	if (!qp || qp->qp_link != NTB_LINK_UP || !len)
+	if (!qp || !qp->link_is_up || !len)
 		return -EINVAL;
 
 	entry = ntb_list_rm(&qp->ntb_tx_free_q_lock, &qp->tx_free_q);
@@ -1670,9 +1769,9 @@ void ntb_transport_link_up(struct ntb_transport_qp *qp)
 	if (!qp)
 		return;
 
-	qp->client_ready = NTB_LINK_UP;
+	qp->client_ready = true;
 
-	if (qp->transport->transport_link == NTB_LINK_UP)
+	if (qp->transport->link_is_up)
 		schedule_delayed_work(&qp->link_work, 0);
 }
 EXPORT_SYMBOL_GPL(ntb_transport_link_up);
@@ -1688,27 +1787,20 @@ EXPORT_SYMBOL_GPL(ntb_transport_link_up);
 void ntb_transport_link_down(struct ntb_transport_qp *qp)
 {
 	struct pci_dev *pdev;
-	int rc, val;
+	int val;
 
 	if (!qp)
 		return;
 
-	pdev = ntb_query_pdev(qp->ndev);
-	qp->client_ready = NTB_LINK_DOWN;
+	pdev = qp->ndev->pdev;
+	qp->client_ready = false;
 
-	rc = ntb_read_local_spad(qp->ndev, QP_LINKS, &val);
-	if (rc) {
-		dev_err(&pdev->dev, "Error reading spad %d\n", QP_LINKS);
-		return;
-	}
+	val = ntb_spad_read(qp->ndev, QP_LINKS);
 
-	rc = ntb_write_remote_spad(qp->ndev, QP_LINKS,
-				   val & ~(1 << qp->qp_num));
-	if (rc)
-		dev_err(&pdev->dev, "Error writing %x to remote spad %d\n",
-			val & ~(1 << qp->qp_num), QP_LINKS);
+	ntb_peer_spad_write(qp->ndev, QP_LINKS,
+			    val & ~BIT(qp->qp_num));
 
-	if (qp->qp_link == NTB_LINK_UP)
+	if (qp->link_is_up)
 		ntb_send_link_down(qp);
 	else
 		cancel_delayed_work_sync(&qp->link_work);
@@ -1728,7 +1820,7 @@ bool ntb_transport_link_query(struct ntb_transport_qp *qp)
 	if (!qp)
 		return false;
 
-	return qp->qp_link == NTB_LINK_UP;
+	return qp->link_is_up;
 }
 EXPORT_SYMBOL_GPL(ntb_transport_link_query);
 
@@ -1774,3 +1866,69 @@ unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp)
 	return max;
 }
 EXPORT_SYMBOL_GPL(ntb_transport_max_size);
+
+static void ntb_transport_doorbell_callback(void *data, int vector)
+{
+	struct ntb_transport_ctx *nt = data;
+	struct ntb_transport_qp *qp;
+	u64 db_bits;
+	unsigned int qp_num;
+
+	db_bits = (nt->qp_bitmap & ~nt->qp_bitmap_free &
+		   ntb_db_vector_mask(nt->ndev, vector));
+
+	while (db_bits) {
+		qp_num = __ffs(db_bits);
+		qp = &nt->qp_vec[qp_num];
+
+		tasklet_schedule(&qp->rxc_db_work);
+
+		db_bits &= ~BIT_ULL(qp_num);
+	}
+}
+
+static const struct ntb_ctx_ops ntb_transport_ops = {
+	.link_event = ntb_transport_event_callback,
+	.db_event = ntb_transport_doorbell_callback,
+};
+
+static struct ntb_client ntb_transport_client = {
+	.ops = {
+		.probe = ntb_transport_probe,
+		.remove = ntb_transport_free,
+	},
+};
+
+static int __init ntb_transport_init(void)
+{
+	int rc;
+
+	if (debugfs_initialized())
+		nt_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL);
+
+	rc = bus_register(&ntb_transport_bus);
+	if (rc)
+		goto err_bus;
+
+	rc = ntb_register_client(&ntb_transport_client);
+	if (rc)
+		goto err_client;
+
+	return 0;
+
+err_client:
+	bus_unregister(&ntb_transport_bus);
+err_bus:
+	debugfs_remove_recursive(nt_debugfs_dir);
+	return rc;
+}
+module_init(ntb_transport_init);
+
+static void __exit ntb_transport_exit(void)
+{
+	debugfs_remove_recursive(nt_debugfs_dir);
+
+	ntb_unregister_client(&ntb_transport_client);
+	bus_unregister(&ntb_transport_bus);
+}
+module_exit(ntb_transport_exit);
diff --git a/include/linux/ntb_transport.h b/include/linux/ntb_transport.h
index f78b64cc09d5..2862861366a5 100644
--- a/include/linux/ntb_transport.h
+++ b/include/linux/ntb_transport.h
@@ -5,6 +5,7 @@
  *   GPL LICENSE SUMMARY
  *
  *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
  *
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of version 2 of the GNU General Public License as
@@ -13,6 +14,7 @@
  *   BSD LICENSE
  *
  *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *   Copyright (C) 2015 EMC Corporation. All Rights Reserved.
  *
  *   Redistribution and use in source and binary forms, with or without
  *   modification, are permitted provided that the following conditions
@@ -40,7 +42,7 @@
  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * Intel PCIe NTB Linux driver
+ * PCIe NTB Transport Linux driver
  *
  * Contact Information:
  * Jon Mason <jon.mason@intel.com>
@@ -48,21 +50,16 @@
 
 struct ntb_transport_qp;
 
-struct ntb_client {
+struct ntb_transport_client {
 	struct device_driver driver;
-	int (*probe)(struct pci_dev *pdev);
-	void (*remove)(struct pci_dev *pdev);
+	int (*probe)(struct device *client_dev);
+	void (*remove)(struct device *client_dev);
 };
 
-enum {
-	NTB_LINK_DOWN = 0,
-	NTB_LINK_UP,
-};
-
-int ntb_transport_register_client(struct ntb_client *drvr);
-void ntb_transport_unregister_client(struct ntb_client *drvr);
-int ntb_register_client_dev(char *device_name);
-void ntb_unregister_client_dev(char *device_name);
+int ntb_transport_register_client(struct ntb_transport_client *drvr);
+void ntb_transport_unregister_client(struct ntb_transport_client *drvr);
+int ntb_transport_register_client_dev(char *device_name);
+void ntb_transport_unregister_client_dev(char *device_name);
 
 struct ntb_queue_handlers {
 	void (*rx_handler)(struct ntb_transport_qp *qp, void *qp_data,
@@ -75,7 +72,7 @@ struct ntb_queue_handlers {
 unsigned char ntb_transport_qp_num(struct ntb_transport_qp *qp);
 unsigned int ntb_transport_max_size(struct ntb_transport_qp *qp);
 struct ntb_transport_qp *
-ntb_transport_create_queue(void *data, struct pci_dev *pdev,
+ntb_transport_create_queue(void *data, struct device *client_dev,
 			   const struct ntb_queue_handlers *handlers);
 void ntb_transport_free_queue(struct ntb_transport_qp *qp);
 int ntb_transport_rx_enqueue(struct ntb_transport_qp *qp, void *cb, void *data,
-- 
cgit v1.2.3


From 0294112ee3135fbd15eaa70015af8283642dd970 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sat, 4 Jul 2015 03:09:03 +0200
Subject: ACPI / PNP: Reserve ACPI resources at the fs_initcall_sync stage

This effectively reverts the following three commits:

 7bc10388ccdd ACPI / resources: free memory on error in add_region_before()
 0f1b414d1907 ACPI / PNP: Avoid conflicting resource reservations
 b9a5e5e18fbf ACPI / init: Fix the ordering of acpi_reserve_resources()

(commit b9a5e5e18fbf introduced regressions some of which, but not
all, were addressed by commit 0f1b414d1907 and commit 7bc10388ccdd
was a fixup on top of the latter) and causes ACPI fixed hardware
resources to be reserved at the fs_initcall_sync stage of system
initialization.

The story is as follows.  First, a boot regression was reported due
to an apparent resource reservation ordering change after a commit
that shouldn't lead to such changes.  Investigation led to the
conclusion that the problem happened because acpi_reserve_resources()
was executed at the device_initcall() stage of system initialization
which wasn't strictly ordered with respect to driver initialization
(and with respect to the initialization of the pcieport driver in
particular), so a random change causing the device initcalls to be
run in a different order might break things.

The response to that was to attempt to run acpi_reserve_resources()
as soon as we knew that ACPI would be in use (commit b9a5e5e18fbf).
However, that turned out to be too early, because it caused resource
reservations made by the PNP system driver to fail on at least one
system and that failure was addressed by commit 0f1b414d1907.

That fix still turned out to be insufficient, though, because
calling acpi_reserve_resources() before the fs_initcall stage of
system initialization caused a boot regression to happen on the
eCAFE EC-800-H20G/S netbook.  That meant that we only could call
acpi_reserve_resources() at the fs_initcall initialization stage
or later, but then we might just as well call it after the PNP
initalization in which case commit 0f1b414d1907 wouldn't be
necessary any more.

For this reason, the changes made by commit 0f1b414d1907 are reverted
(along with a memory leak fixup on top of that commit), the changes
made by commit b9a5e5e18fbf that went too far are reverted too and
acpi_reserve_resources() is changed into fs_initcall_sync, which
will cause it to be executed after the PNP subsystem initialization
(which is an fs_initcall) and before device initcalls (including
the pcieport driver initialization) which should avoid the initial
issue.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=100581
Link: http://marc.info/?t=143092384600002&r=1&w=2
Link: https://bugzilla.kernel.org/show_bug.cgi?id=99831
Link: http://marc.info/?t=143389402600001&r=1&w=2
Fixes: b9a5e5e18fbf "ACPI / init: Fix the ordering of acpi_reserve_resources()"
Reported-by: Roland Dreier <roland@purestorage.com>
Cc: All applicable <stable@vger.kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/osl.c      |  12 +++-
 drivers/acpi/resource.c | 162 ------------------------------------------------
 drivers/pnp/system.c    |  35 +++--------
 include/linux/acpi.h    |  10 ---
 4 files changed, 18 insertions(+), 201 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index c262e4acd68d..3b8963f21b36 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -175,10 +175,14 @@ static void __init acpi_request_region (struct acpi_generic_address *gas,
 	if (!addr || !length)
 		return;
 
-	acpi_reserve_region(addr, length, gas->space_id, 0, desc);
+	/* Resources are never freed */
+	if (gas->space_id == ACPI_ADR_SPACE_SYSTEM_IO)
+		request_region(addr, length, desc);
+	else if (gas->space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY)
+		request_mem_region(addr, length, desc);
 }
 
-static void __init acpi_reserve_resources(void)
+static int __init acpi_reserve_resources(void)
 {
 	acpi_request_region(&acpi_gbl_FADT.xpm1a_event_block, acpi_gbl_FADT.pm1_event_length,
 		"ACPI PM1a_EVT_BLK");
@@ -207,7 +211,10 @@ static void __init acpi_reserve_resources(void)
 	if (!(acpi_gbl_FADT.gpe1_block_length & 0x1))
 		acpi_request_region(&acpi_gbl_FADT.xgpe1_block,
 			       acpi_gbl_FADT.gpe1_block_length, "ACPI GPE1_BLK");
+
+	return 0;
 }
+fs_initcall_sync(acpi_reserve_resources);
 
 void acpi_os_printf(const char *fmt, ...)
 {
@@ -1862,7 +1869,6 @@ acpi_status __init acpi_os_initialize(void)
 
 acpi_status __init acpi_os_initialize1(void)
 {
-	acpi_reserve_resources();
 	kacpid_wq = alloc_workqueue("kacpid", 0, 1);
 	kacpi_notify_wq = alloc_workqueue("kacpi_notify", 0, 1);
 	kacpi_hotplug_wq = alloc_ordered_workqueue("kacpi_hotplug", 0);
diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c
index 10561ce16ed1..8244f013f210 100644
--- a/drivers/acpi/resource.c
+++ b/drivers/acpi/resource.c
@@ -26,7 +26,6 @@
 #include <linux/device.h>
 #include <linux/export.h>
 #include <linux/ioport.h>
-#include <linux/list.h>
 #include <linux/slab.h>
 
 #ifdef CONFIG_X86
@@ -622,164 +621,3 @@ int acpi_dev_filter_resource_type(struct acpi_resource *ares,
 	return (type & types) ? 0 : 1;
 }
 EXPORT_SYMBOL_GPL(acpi_dev_filter_resource_type);
-
-struct reserved_region {
-	struct list_head node;
-	u64 start;
-	u64 end;
-};
-
-static LIST_HEAD(reserved_io_regions);
-static LIST_HEAD(reserved_mem_regions);
-
-static int request_range(u64 start, u64 end, u8 space_id, unsigned long flags,
-			 char *desc)
-{
-	unsigned int length = end - start + 1;
-	struct resource *res;
-
-	res = space_id == ACPI_ADR_SPACE_SYSTEM_IO ?
-		request_region(start, length, desc) :
-		request_mem_region(start, length, desc);
-	if (!res)
-		return -EIO;
-
-	res->flags &= ~flags;
-	return 0;
-}
-
-static int add_region_before(u64 start, u64 end, u8 space_id,
-			     unsigned long flags, char *desc,
-			     struct list_head *head)
-{
-	struct reserved_region *reg;
-	int error;
-
-	reg = kmalloc(sizeof(*reg), GFP_KERNEL);
-	if (!reg)
-		return -ENOMEM;
-
-	error = request_range(start, end, space_id, flags, desc);
-	if (error) {
-		kfree(reg);
-		return error;
-	}
-
-	reg->start = start;
-	reg->end = end;
-	list_add_tail(&reg->node, head);
-	return 0;
-}
-
-/**
- * acpi_reserve_region - Reserve an I/O or memory region as a system resource.
- * @start: Starting address of the region.
- * @length: Length of the region.
- * @space_id: Identifier of address space to reserve the region from.
- * @flags: Resource flags to clear for the region after requesting it.
- * @desc: Region description (for messages).
- *
- * Reserve an I/O or memory region as a system resource to prevent others from
- * using it.  If the new region overlaps with one of the regions (in the given
- * address space) already reserved by this routine, only the non-overlapping
- * parts of it will be reserved.
- *
- * Returned is either 0 (success) or a negative error code indicating a resource
- * reservation problem.  It is the code of the first encountered error, but the
- * routine doesn't abort until it has attempted to request all of the parts of
- * the new region that don't overlap with other regions reserved previously.
- *
- * The resources requested by this routine are never released.
- */
-int acpi_reserve_region(u64 start, unsigned int length, u8 space_id,
-			unsigned long flags, char *desc)
-{
-	struct list_head *regions;
-	struct reserved_region *reg;
-	u64 end = start + length - 1;
-	int ret = 0, error = 0;
-
-	if (space_id == ACPI_ADR_SPACE_SYSTEM_IO)
-		regions = &reserved_io_regions;
-	else if (space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY)
-		regions = &reserved_mem_regions;
-	else
-		return -EINVAL;
-
-	if (list_empty(regions))
-		return add_region_before(start, end, space_id, flags, desc, regions);
-
-	list_for_each_entry(reg, regions, node)
-		if (reg->start == end + 1) {
-			/* The new region can be prepended to this one. */
-			ret = request_range(start, end, space_id, flags, desc);
-			if (!ret)
-				reg->start = start;
-
-			return ret;
-		} else if (reg->start > end) {
-			/* No overlap.  Add the new region here and get out. */
-			return add_region_before(start, end, space_id, flags,
-						 desc, &reg->node);
-		} else if (reg->end == start - 1) {
-			goto combine;
-		} else if (reg->end >= start) {
-			goto overlap;
-		}
-
-	/* The new region goes after the last existing one. */
-	return add_region_before(start, end, space_id, flags, desc, regions);
-
- overlap:
-	/*
-	 * The new region overlaps an existing one.
-	 *
-	 * The head part of the new region immediately preceding the existing
-	 * overlapping one can be combined with it right away.
-	 */
-	if (reg->start > start) {
-		error = request_range(start, reg->start - 1, space_id, flags, desc);
-		if (error)
-			ret = error;
-		else
-			reg->start = start;
-	}
-
- combine:
-	/*
-	 * The new region is adjacent to an existing one.  If it extends beyond
-	 * that region all the way to the next one, it is possible to combine
-	 * all three of them.
-	 */
-	while (reg->end < end) {
-		struct reserved_region *next = NULL;
-		u64 a = reg->end + 1, b = end;
-
-		if (!list_is_last(&reg->node, regions)) {
-			next = list_next_entry(reg, node);
-			if (next->start <= end)
-				b = next->start - 1;
-		}
-		error = request_range(a, b, space_id, flags, desc);
-		if (!error) {
-			if (next && next->start == b + 1) {
-				reg->end = next->end;
-				list_del(&next->node);
-				kfree(next);
-			} else {
-				reg->end = end;
-				break;
-			}
-		} else if (next) {
-			if (!ret)
-				ret = error;
-
-			reg = next;
-		} else {
-			break;
-		}
-	}
-
-	return ret ? ret : error;
-}
-EXPORT_SYMBOL_GPL(acpi_reserve_region);
diff --git a/drivers/pnp/system.c b/drivers/pnp/system.c
index 515f33882ab8..49c1720df59a 100644
--- a/drivers/pnp/system.c
+++ b/drivers/pnp/system.c
@@ -7,7 +7,6 @@
  *	Bjorn Helgaas <bjorn.helgaas@hp.com>
  */
 
-#include <linux/acpi.h>
 #include <linux/pnp.h>
 #include <linux/device.h>
 #include <linux/init.h>
@@ -23,41 +22,25 @@ static const struct pnp_device_id pnp_dev_table[] = {
 	{"", 0}
 };
 
-#ifdef CONFIG_ACPI
-static bool __reserve_range(u64 start, unsigned int length, bool io, char *desc)
-{
-	u8 space_id = io ? ACPI_ADR_SPACE_SYSTEM_IO : ACPI_ADR_SPACE_SYSTEM_MEMORY;
-	return !acpi_reserve_region(start, length, space_id, IORESOURCE_BUSY, desc);
-}
-#else
-static bool __reserve_range(u64 start, unsigned int length, bool io, char *desc)
-{
-	struct resource *res;
-
-	res = io ? request_region(start, length, desc) :
-		request_mem_region(start, length, desc);
-	if (res) {
-		res->flags &= ~IORESOURCE_BUSY;
-		return true;
-	}
-	return false;
-}
-#endif
-
 static void reserve_range(struct pnp_dev *dev, struct resource *r, int port)
 {
 	char *regionid;
 	const char *pnpid = dev_name(&dev->dev);
 	resource_size_t start = r->start, end = r->end;
-	bool reserved;
+	struct resource *res;
 
 	regionid = kmalloc(16, GFP_KERNEL);
 	if (!regionid)
 		return;
 
 	snprintf(regionid, 16, "pnp %s", pnpid);
-	reserved = __reserve_range(start, end - start + 1, !!port, regionid);
-	if (!reserved)
+	if (port)
+		res = request_region(start, end - start + 1, regionid);
+	else
+		res = request_mem_region(start, end - start + 1, regionid);
+	if (res)
+		res->flags &= ~IORESOURCE_BUSY;
+	else
 		kfree(regionid);
 
 	/*
@@ -66,7 +49,7 @@ static void reserve_range(struct pnp_dev *dev, struct resource *r, int port)
 	 * have double reservations.
 	 */
 	dev_info(&dev->dev, "%pR %s reserved\n", r,
-		 reserved ? "has been" : "could not be");
+		 res ? "has been" : "could not be");
 }
 
 static void reserve_resources_of_dev(struct pnp_dev *dev)
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index c471dfc93b71..fedfd1bea3bf 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -309,9 +309,6 @@ int acpi_check_region(resource_size_t start, resource_size_t n,
 
 int acpi_resources_are_enforced(void);
 
-int acpi_reserve_region(u64 start, unsigned int length, u8 space_id,
-			unsigned long flags, char *desc);
-
 #ifdef CONFIG_HIBERNATION
 void __init acpi_no_s4_hw_signature(void);
 #endif
@@ -507,13 +504,6 @@ static inline int acpi_check_region(resource_size_t start, resource_size_t n,
 	return 0;
 }
 
-static inline int acpi_reserve_region(u64 start, unsigned int length,
-				      u8 space_id, unsigned long flags,
-				      char *desc)
-{
-	return -ENXIO;
-}
-
 struct acpi_table_header;
 static inline int acpi_table_parse(char *id,
 				int (*handler)(struct acpi_table_header *))
-- 
cgit v1.2.3


From 26095a01d359827eeccec5459c28ddd976183179 Mon Sep 17 00:00:00 2001
From: "Suthikulpanit, Suravee" <Suravee.Suthikulpanit@amd.com>
Date: Tue, 7 Jul 2015 01:55:20 +0200
Subject: ACPI / scan: Add support for ACPI _CLS device matching

Device drivers typically use ACPI _HIDs/_CIDs listed in struct device_driver
acpi_match_table to match devices. However, for generic drivers, we do not
want to list _HID for all supported devices. Also, certain classes of devices
do not have _CID (e.g. SATA, USB). Instead, we can leverage ACPI _CLS,
which specifies PCI-defined class code (i.e. base-class, subclass and
programming interface). This patch adds support for matching ACPI devices using
the _CLS method.

To support loadable module, current design uses _HID or _CID to match device's
modalias. With the new way of matching with _CLS this would requires modification
to the current ACPI modalias key to include _CLS. This patch appends PCI-defined
class-code to the existing ACPI modalias as following.

    acpi:<HID>:<CID1>:<CID2>:..:<CIDn>:<bbsspp>:
E.g:
    # cat /sys/devices/platform/AMDI0600:00/modalias
    acpi:AMDI0600:010601:

where bb is th base-class code, ss is te sub-class code, and pp is the
programming interface code

Since there would not be _HID/_CID in the ACPI matching table of the driver,
this patch adds a field to acpi_device_id to specify the matching _CLS.

    static const struct acpi_device_id ahci_acpi_match[] = {
        { ACPI_DEVICE_CLASS(PCI_CLASS_STORAGE_SATA_AHCI, 0xffffff) },
        {},
    };

In this case, the corresponded entry in modules.alias file would be:

    alias acpi*:010601:* ahci_platform

Acked-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Hanjun Guo <hanjun.guo@linaro.org>
Signed-off-by: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/scan.c               | 32 ++++++++++++++++++++++++++++++--
 include/linux/acpi.h              | 14 ++++++++++++++
 include/linux/mod_devicetable.h   |  2 ++
 scripts/mod/devicetable-offsets.c |  2 ++
 scripts/mod/file2alias.c          | 32 ++++++++++++++++++++++++++++++--
 5 files changed, 78 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 2649a068671d..ec256352f423 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1019,6 +1019,29 @@ static bool acpi_of_match_device(struct acpi_device *adev,
 	return false;
 }
 
+static bool __acpi_match_device_cls(const struct acpi_device_id *id,
+				    struct acpi_hardware_id *hwid)
+{
+	int i, msk, byte_shift;
+	char buf[3];
+
+	if (!id->cls)
+		return false;
+
+	/* Apply class-code bitmask, before checking each class-code byte */
+	for (i = 1; i <= 3; i++) {
+		byte_shift = 8 * (3 - i);
+		msk = (id->cls_msk >> byte_shift) & 0xFF;
+		if (!msk)
+			continue;
+
+		sprintf(buf, "%02x", (id->cls >> byte_shift) & msk);
+		if (strncmp(buf, &hwid->id[(i - 1) * 2], 2))
+			return false;
+	}
+	return true;
+}
+
 static const struct acpi_device_id *__acpi_match_device(
 	struct acpi_device *device,
 	const struct acpi_device_id *ids,
@@ -1036,9 +1059,12 @@ static const struct acpi_device_id *__acpi_match_device(
 
 	list_for_each_entry(hwid, &device->pnp.ids, list) {
 		/* First, check the ACPI/PNP IDs provided by the caller. */
-		for (id = ids; id->id[0]; id++)
-			if (!strcmp((char *) id->id, hwid->id))
+		for (id = ids; id->id[0] || id->cls; id++) {
+			if (id->id[0] && !strcmp((char *) id->id, hwid->id))
 				return id;
+			else if (id->cls && __acpi_match_device_cls(id, hwid))
+				return id;
+		}
 
 		/*
 		 * Next, check ACPI_DT_NAMESPACE_HID and try to match the
@@ -2101,6 +2127,8 @@ static void acpi_set_pnp_ids(acpi_handle handle, struct acpi_device_pnp *pnp,
 		if (info->valid & ACPI_VALID_UID)
 			pnp->unique_id = kstrdup(info->unique_id.string,
 							GFP_KERNEL);
+		if (info->valid & ACPI_VALID_CLS)
+			acpi_add_id(pnp, info->class_code.string);
 
 		kfree(info);
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index c471dfc93b71..bdfdb9f65c23 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -58,6 +58,19 @@ static inline acpi_handle acpi_device_handle(struct acpi_device *adev)
 	acpi_fwnode_handle(adev) : NULL)
 #define ACPI_HANDLE(dev)		acpi_device_handle(ACPI_COMPANION(dev))
 
+/**
+ * ACPI_DEVICE_CLASS - macro used to describe an ACPI device with
+ * the PCI-defined class-code information
+ *
+ * @_cls : the class, subclass, prog-if triple for this device
+ * @_msk : the class mask for this device
+ *
+ * This macro is used to create a struct acpi_device_id that matches a
+ * specific PCI class. The .id and .driver_data fields will be left
+ * initialized with the default value.
+ */
+#define ACPI_DEVICE_CLASS(_cls, _msk)	.cls = (_cls), .cls_msk = (_msk),
+
 static inline bool has_acpi_companion(struct device *dev)
 {
 	return is_acpi_node(dev->fwnode);
@@ -446,6 +459,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *);
 #define ACPI_COMPANION(dev)		(NULL)
 #define ACPI_COMPANION_SET(dev, adev)	do { } while (0)
 #define ACPI_HANDLE(dev)		(NULL)
+#define ACPI_DEVICE_CLASS(_cls, _msk)	.cls = (0), .cls_msk = (0),
 
 struct fwnode_handle;
 
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 8183d6640ca7..34f25b7bf642 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -189,6 +189,8 @@ struct css_device_id {
 struct acpi_device_id {
 	__u8 id[ACPI_ID_LEN];
 	kernel_ulong_t driver_data;
+	__u32 cls;
+	__u32 cls_msk;
 };
 
 #define PNP_ID_LEN	8
diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c
index eff7de1fc82e..e70fcd12eeeb 100644
--- a/scripts/mod/devicetable-offsets.c
+++ b/scripts/mod/devicetable-offsets.c
@@ -63,6 +63,8 @@ int main(void)
 
 	DEVID(acpi_device_id);
 	DEVID_FIELD(acpi_device_id, id);
+	DEVID_FIELD(acpi_device_id, cls);
+	DEVID_FIELD(acpi_device_id, cls_msk);
 
 	DEVID(pnp_device_id);
 	DEVID_FIELD(pnp_device_id, id);
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index 84c86f3cd6cd..5f2088209132 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -523,12 +523,40 @@ static int do_serio_entry(const char *filename,
 }
 ADD_TO_DEVTABLE("serio", serio_device_id, do_serio_entry);
 
-/* looks like: "acpi:ACPI0003 or acpi:PNP0C0B" or "acpi:LNXVIDEO" */
+/* looks like: "acpi:ACPI0003" or "acpi:PNP0C0B" or "acpi:LNXVIDEO" or
+ *             "acpi:bbsspp" (bb=base-class, ss=sub-class, pp=prog-if)
+ *
+ * NOTE: Each driver should use one of the following : _HID, _CIDs
+ *       or _CLS. Also, bb, ss, and pp can be substituted with ??
+ *       as don't care byte.
+ */
 static int do_acpi_entry(const char *filename,
 			void *symval, char *alias)
 {
 	DEF_FIELD_ADDR(symval, acpi_device_id, id);
-	sprintf(alias, "acpi*:%s:*", *id);
+	DEF_FIELD_ADDR(symval, acpi_device_id, cls);
+	DEF_FIELD_ADDR(symval, acpi_device_id, cls_msk);
+
+	if (id && strlen((const char *)*id))
+		sprintf(alias, "acpi*:%s:*", *id);
+	else if (cls) {
+		int i, byte_shift, cnt = 0;
+		unsigned int msk;
+
+		sprintf(&alias[cnt], "acpi*:");
+		cnt = 6;
+		for (i = 1; i <= 3; i++) {
+			byte_shift = 8 * (3-i);
+			msk = (*cls_msk >> byte_shift) & 0xFF;
+			if (msk)
+				sprintf(&alias[cnt], "%02x",
+					(*cls >> byte_shift) & 0xFF);
+			else
+				sprintf(&alias[cnt], "??");
+			cnt += 2;
+		}
+		sprintf(&alias[cnt], ":*");
+	}
 	return 1;
 }
 ADD_TO_DEVTABLE("acpi", acpi_device_id, do_acpi_entry);
-- 
cgit v1.2.3


From f32dd117051185da6e923b35491a44d7debeeea5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 7 Jul 2015 16:29:38 +0200
Subject: tick/broadcast: Make idle check independent from mode and config

Currently the broadcast busy check, which prevents the idle code from
going into deep idle, works only in one shot mode.

If NOHZ and HIGHRES are off (config or command line) there is no
sanity check at all, so under certain conditions cpus are allowed to
go into deep idle, where the local timer stops, and are not woken up
again because there is no broadcast timer installed or a hrtimer based
broadcast device is not evaluated.

Move tick_broadcast_oneshot_control() into the common code and provide
proper subfunctions for the various config combinations.

The common check in tick_broadcast_oneshot_control() is for the C3STOP
misfeature flag of the local clock event device. If its not set, idle
can proceed. If set, further checks are necessary.

Provide checks for the trivial cases:

 - If broadcast is disabled in the config, then return busy

 - If oneshot mode (NOHZ/HIGHES) is disabled in the config, return
   busy if the broadcast device is hrtimer based.

 - If oneshot mode is enabled in the config call the original
   tick_broadcast_oneshot_control() function. That function needs
   extra checks which will be implemented in seperate patches.

[ Split out from a larger combo patch ]

Reported-and-tested-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Suzuki Poulose <Suzuki.Poulose@arm.com>
Cc: Lorenzo Pieralisi <Lorenzo.Pieralisi@arm.com>
Cc: Catalin Marinas <Catalin.Marinas@arm.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1507070929360.3916@nanos
---
 include/linux/tick.h         |  4 ----
 kernel/time/tick-broadcast.c | 26 +++++++++++---------------
 kernel/time/tick-common.c    | 21 +++++++++++++++++++++
 kernel/time/tick-sched.h     | 10 ++++++++++
 4 files changed, 42 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 3741ba1a652c..6916dcb61857 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -67,11 +67,7 @@ extern void tick_broadcast_control(enum tick_broadcast_mode mode);
 static inline void tick_broadcast_control(enum tick_broadcast_mode mode) { }
 #endif /* BROADCAST */
 
-#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)
 extern int tick_broadcast_oneshot_control(enum tick_broadcast_state state);
-#else
-static inline int tick_broadcast_oneshot_control(enum tick_broadcast_state state) { return 0; }
-#endif
 
 static inline void tick_broadcast_enable(void)
 {
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 9877d0b0aefc..ef77b16ad5df 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -685,18 +685,7 @@ static void broadcast_shutdown_local(struct clock_event_device *bc,
 	clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
 }
 
-/**
- * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode
- * @state:	The target state (enter/exit)
- *
- * The system enters/leaves a state, where affected devices might stop
- * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
- *
- * Called with interrupts disabled, so clockevents_lock is not
- * required here because the local clock event device cannot go away
- * under us.
- */
-int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
 {
 	struct clock_event_device *bc, *dev;
 	struct tick_device *td;
@@ -717,9 +706,6 @@ int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
 	td = this_cpu_ptr(&tick_cpu_device);
 	dev = td->evtdev;
 
-	if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
-		return 0;
-
 	raw_spin_lock(&tick_broadcast_lock);
 	bc = tick_broadcast_device.evtdev;
 	cpu = smp_processor_id();
@@ -961,6 +947,16 @@ bool tick_broadcast_oneshot_available(void)
 	return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false;
 }
 
+#else
+int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+	struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
+	if (!bc || (bc->features & CLOCK_EVT_FEAT_HRTIMER))
+		return -EBUSY;
+
+	return 0;
+}
 #endif
 
 void __init tick_broadcast_init(void)
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 76446cb5dfe1..55e13efff1ab 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -343,6 +343,27 @@ out_bc:
 	tick_install_broadcast_device(newdev);
 }
 
+/**
+ * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode
+ * @state:	The target state (enter/exit)
+ *
+ * The system enters/leaves a state, where affected devices might stop
+ * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups.
+ *
+ * Called with interrupts disabled, so clockevents_lock is not
+ * required here because the local clock event device cannot go away
+ * under us.
+ */
+int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+
+	if (!(td->evtdev->features & CLOCK_EVT_FEAT_C3STOP))
+		return 0;
+
+	return __tick_broadcast_oneshot_control(state);
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Transfer the do_timer job away from a dying cpu.
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 42fdf4958bcc..a4a8d4e9baa1 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -71,4 +71,14 @@ extern void tick_cancel_sched_timer(int cpu);
 static inline void tick_cancel_sched_timer(int cpu) { }
 #endif
 
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+extern int __tick_broadcast_oneshot_control(enum tick_broadcast_state state);
+#else
+static inline int
+__tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+	return -EBUSY;
+}
+#endif
+
 #endif
-- 
cgit v1.2.3


From 37b64a42067a04a22468c4e52c12af00d72e462b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 7 Jul 2015 21:56:34 +0200
Subject: tick/broadcast: Unbreak CONFIG_GENERIC_CLOCKEVENTS=n build

Making tick_broadcast_oneshot_control() independent from
CONFIG_GENERIC_CLOCKEVENTS_BROADCAST broke the build for
CONFIG_GENERIC_CLOCKEVENTS=n because the function is not defined
there.

Provide a proper stub inline.

Fixes: f32dd1170511 'tick/broadcast: Make idle check independent from mode and config'
Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/tick.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 6916dcb61857..edbfc9a5293e 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -67,7 +67,14 @@ extern void tick_broadcast_control(enum tick_broadcast_mode mode);
 static inline void tick_broadcast_control(enum tick_broadcast_mode mode) { }
 #endif /* BROADCAST */
 
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
 extern int tick_broadcast_oneshot_control(enum tick_broadcast_state state);
+#else
+static inline int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
+{
+	return 0;
+}
+#endif
 
 static inline void tick_broadcast_enable(void)
 {
-- 
cgit v1.2.3


From a899418167264c7bac574b1a0f1b2c26c5b0995a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 5 Jul 2015 17:12:30 +0000
Subject: hotplug: Prevent alloc/free of irq descriptors during cpu up/down

When a cpu goes up some architectures (e.g. x86) have to walk the irq
space to set up the vector space for the cpu. While this needs extra
protection at the architecture level we can avoid a few race
conditions by preventing the concurrent allocation/free of irq
descriptors and the associated data.

When a cpu goes down it moves the interrupts which are targeted to
this cpu away by reassigning the affinities. While this happens
interrupts can be allocated and freed, which opens a can of race
conditions in the code which reassignes the affinities because
interrupt descriptors might be freed underneath.

Example:

CPU1				CPU2
cpu_up/down
 irq_desc = irq_to_desc(irq);
				remove_from_radix_tree(desc);
 raw_spin_lock(&desc->lock);
				free(desc);

We could protect the irq descriptors with RCU, but that would require
a full tree change of all accesses to interrupt descriptors. But
fortunately these kind of race conditions are rather limited to a few
things like cpu hotplug. The normal setup/teardown is very well
serialized. So the simpler and obvious solution is:

Prevent allocation and freeing of interrupt descriptors accross cpu
hotplug.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: xiao jin <jin.xiao@intel.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Link: http://lkml.kernel.org/r/20150705171102.063519515@linutronix.de
---
 include/linux/irqdesc.h |  7 ++++++-
 kernel/cpu.c            | 22 +++++++++++++++++++++-
 kernel/irq/internals.h  |  4 ----
 3 files changed, 27 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 624a668e61f1..fcea4e48e21f 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -87,7 +87,12 @@ struct irq_desc {
 	const char		*name;
 } ____cacheline_internodealigned_in_smp;
 
-#ifndef CONFIG_SPARSE_IRQ
+#ifdef CONFIG_SPARSE_IRQ
+extern void irq_lock_sparse(void);
+extern void irq_unlock_sparse(void);
+#else
+static inline void irq_lock_sparse(void) { }
+static inline void irq_unlock_sparse(void) { }
 extern struct irq_desc irq_desc[NR_IRQS];
 #endif
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9c9c9fab16cc..6a374544d495 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -21,6 +21,7 @@
 #include <linux/suspend.h>
 #include <linux/lockdep.h>
 #include <linux/tick.h>
+#include <linux/irq.h>
 #include <trace/events/power.h>
 
 #include "smpboot.h"
@@ -392,13 +393,19 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	smpboot_park_threads(cpu);
 
 	/*
-	 * So now all preempt/rcu users must observe !cpu_active().
+	 * Prevent irq alloc/free while the dying cpu reorganizes the
+	 * interrupt affinities.
 	 */
+	irq_lock_sparse();
 
+	/*
+	 * So now all preempt/rcu users must observe !cpu_active().
+	 */
 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
+		irq_unlock_sparse();
 		goto out_release;
 	}
 	BUG_ON(cpu_online(cpu));
@@ -415,6 +422,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
 	per_cpu(cpu_dead_idle, cpu) = false;
 
+	/* Interrupts are moved away from the dying cpu, reenable alloc/free */
+	irq_unlock_sparse();
+
 	hotplug_cpu__broadcast_tick_pull(cpu);
 	/* This actually kills the CPU. */
 	__cpu_die(cpu);
@@ -517,8 +527,18 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
 		goto out_notify;
 	}
 
+	/*
+	 * Some architectures have to walk the irq descriptors to
+	 * setup the vector space for the cpu which comes online.
+	 * Prevent irq alloc/free across the bringup.
+	 */
+	irq_lock_sparse();
+
 	/* Arch-specific enabling code. */
 	ret = __cpu_up(cpu, idle);
+
+	irq_unlock_sparse();
+
 	if (ret != 0)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 4834ee828c41..61008b8433ab 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -76,12 +76,8 @@ extern void unmask_threaded_irq(struct irq_desc *desc);
 
 #ifdef CONFIG_SPARSE_IRQ
 static inline void irq_mark_irq(unsigned int irq) { }
-extern void irq_lock_sparse(void);
-extern void irq_unlock_sparse(void);
 #else
 extern void irq_mark_irq(unsigned int irq);
-static inline void irq_lock_sparse(void) { }
-static inline void irq_unlock_sparse(void) { }
 #endif
 
 extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
-- 
cgit v1.2.3


From 1f6823faa8c563431a94e614d2b63ce16bb6f658 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 9 Jul 2015 10:51:46 +0200
Subject: time: Get rid of do_posix_clock_monotonic_gettime

All users gone. Remove it before we get another one.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/timekeeping.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 3aa72e648650..6e191e4e6ab6 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -145,7 +145,6 @@ static inline void getboottime(struct timespec *ts)
 }
 #endif
 
-#define do_posix_clock_monotonic_gettime(ts) ktime_get_ts(ts)
 #define ktime_get_real_ts64(ts)	getnstimeofday64(ts)
 
 /*
-- 
cgit v1.2.3


From 757856d2b9568a701df9ea6a4be68effbb9d6f44 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 25 Jun 2015 17:47:45 +0300
Subject: libceph: enable ceph in a non-default network namespace

Grab a reference on a network namespace of the 'rbd map' (in case of
rbd) or 'mount' (in case of ceph) process and use that to open sockets
instead of always using init_net and bailing if network namespace is
anything but init_net.  Be careful to not share struct ceph_client
instances between different namespaces and don't add any code in the
!CONFIG_NET_NS case.

This is based on a patch from Hong Zhiguo <zhiguohong@tencent.com>.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Sage Weil <sage@redhat.com>
---
 include/linux/ceph/messenger.h |  3 +++
 net/ceph/ceph_common.c         | 16 ++++++++++------
 net/ceph/messenger.c           | 10 +++++++++-
 3 files changed, 22 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index e15499422fdc..37753278987a 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -8,6 +8,7 @@
 #include <linux/radix-tree.h>
 #include <linux/uio.h>
 #include <linux/workqueue.h>
+#include <net/net_namespace.h>
 
 #include <linux/ceph/types.h>
 #include <linux/ceph/buffer.h>
@@ -56,6 +57,7 @@ struct ceph_messenger {
 	struct ceph_entity_addr my_enc_addr;
 
 	atomic_t stopping;
+	possible_net_t net;
 	bool nocrc;
 	bool tcp_nodelay;
 
@@ -267,6 +269,7 @@ extern void ceph_messenger_init(struct ceph_messenger *msgr,
 			u64 required_features,
 			bool nocrc,
 			bool tcp_nodelay);
+extern void ceph_messenger_fini(struct ceph_messenger *msgr);
 
 extern void ceph_con_init(struct ceph_connection *con, void *private,
 			const struct ceph_connection_operations *ops,
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index cb7db320dd27..f30329f72641 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -9,6 +9,7 @@
 #include <keys/ceph-type.h>
 #include <linux/module.h>
 #include <linux/mount.h>
+#include <linux/nsproxy.h>
 #include <linux/parser.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
@@ -16,8 +17,6 @@
 #include <linux/statfs.h>
 #include <linux/string.h>
 #include <linux/vmalloc.h>
-#include <linux/nsproxy.h>
-#include <net/net_namespace.h>
 
 
 #include <linux/ceph/ceph_features.h>
@@ -131,6 +130,13 @@ int ceph_compare_options(struct ceph_options *new_opt,
 	int i;
 	int ret;
 
+	/*
+	 * Don't bother comparing options if network namespaces don't
+	 * match.
+	 */
+	if (!net_eq(current->nsproxy->net_ns, read_pnet(&client->msgr.net)))
+		return -1;
+
 	ret = memcmp(opt1, opt2, ofs);
 	if (ret)
 		return ret;
@@ -335,9 +341,6 @@ ceph_parse_options(char *options, const char *dev_name,
 	int err = -ENOMEM;
 	substring_t argstr[MAX_OPT_ARGS];
 
-	if (current->nsproxy->net_ns != &init_net)
-		return ERR_PTR(-EINVAL);
-
 	opt = kzalloc(sizeof(*opt), GFP_KERNEL);
 	if (!opt)
 		return ERR_PTR(-ENOMEM);
@@ -608,6 +611,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
 fail_monc:
 	ceph_monc_stop(&client->monc);
 fail:
+	ceph_messenger_fini(&client->msgr);
 	kfree(client);
 	return ERR_PTR(err);
 }
@@ -621,8 +625,8 @@ void ceph_destroy_client(struct ceph_client *client)
 
 	/* unmount */
 	ceph_osdc_stop(&client->osdc);
-
 	ceph_monc_stop(&client->monc);
+	ceph_messenger_fini(&client->msgr);
 
 	ceph_debugfs_client_cleanup(client);
 
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 1679f47280e2..5c1f98ea6741 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -6,6 +6,7 @@
 #include <linux/inet.h>
 #include <linux/kthread.h>
 #include <linux/net.h>
+#include <linux/nsproxy.h>
 #include <linux/slab.h>
 #include <linux/socket.h>
 #include <linux/string.h>
@@ -479,7 +480,7 @@ static int ceph_tcp_connect(struct ceph_connection *con)
 	int ret;
 
 	BUG_ON(con->sock);
-	ret = sock_create_kern(&init_net, con->peer_addr.in_addr.ss_family,
+	ret = sock_create_kern(read_pnet(&con->msgr->net), paddr->ss_family,
 			       SOCK_STREAM, IPPROTO_TCP, &sock);
 	if (ret)
 		return ret;
@@ -2944,11 +2945,18 @@ void ceph_messenger_init(struct ceph_messenger *msgr,
 	msgr->tcp_nodelay = tcp_nodelay;
 
 	atomic_set(&msgr->stopping, 0);
+	write_pnet(&msgr->net, get_net(current->nsproxy->net_ns));
 
 	dout("%s %p\n", __func__, msgr);
 }
 EXPORT_SYMBOL(ceph_messenger_init);
 
+void ceph_messenger_fini(struct ceph_messenger *msgr)
+{
+	put_net(read_pnet(&msgr->net));
+}
+EXPORT_SYMBOL(ceph_messenger_fini);
+
 static void clear_standby(struct ceph_connection *con)
 {
 	/* come back from STANDBY? */
-- 
cgit v1.2.3


From 4a0e3e989d66bb7204b163d9cfaa7fa96d0f2023 Mon Sep 17 00:00:00 2001
From: Enrico Mioso <mrkiko.rs@gmail.com>
Date: Wed, 8 Jul 2015 13:05:57 +0200
Subject: cdc_ncm: Add support for moving NDP to end of NCM frame

NCM specs are not actually mandating a specific position in the frame for
the NDP (Network Datagram Pointer). However, some Huawei devices will
ignore our aggregates if it is not placed after the datagrams it points
to. Add support for doing just this, in a per-device configurable way.
While at it, update NCM subdrivers, disabling this functionality in all of
them, except in huawei_cdc_ncm where it is enabled instead.
We aren't making any distinction between different Huawei NCM devices,
based on what the vendor driver does. Standard NCM devices are left
unaffected: if they are compliant, they should be always usable, still
stay on the safe side.

This change has been tested and working with a Huawei E3131 device (which
works regardless of NDP position), a Huawei E3531 (also working both
ways) and an E3372 (which mandates NDP to be after indexed datagrams).

V1->V2:
- corrected wrong NDP acronym definition
- fixed possible NULL pointer dereference
- patch cleanup
V2->V3:
- Properly account for the NDP size when writing new packets to SKB

Signed-off-by: Enrico Mioso <mrkiko.rs@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/cdc_mbim.c       |  2 +-
 drivers/net/usb/cdc_ncm.c        | 61 ++++++++++++++++++++++++++++++++++++----
 drivers/net/usb/huawei_cdc_ncm.c |  7 +++--
 include/linux/usb/cdc_ncm.h      |  7 ++++-
 4 files changed, 67 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c
index e4b7a47a825c..efc18e05af0a 100644
--- a/drivers/net/usb/cdc_mbim.c
+++ b/drivers/net/usb/cdc_mbim.c
@@ -158,7 +158,7 @@ static int cdc_mbim_bind(struct usbnet *dev, struct usb_interface *intf)
 	if (!cdc_ncm_comm_intf_is_mbim(intf->cur_altsetting))
 		goto err;
 
-	ret = cdc_ncm_bind_common(dev, intf, data_altsetting);
+	ret = cdc_ncm_bind_common(dev, intf, data_altsetting, 0);
 	if (ret)
 		goto err;
 
diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
index 8067b8fbb0ee..1991e4a24657 100644
--- a/drivers/net/usb/cdc_ncm.c
+++ b/drivers/net/usb/cdc_ncm.c
@@ -684,10 +684,12 @@ static void cdc_ncm_free(struct cdc_ncm_ctx *ctx)
 		ctx->tx_curr_skb = NULL;
 	}
 
+	kfree(ctx->delayed_ndp16);
+
 	kfree(ctx);
 }
 
-int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting)
+int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting, int drvflags)
 {
 	const struct usb_cdc_union_desc *union_desc = NULL;
 	struct cdc_ncm_ctx *ctx;
@@ -855,6 +857,17 @@ advance:
 	/* finish setting up the device specific data */
 	cdc_ncm_setup(dev);
 
+	/* Device-specific flags */
+	ctx->drvflags = drvflags;
+
+	/* Allocate the delayed NDP if needed. */
+	if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END) {
+		ctx->delayed_ndp16 = kzalloc(ctx->max_ndp_size, GFP_KERNEL);
+		if (!ctx->delayed_ndp16)
+			goto error2;
+		dev_info(&intf->dev, "NDP will be placed at end of frame for this device.");
+	}
+
 	/* override ethtool_ops */
 	dev->net->ethtool_ops = &cdc_ncm_ethtool_ops;
 
@@ -954,8 +967,11 @@ static int cdc_ncm_bind(struct usbnet *dev, struct usb_interface *intf)
 	if (cdc_ncm_select_altsetting(intf) != CDC_NCM_COMM_ALTSETTING_NCM)
 		return -ENODEV;
 
-	/* The NCM data altsetting is fixed */
-	ret = cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM);
+	/* The NCM data altsetting is fixed, so we hard-coded it.
+	 * Additionally, generic NCM devices are assumed to accept arbitrarily
+	 * placed NDP.
+	 */
+	ret = cdc_ncm_bind_common(dev, intf, CDC_NCM_DATA_ALTSETTING_NCM, 0);
 
 	/*
 	 * We should get an event when network connection is "connected" or
@@ -986,6 +1002,14 @@ static struct usb_cdc_ncm_ndp16 *cdc_ncm_ndp(struct cdc_ncm_ctx *ctx, struct sk_
 	struct usb_cdc_ncm_nth16 *nth16 = (void *)skb->data;
 	size_t ndpoffset = le16_to_cpu(nth16->wNdpIndex);
 
+	/* If NDP should be moved to the end of the NCM package, we can't follow the
+	* NTH16 header as we would normally do. NDP isn't written to the SKB yet, and
+	* the wNdpIndex field in the header is actually not consistent with reality. It will be later.
+	*/
+	if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END)
+		if (ctx->delayed_ndp16->dwSignature == sign)
+			return ctx->delayed_ndp16;
+
 	/* follow the chain of NDPs, looking for a match */
 	while (ndpoffset) {
 		ndp16 = (struct usb_cdc_ncm_ndp16 *)(skb->data + ndpoffset);
@@ -995,7 +1019,8 @@ static struct usb_cdc_ncm_ndp16 *cdc_ncm_ndp(struct cdc_ncm_ctx *ctx, struct sk_
 	}
 
 	/* align new NDP */
-	cdc_ncm_align_tail(skb, ctx->tx_ndp_modulus, 0, ctx->tx_max);
+	if (!(ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END))
+		cdc_ncm_align_tail(skb, ctx->tx_ndp_modulus, 0, ctx->tx_max);
 
 	/* verify that there is room for the NDP and the datagram (reserve) */
 	if ((ctx->tx_max - skb->len - reserve) < ctx->max_ndp_size)
@@ -1008,7 +1033,11 @@ static struct usb_cdc_ncm_ndp16 *cdc_ncm_ndp(struct cdc_ncm_ctx *ctx, struct sk_
 		nth16->wNdpIndex = cpu_to_le16(skb->len);
 
 	/* push a new empty NDP */
-	ndp16 = (struct usb_cdc_ncm_ndp16 *)memset(skb_put(skb, ctx->max_ndp_size), 0, ctx->max_ndp_size);
+	if (!(ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END))
+		ndp16 = (struct usb_cdc_ncm_ndp16 *)memset(skb_put(skb, ctx->max_ndp_size), 0, ctx->max_ndp_size);
+	else
+		ndp16 = ctx->delayed_ndp16;
+
 	ndp16->dwSignature = sign;
 	ndp16->wLength = cpu_to_le16(sizeof(struct usb_cdc_ncm_ndp16) + sizeof(struct usb_cdc_ncm_dpe16));
 	return ndp16;
@@ -1023,6 +1052,15 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign)
 	struct sk_buff *skb_out;
 	u16 n = 0, index, ndplen;
 	u8 ready2send = 0;
+	u32 delayed_ndp_size;
+
+	/* When our NDP gets written in cdc_ncm_ndp(), then skb_out->len gets updated
+	 * accordingly. Otherwise, we should check here.
+	 */
+	if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END)
+		delayed_ndp_size = ctx->max_ndp_size;
+	else
+		delayed_ndp_size = 0;
 
 	/* if there is a remaining skb, it gets priority */
 	if (skb != NULL) {
@@ -1077,7 +1115,7 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign)
 		cdc_ncm_align_tail(skb_out,  ctx->tx_modulus, ctx->tx_remainder, ctx->tx_max);
 
 		/* check if we had enough room left for both NDP and frame */
-		if (!ndp16 || skb_out->len + skb->len > ctx->tx_max) {
+		if (!ndp16 || skb_out->len + skb->len + delayed_ndp_size > ctx->tx_max) {
 			if (n == 0) {
 				/* won't fit, MTU problem? */
 				dev_kfree_skb_any(skb);
@@ -1150,6 +1188,17 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign)
 		/* variables will be reset at next call */
 	}
 
+	/* If requested, put NDP at end of frame. */
+	if (ctx->drvflags & CDC_NCM_FLAG_NDP_TO_END) {
+		nth16 = (struct usb_cdc_ncm_nth16 *)skb_out->data;
+		cdc_ncm_align_tail(skb_out, ctx->tx_ndp_modulus, 0, ctx->tx_max);
+		nth16->wNdpIndex = cpu_to_le16(skb_out->len);
+		memcpy(skb_put(skb_out, ctx->max_ndp_size), ctx->delayed_ndp16, ctx->max_ndp_size);
+
+		/* Zero out delayed NDP - signature checking will naturally fail. */
+		ndp16 = memset(ctx->delayed_ndp16, 0, ctx->max_ndp_size);
+	}
+
 	/* If collected data size is less or equal ctx->min_tx_pkt
 	 * bytes, we send buffers as it is. If we get more data, it
 	 * would be more efficient for USB HS mobile device with DMA
diff --git a/drivers/net/usb/huawei_cdc_ncm.c b/drivers/net/usb/huawei_cdc_ncm.c
index 735f7dadb9a0..2680a65cd5e4 100644
--- a/drivers/net/usb/huawei_cdc_ncm.c
+++ b/drivers/net/usb/huawei_cdc_ncm.c
@@ -73,11 +73,14 @@ static int huawei_cdc_ncm_bind(struct usbnet *usbnet_dev,
 	struct usb_driver *subdriver = ERR_PTR(-ENODEV);
 	int ret = -ENODEV;
 	struct huawei_cdc_ncm_state *drvstate = (void *)&usbnet_dev->data;
+	int drvflags = 0;
 
 	/* altsetting should always be 1 for NCM devices - so we hard-coded
-	 * it here
+	 * it here. Some huawei devices will need the NDP part of the NCM package to
+	 * be at the end of the frame.
 	 */
-	ret = cdc_ncm_bind_common(usbnet_dev, intf, 1);
+	drvflags |= CDC_NCM_FLAG_NDP_TO_END;
+	ret = cdc_ncm_bind_common(usbnet_dev, intf, 1, drvflags);
 	if (ret)
 		goto err;
 
diff --git a/include/linux/usb/cdc_ncm.h b/include/linux/usb/cdc_ncm.h
index 7c9b484735c5..1f6526c76ee8 100644
--- a/include/linux/usb/cdc_ncm.h
+++ b/include/linux/usb/cdc_ncm.h
@@ -80,6 +80,9 @@
 #define CDC_NCM_TIMER_INTERVAL_MIN		5UL
 #define CDC_NCM_TIMER_INTERVAL_MAX		(U32_MAX / NSEC_PER_USEC)
 
+/* Driver flags */
+#define CDC_NCM_FLAG_NDP_TO_END	0x02		/* NDP is placed at end of frame */
+
 #define cdc_ncm_comm_intf_is_mbim(x)  ((x)->desc.bInterfaceSubClass == USB_CDC_SUBCLASS_MBIM && \
 				       (x)->desc.bInterfaceProtocol == USB_CDC_PROTO_NONE)
 #define cdc_ncm_data_intf_is_mbim(x)  ((x)->desc.bInterfaceProtocol == USB_CDC_MBIM_PROTO_NTB)
@@ -103,9 +106,11 @@ struct cdc_ncm_ctx {
 
 	spinlock_t mtx;
 	atomic_t stop;
+	int drvflags;
 
 	u32 timer_interval;
 	u32 max_ndp_size;
+	struct usb_cdc_ncm_ndp16 *delayed_ndp16;
 
 	u32 tx_timer_pending;
 	u32 tx_curr_frame_num;
@@ -133,7 +138,7 @@ struct cdc_ncm_ctx {
 };
 
 u8 cdc_ncm_select_altsetting(struct usb_interface *intf);
-int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting);
+int cdc_ncm_bind_common(struct usbnet *dev, struct usb_interface *intf, u8 data_altsetting, int drvflags);
 void cdc_ncm_unbind(struct usbnet *dev, struct usb_interface *intf);
 struct sk_buff *cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign);
 int cdc_ncm_rx_verify_nth16(struct cdc_ncm_ctx *ctx, struct sk_buff *skb_in);
-- 
cgit v1.2.3


From d3b58c47d330de8c29898fe9746f7530408f8a59 Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Fri, 26 Jun 2015 11:58:19 +0200
Subject: can: replace timestamp as unique skb attribute

Commit 514ac99c64b "can: fix multiple delivery of a single CAN frame for
overlapping CAN filters" requires the skb->tstamp to be set to check for
identical CAN skbs.

Without timestamping to be required by user space applications this timestamp
was not generated which lead to commit 36c01245eb8 "can: fix loss of CAN frames
in raw_rcv" - which forces the timestamp to be set in all CAN related skbuffs
by introducing several __net_timestamp() calls.

This forces e.g. out of tree drivers which are not using alloc_can{,fd}_skb()
to add __net_timestamp() after skbuff creation to prevent the frame loss fixed
in mainline Linux.

This patch removes the timestamp dependency and uses an atomic counter to
create an unique identifier together with the skbuff pointer.

Btw: the new skbcnt element introduced in struct can_skb_priv has to be
initialized with zero in out-of-tree drivers which are not using
alloc_can{,fd}_skb() too.

Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Cc: linux-stable <stable@vger.kernel.org>
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 drivers/net/can/dev.c   |  7 ++-----
 drivers/net/can/slcan.c |  2 +-
 drivers/net/can/vcan.c  |  3 ---
 include/linux/can/skb.h |  2 ++
 net/can/af_can.c        | 12 +++++++-----
 net/can/bcm.c           |  2 ++
 net/can/raw.c           |  7 ++++---
 7 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index e9b1810d319f..aede704605c6 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -440,9 +440,6 @@ unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx)
 		struct can_frame *cf = (struct can_frame *)skb->data;
 		u8 dlc = cf->can_dlc;
 
-		if (!(skb->tstamp.tv64))
-			__net_timestamp(skb);
-
 		netif_rx(priv->echo_skb[idx]);
 		priv->echo_skb[idx] = NULL;
 
@@ -578,7 +575,6 @@ struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf)
 	if (unlikely(!skb))
 		return NULL;
 
-	__net_timestamp(skb);
 	skb->protocol = htons(ETH_P_CAN);
 	skb->pkt_type = PACKET_BROADCAST;
 	skb->ip_summed = CHECKSUM_UNNECESSARY;
@@ -589,6 +585,7 @@ struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf)
 
 	can_skb_reserve(skb);
 	can_skb_prv(skb)->ifindex = dev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
 
 	*cf = (struct can_frame *)skb_put(skb, sizeof(struct can_frame));
 	memset(*cf, 0, sizeof(struct can_frame));
@@ -607,7 +604,6 @@ struct sk_buff *alloc_canfd_skb(struct net_device *dev,
 	if (unlikely(!skb))
 		return NULL;
 
-	__net_timestamp(skb);
 	skb->protocol = htons(ETH_P_CANFD);
 	skb->pkt_type = PACKET_BROADCAST;
 	skb->ip_summed = CHECKSUM_UNNECESSARY;
@@ -618,6 +614,7 @@ struct sk_buff *alloc_canfd_skb(struct net_device *dev,
 
 	can_skb_reserve(skb);
 	can_skb_prv(skb)->ifindex = dev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
 
 	*cfd = (struct canfd_frame *)skb_put(skb, sizeof(struct canfd_frame));
 	memset(*cfd, 0, sizeof(struct canfd_frame));
diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c
index f64f5290d6f8..a23a7af8eb9a 100644
--- a/drivers/net/can/slcan.c
+++ b/drivers/net/can/slcan.c
@@ -207,7 +207,6 @@ static void slc_bump(struct slcan *sl)
 	if (!skb)
 		return;
 
-	__net_timestamp(skb);
 	skb->dev = sl->dev;
 	skb->protocol = htons(ETH_P_CAN);
 	skb->pkt_type = PACKET_BROADCAST;
@@ -215,6 +214,7 @@ static void slc_bump(struct slcan *sl)
 
 	can_skb_reserve(skb);
 	can_skb_prv(skb)->ifindex = sl->dev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
 
 	memcpy(skb_put(skb, sizeof(struct can_frame)),
 	       &cf, sizeof(struct can_frame));
diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c
index 0ce868de855d..674f367087c5 100644
--- a/drivers/net/can/vcan.c
+++ b/drivers/net/can/vcan.c
@@ -78,9 +78,6 @@ static void vcan_rx(struct sk_buff *skb, struct net_device *dev)
 	skb->dev       = dev;
 	skb->ip_summed = CHECKSUM_UNNECESSARY;
 
-	if (!(skb->tstamp.tv64))
-		__net_timestamp(skb);
-
 	netif_rx_ni(skb);
 }
 
diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h
index b6a52a4b457a..51bb6532785c 100644
--- a/include/linux/can/skb.h
+++ b/include/linux/can/skb.h
@@ -27,10 +27,12 @@
 /**
  * struct can_skb_priv - private additional data inside CAN sk_buffs
  * @ifindex:	ifindex of the first interface the CAN frame appeared on
+ * @skbcnt:	atomic counter to have an unique id together with skb pointer
  * @cf:		align to the following CAN frame at skb->data
  */
 struct can_skb_priv {
 	int ifindex;
+	int skbcnt;
 	struct can_frame cf[0];
 };
 
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 7933e62a7318..166d436196c1 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -89,6 +89,8 @@ struct timer_list can_stattimer;   /* timer for statistics update */
 struct s_stats    can_stats;       /* packet statistics */
 struct s_pstats   can_pstats;      /* receive list statistics */
 
+static atomic_t skbcounter = ATOMIC_INIT(0);
+
 /*
  * af_can socket functions
  */
@@ -310,12 +312,8 @@ int can_send(struct sk_buff *skb, int loop)
 		return err;
 	}
 
-	if (newskb) {
-		if (!(newskb->tstamp.tv64))
-			__net_timestamp(newskb);
-
+	if (newskb)
 		netif_rx_ni(newskb);
-	}
 
 	/* update statistics */
 	can_stats.tx_frames++;
@@ -683,6 +681,10 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev)
 	can_stats.rx_frames++;
 	can_stats.rx_frames_delta++;
 
+	/* create non-zero unique skb identifier together with *skb */
+	while (!(can_skb_prv(skb)->skbcnt))
+		can_skb_prv(skb)->skbcnt = atomic_inc_return(&skbcounter);
+
 	rcu_read_lock();
 
 	/* deliver the packet to sockets listening on all devices */
diff --git a/net/can/bcm.c b/net/can/bcm.c
index b523453585be..a1ba6875c2a2 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -261,6 +261,7 @@ static void bcm_can_tx(struct bcm_op *op)
 
 	can_skb_reserve(skb);
 	can_skb_prv(skb)->ifindex = dev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
 
 	memcpy(skb_put(skb, CFSIZ), cf, CFSIZ);
 
@@ -1217,6 +1218,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk)
 	}
 
 	can_skb_prv(skb)->ifindex = dev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
 	skb->dev = dev;
 	can_skb_set_owner(skb, sk);
 	err = can_send(skb, 1); /* send with loopback */
diff --git a/net/can/raw.c b/net/can/raw.c
index 31b9748cbb4e..2e67b1423cd3 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -75,7 +75,7 @@ MODULE_ALIAS("can-proto-1");
  */
 
 struct uniqframe {
-	ktime_t tstamp;
+	int skbcnt;
 	const struct sk_buff *skb;
 	unsigned int join_rx_count;
 };
@@ -133,7 +133,7 @@ static void raw_rcv(struct sk_buff *oskb, void *data)
 
 	/* eliminate multiple filter matches for the same skb */
 	if (this_cpu_ptr(ro->uniq)->skb == oskb &&
-	    ktime_equal(this_cpu_ptr(ro->uniq)->tstamp, oskb->tstamp)) {
+	    this_cpu_ptr(ro->uniq)->skbcnt == can_skb_prv(oskb)->skbcnt) {
 		if (ro->join_filters) {
 			this_cpu_inc(ro->uniq->join_rx_count);
 			/* drop frame until all enabled filters matched */
@@ -144,7 +144,7 @@ static void raw_rcv(struct sk_buff *oskb, void *data)
 		}
 	} else {
 		this_cpu_ptr(ro->uniq)->skb = oskb;
-		this_cpu_ptr(ro->uniq)->tstamp = oskb->tstamp;
+		this_cpu_ptr(ro->uniq)->skbcnt = can_skb_prv(oskb)->skbcnt;
 		this_cpu_ptr(ro->uniq)->join_rx_count = 1;
 		/* drop first frame to check all enabled filters? */
 		if (ro->join_filters && ro->count > 1)
@@ -749,6 +749,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
 
 	can_skb_reserve(skb);
 	can_skb_prv(skb)->ifindex = dev->ifindex;
+	can_skb_prv(skb)->skbcnt = 0;
 
 	err = memcpy_from_msg(skb_put(skb, size), msg, size);
 	if (err < 0)
-- 
cgit v1.2.3