From ee41d11d53c8fc4968f0816504651541d606cf40 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Mon, 8 Dec 2014 19:17:55 +1100 Subject: cxl: Change contexts_lock to a mutex to fix sleep while atomic bug We had a known sleep while atomic bug if a CXL device was forcefully unbound while it was in use. This could occur as a result of EEH, or manually induced with something like this while the device was in use: echo 0000:01:00.0 > /sys/bus/pci/drivers/cxl-pci/unbind The issue was that in this code path we iterated over each context and forcefully detached it with the contexts_lock spin lock held, however the detach also needed to take the spu_mutex, and call schedule. This patch changes the contexts_lock to a mutex so that we are not in atomic context while doing the detach, thereby avoiding the sleep while atomic. Also delete the related TODO comment, which suggested an alternate solution which turned out to not be workable. Cc: stable@vger.kernel.org Signed-off-by: Ian Munsie Signed-off-by: Michael Ellerman --- drivers/misc/cxl/context.c | 15 ++++++++------- drivers/misc/cxl/cxl.h | 2 +- drivers/misc/cxl/native.c | 7 ------- drivers/misc/cxl/pci.c | 2 +- drivers/misc/cxl/sysfs.c | 10 +++++----- 5 files changed, 15 insertions(+), 21 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index cca472109135..4aa31a3fb448 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -82,12 +82,12 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master) * Allocating IDR! We better make sure everything's setup that * dereferences from it. */ + mutex_lock(&afu->contexts_lock); idr_preload(GFP_KERNEL); - spin_lock(&afu->contexts_lock); i = idr_alloc(&ctx->afu->contexts_idr, ctx, 0, ctx->afu->num_procs, GFP_NOWAIT); - spin_unlock(&afu->contexts_lock); idr_preload_end(); + mutex_unlock(&afu->contexts_lock); if (i < 0) return i; @@ -168,21 +168,22 @@ void cxl_context_detach_all(struct cxl_afu *afu) struct cxl_context *ctx; int tmp; - rcu_read_lock(); - idr_for_each_entry(&afu->contexts_idr, ctx, tmp) + mutex_lock(&afu->contexts_lock); + idr_for_each_entry(&afu->contexts_idr, ctx, tmp) { /* * Anything done in here needs to be setup before the IDR is * created and torn down after the IDR removed */ __detach_context(ctx); - rcu_read_unlock(); + } + mutex_unlock(&afu->contexts_lock); } void cxl_context_free(struct cxl_context *ctx) { - spin_lock(&ctx->afu->contexts_lock); + mutex_lock(&ctx->afu->contexts_lock); idr_remove(&ctx->afu->contexts_idr, ctx->pe); - spin_unlock(&ctx->afu->contexts_lock); + mutex_unlock(&ctx->afu->contexts_lock); synchronize_rcu(); free_page((u64)ctx->sstp); diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index b5b6bda44a00..7c05239359df 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -351,7 +351,7 @@ struct cxl_afu { struct device *chardev_s, *chardev_m, *chardev_d; struct idr contexts_idr; struct dentry *debugfs; - spinlock_t contexts_lock; + struct mutex contexts_lock; struct mutex spa_mutex; spinlock_t afu_cntl_lock; diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c index 9a5a442269a8..1001cf49af94 100644 --- a/drivers/misc/cxl/native.c +++ b/drivers/misc/cxl/native.c @@ -610,13 +610,6 @@ static inline int detach_process_native_dedicated(struct cxl_context *ctx) return 0; } -/* - * TODO: handle case when this is called inside a rcu_read_lock() which may - * happen when we unbind the driver (ie. cxl_context_detach_all()) . Terminate - * & remove use a mutex lock and schedule which will not good with lock held. - * May need to write do_process_element_cmd() that handles outstanding page - * faults synchronously. - */ static inline int detach_process_native_afu_directed(struct cxl_context *ctx) { if (!ctx->pe_inserted) diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 10c98ab7f46e..0f2cc9f8b4db 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -502,7 +502,7 @@ static struct cxl_afu *cxl_alloc_afu(struct cxl *adapter, int slice) afu->dev.release = cxl_release_afu; afu->slice = slice; idr_init(&afu->contexts_idr); - spin_lock_init(&afu->contexts_lock); + mutex_init(&afu->contexts_lock); spin_lock_init(&afu->afu_cntl_lock); mutex_init(&afu->spa_mutex); diff --git a/drivers/misc/cxl/sysfs.c b/drivers/misc/cxl/sysfs.c index ce7ec06d87d1..461bdbd5d483 100644 --- a/drivers/misc/cxl/sysfs.c +++ b/drivers/misc/cxl/sysfs.c @@ -121,7 +121,7 @@ static ssize_t reset_store_afu(struct device *device, int rc; /* Not safe to reset if it is currently in use */ - spin_lock(&afu->contexts_lock); + mutex_lock(&afu->contexts_lock); if (!idr_is_empty(&afu->contexts_idr)) { rc = -EBUSY; goto err; @@ -132,7 +132,7 @@ static ssize_t reset_store_afu(struct device *device, rc = count; err: - spin_unlock(&afu->contexts_lock); + mutex_unlock(&afu->contexts_lock); return rc; } @@ -247,7 +247,7 @@ static ssize_t mode_store(struct device *device, struct device_attribute *attr, int rc = -EBUSY; /* can't change this if we have a user */ - spin_lock(&afu->contexts_lock); + mutex_lock(&afu->contexts_lock); if (!idr_is_empty(&afu->contexts_idr)) goto err; @@ -271,7 +271,7 @@ static ssize_t mode_store(struct device *device, struct device_attribute *attr, afu->current_mode = 0; afu->num_procs = 0; - spin_unlock(&afu->contexts_lock); + mutex_unlock(&afu->contexts_lock); if ((rc = _cxl_afu_deactivate_mode(afu, old_mode))) return rc; @@ -280,7 +280,7 @@ static ssize_t mode_store(struct device *device, struct device_attribute *attr, return count; err: - spin_unlock(&afu->contexts_lock); + mutex_unlock(&afu->contexts_lock); return rc; } -- cgit v1.2.3 From a98e6e9f4e0224d85b4d951edc44af16dfe6094a Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Mon, 8 Dec 2014 19:17:56 +1100 Subject: cxl: Add timeout to process element commands In the event that something goes wrong in the hardware and it is unable to complete a process element comment we would end up polling forever, effectively making the associated process unkillable. This patch adds a timeout to the process element command code path, so that we will give up if the hardware does not respond in a reasonable time. Cc: stable@vger.kernel.org Signed-off-by: Ian Munsie Signed-off-by: Michael Ellerman --- drivers/misc/cxl/native.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/misc') diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c index 1001cf49af94..f2b37b41a0da 100644 --- a/drivers/misc/cxl/native.c +++ b/drivers/misc/cxl/native.c @@ -277,6 +277,7 @@ static int do_process_element_cmd(struct cxl_context *ctx, u64 cmd, u64 pe_state) { u64 state; + unsigned long timeout = jiffies + (HZ * CXL_TIMEOUT); WARN_ON(!ctx->afu->enabled); @@ -286,6 +287,10 @@ static int do_process_element_cmd(struct cxl_context *ctx, smp_mb(); cxl_p1n_write(ctx->afu, CXL_PSL_LLCMD_An, cmd | ctx->pe); while (1) { + if (time_after_eq(jiffies, timeout)) { + dev_warn(&ctx->afu->dev, "WARNING: Process Element Command timed out!\n"); + return -EBUSY; + } state = be64_to_cpup(ctx->afu->sw_command_status); if (state == ~0ULL) { pr_err("cxl: Error adding process element to AFU\n"); -- cgit v1.2.3 From b123429e6a9e8d03aacf888d23262835f0081448 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Mon, 8 Dec 2014 19:18:01 +1100 Subject: cxl: Unmap MMIO regions when detaching a context If we need to force detach a context (e.g. due to EEH or simply force unbinding the driver) we should prevent the userspace contexts from being able to access the Problem State Area MMIO region further, which they may have mapped with mmap(). This patch unmaps any mapped MMIO regions when detaching a userspace context. Cc: stable@vger.kernel.org Signed-off-by: Ian Munsie Signed-off-by: Michael Ellerman --- drivers/misc/cxl/context.c | 11 ++++++++++- drivers/misc/cxl/cxl.h | 7 ++++++- drivers/misc/cxl/file.c | 6 +++++- 3 files changed, 21 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index 4aa31a3fb448..51fd6b524371 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -34,7 +34,8 @@ struct cxl_context *cxl_context_alloc(void) /* * Initialises a CXL context. */ -int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master) +int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master, + struct address_space *mapping) { int i; @@ -42,6 +43,8 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master) ctx->afu = afu; ctx->master = master; ctx->pid = NULL; /* Set in start work ioctl */ + mutex_init(&ctx->mapping_lock); + ctx->mapping = mapping; /* * Allocate the segment table before we put it in the IDR so that we @@ -147,6 +150,12 @@ static void __detach_context(struct cxl_context *ctx) afu_release_irqs(ctx); flush_work(&ctx->fault_work); /* Only needed for dedicated process */ wake_up_all(&ctx->wq); + + /* Release Problem State Area mapping */ + mutex_lock(&ctx->mapping_lock); + if (ctx->mapping) + unmap_mapping_range(ctx->mapping, 0, 0, 1); + mutex_unlock(&ctx->mapping_lock); } /* diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index 7c05239359df..28078f8894a5 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -398,6 +398,10 @@ struct cxl_context { phys_addr_t psn_phys; u64 psn_size; + /* Used to unmap any mmaps when force detaching */ + struct address_space *mapping; + struct mutex mapping_lock; + spinlock_t sste_lock; /* Protects segment table entries */ struct cxl_sste *sstp; u64 sstp0, sstp1; @@ -599,7 +603,8 @@ int cxl_alloc_sst(struct cxl_context *ctx); void init_cxl_native(void); struct cxl_context *cxl_context_alloc(void); -int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master); +int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master, + struct address_space *mapping); void cxl_context_free(struct cxl_context *ctx); int cxl_context_iomap(struct cxl_context *ctx, struct vm_area_struct *vma); diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c index 378b099e7c0b..e9f2f10dbb37 100644 --- a/drivers/misc/cxl/file.c +++ b/drivers/misc/cxl/file.c @@ -77,7 +77,7 @@ static int __afu_open(struct inode *inode, struct file *file, bool master) goto err_put_afu; } - if ((rc = cxl_context_init(ctx, afu, master))) + if ((rc = cxl_context_init(ctx, afu, master, inode->i_mapping))) goto err_put_afu; pr_devel("afu_open pe: %i\n", ctx->pe); @@ -113,6 +113,10 @@ static int afu_release(struct inode *inode, struct file *file) __func__, ctx->pe); cxl_context_detach(ctx); + mutex_lock(&ctx->mapping_lock); + ctx->mapping = NULL; + mutex_unlock(&ctx->mapping_lock); + put_device(&ctx->afu->dev); /* -- cgit v1.2.3