From 866f4c8e0e26293b5819fd61c241502c79023775 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Tue, 22 May 2018 12:42:57 +0200 Subject: s390/net: add pnetid support s390 hardware supports the definition of a so-call Physical NETwork IDentifier (short PNETID) per network device port. These PNETIDS can be used to identify network devices that are attached to the same physical network (broadcast domain). This patch provides the interface to extract the PNETID of a port of a device attached to the ccw-bus or pci-bus. Parts of this patch are based on an initial implementation by Thomas Richter. Signed-off-by: Ursula Braun Signed-off-by: Martin Schwidefsky --- arch/s390/Kconfig | 3 ++ arch/s390/include/asm/pnet.h | 23 ++++++++++++++ arch/s390/net/Makefile | 1 + arch/s390/net/pnet.c | 76 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 103 insertions(+) create mode 100644 arch/s390/include/asm/pnet.h create mode 100644 arch/s390/net/pnet.c diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 199ac3e4da1d..33072e0bc589 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -59,6 +59,9 @@ config PCI_QUIRKS config ARCH_SUPPORTS_UPROBES def_bool y +config HAVE_PNETID + def_bool y if SMC + config S390 def_bool y select ARCH_BINFMT_ELF_STATE diff --git a/arch/s390/include/asm/pnet.h b/arch/s390/include/asm/pnet.h new file mode 100644 index 000000000000..6e278584f8f1 --- /dev/null +++ b/arch/s390/include/asm/pnet.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * IBM System z PNET ID Support + * + * Copyright IBM Corp. 2018 + */ + +#ifndef _ASM_S390_PNET_H +#define _ASM_S390_PNET_H + +#include +#include + +#define PNETIDS_LEN 64 /* Total utility string length in bytes + * to cover up to 4 PNETIDs of 16 bytes + * for up to 4 device ports + */ +#define MAX_PNETID_LEN 16 /* Max.length of a single port PNETID */ +#define MAX_PNETID_PORTS (PNETIDS_LEN / MAX_PNETID_LEN) + /* Max. # of ports with a PNETID */ + +int pnet_id_by_dev_port(struct device *dev, unsigned short port, u8 *pnetid); +#endif /* _ASM_S390_PNET_H */ diff --git a/arch/s390/net/Makefile b/arch/s390/net/Makefile index e0d5f245e42b..e2b85ffdbb0c 100644 --- a/arch/s390/net/Makefile +++ b/arch/s390/net/Makefile @@ -3,3 +3,4 @@ # Arch-specific network modules # obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o +obj-$(CONFIG_HAVE_PNETID) += pnet.o diff --git a/arch/s390/net/pnet.c b/arch/s390/net/pnet.c new file mode 100644 index 000000000000..ae958ba5337f --- /dev/null +++ b/arch/s390/net/pnet.c @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * IBM System z PNET ID Support + * + * Copyright IBM Corp. 2018 + */ + +#include +#include +#include +#include +#include +#include + +/* + * Get the PNETIDs from a device. + * s390 hardware supports the definition of a so-called Physical Network + * Identifier (short PNETID) per network device port. These PNETIDs can be + * used to identify network devices that are attached to the same physical + * network (broadcast domain). + * + * The device can be + * - a ccwgroup device with all bundled subchannels having the same PNETID + * - a PCI attached network device + * + * Returns: + * 0: PNETIDs extracted from device. + * -ENOMEM: No memory to extract utility string. + * -EOPNOTSUPP: Device type without utility string support + */ +static int pnet_ids_by_device(struct device *dev, u8 *pnetids) +{ + memset(pnetids, 0, PNETIDS_LEN); + if (dev_is_ccwgroup(dev)) { + struct ccwgroup_device *gdev = to_ccwgroupdev(dev); + u8 *util_str; + + util_str = ccw_device_get_util_str(gdev->cdev[0], 0); + if (!util_str) + return -ENOMEM; + memcpy(pnetids, util_str, PNETIDS_LEN); + kfree(util_str); + return 0; + } + if (dev_is_pci(dev)) { + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); + + memcpy(pnetids, zdev->util_str, sizeof(zdev->util_str)); + return 0; + } + return -EOPNOTSUPP; +} + +/* + * Extract the pnetid for a device port. + * + * Return 0 if a pnetid is found and -ENOENT otherwise. + */ +int pnet_id_by_dev_port(struct device *dev, unsigned short port, u8 *pnetid) +{ + u8 pnetids[MAX_PNETID_PORTS][MAX_PNETID_LEN]; + static const u8 zero[MAX_PNETID_LEN] = { 0 }; + int rc = 0; + + if (!dev || port >= MAX_PNETID_PORTS) + return -ENOENT; + + if (!pnet_ids_by_device(dev, (u8 *)pnetids) && + memcmp(pnetids[port], zero, MAX_PNETID_LEN)) + memcpy(pnetid, pnetids[port], MAX_PNETID_LEN); + else + rc = -ENOENT; + + return rc; +} +EXPORT_SYMBOL_GPL(pnet_id_by_dev_port); -- cgit v1.2.3 From 3376d98021e915196f4894d835325a884e635a04 Mon Sep 17 00:00:00 2001 From: Harald Freudenberger Date: Wed, 25 Apr 2018 11:43:17 +0200 Subject: s390/archrandom: Rework arch random implementation. The arch_get_random_seed_long() invocation done by the random device driver is done in interrupt context and may be invoked very very frequently. The existing s390 arch_get_random_seed*() implementation uses the PRNO(TRNG) instruction which produces excellent high quality entropy but is relatively slow and thus expensive. This fix reworks the arch_get_random_seed* implementation. It introduces a buffer concept to decouple the delivery of random data via arch_get_random_seed*() from the generation of new random bytes. The buffer of random data is filled asynchronously by a workqueue thread. If there are enough bytes in the buffer the s390_arch_random_generate() just delivers these bytes. Otherwise false is returned until the worker thread refills the buffer. The worker fills the rng buffer by pulling fresh entropy from the high quality (but slow) true hardware random generator. This entropy is then spread over the buffer with an pseudo random generator. As the arch_get_random_seed_long() fetches 8 bytes and the calling function add_interrupt_randomness() counts this as 1 bit entropy the distribution needs to make sure there is in fact 1 bit entropy contained in 8 bytes of the buffer. The current values pull 32 byte entropy and scatter this into a 2048 byte buffer. So 8 byte in the buffer will contain 1 bit of entropy. The worker thread is rescheduled based on the charge level of the buffer but at least with 500 ms delay to avoid too much cpu consumption. So the max. amount of rng data delivered via arch_get_random_seed is limited to 4Kb per second. Signed-off-by: Harald Freudenberger Reviewed-by: Patrick Steuer Signed-off-by: Martin Schwidefsky --- arch/s390/crypto/arch_random.c | 103 +++++++++++++++++++++++++++++++++++-- arch/s390/include/asm/archrandom.h | 13 ++--- 2 files changed, 102 insertions(+), 14 deletions(-) diff --git a/arch/s390/crypto/arch_random.c b/arch/s390/crypto/arch_random.c index 8720e9203ecf..dd95cdbd22ce 100644 --- a/arch/s390/crypto/arch_random.c +++ b/arch/s390/crypto/arch_random.c @@ -2,14 +2,37 @@ /* * s390 arch random implementation. * - * Copyright IBM Corp. 2017 - * Author(s): Harald Freudenberger + * Copyright IBM Corp. 2017, 2018 + * Author(s): Harald Freudenberger + * + * The s390_arch_random_generate() function may be called from random.c + * in interrupt context. So this implementation does the best to be very + * fast. There is a buffer of random data which is asynchronously checked + * and filled by a workqueue thread. + * If there are enough bytes in the buffer the s390_arch_random_generate() + * just delivers these bytes. Otherwise false is returned until the + * worker thread refills the buffer. + * The worker fills the rng buffer by pulling fresh entropy from the + * high quality (but slow) true hardware random generator. This entropy + * is then spread over the buffer with an pseudo random generator PRNG. + * As the arch_get_random_seed_long() fetches 8 bytes and the calling + * function add_interrupt_randomness() counts this as 1 bit entropy the + * distribution needs to make sure there is in fact 1 bit entropy contained + * in 8 bytes of the buffer. The current values pull 32 byte entropy + * and scatter this into a 2048 byte buffer. So 8 byte in the buffer + * will contain 1 bit of entropy. + * The worker thread is rescheduled based on the charge level of the + * buffer but at least with 500 ms delay to avoid too much CPU consumption. + * So the max. amount of rng data delivered via arch_get_random_seed is + * limited to 4k bytes per second. */ #include #include #include +#include #include +#include #include DEFINE_STATIC_KEY_FALSE(s390_arch_random_available); @@ -17,11 +40,83 @@ DEFINE_STATIC_KEY_FALSE(s390_arch_random_available); atomic64_t s390_arch_random_counter = ATOMIC64_INIT(0); EXPORT_SYMBOL(s390_arch_random_counter); +#define ARCH_REFILL_TICKS (HZ/2) +#define ARCH_PRNG_SEED_SIZE 32 +#define ARCH_RNG_BUF_SIZE 2048 + +static DEFINE_SPINLOCK(arch_rng_lock); +static u8 *arch_rng_buf; +static unsigned int arch_rng_buf_idx; + +static void arch_rng_refill_buffer(struct work_struct *); +static DECLARE_DELAYED_WORK(arch_rng_work, arch_rng_refill_buffer); + +bool s390_arch_random_generate(u8 *buf, unsigned int nbytes) +{ + /* lock rng buffer */ + if (!spin_trylock(&arch_rng_lock)) + return false; + + /* try to resolve the requested amount of bytes from the buffer */ + arch_rng_buf_idx -= nbytes; + if (arch_rng_buf_idx < ARCH_RNG_BUF_SIZE) { + memcpy(buf, arch_rng_buf + arch_rng_buf_idx, nbytes); + atomic64_add(nbytes, &s390_arch_random_counter); + spin_unlock(&arch_rng_lock); + return true; + } + + /* not enough bytes in rng buffer, refill is done asynchronously */ + spin_unlock(&arch_rng_lock); + + return false; +} +EXPORT_SYMBOL(s390_arch_random_generate); + +static void arch_rng_refill_buffer(struct work_struct *unused) +{ + unsigned int delay = ARCH_REFILL_TICKS; + + spin_lock(&arch_rng_lock); + if (arch_rng_buf_idx > ARCH_RNG_BUF_SIZE) { + /* buffer is exhausted and needs refill */ + u8 seed[ARCH_PRNG_SEED_SIZE]; + u8 prng_wa[240]; + /* fetch ARCH_PRNG_SEED_SIZE bytes of entropy */ + cpacf_trng(NULL, 0, seed, sizeof(seed)); + /* blow this entropy up to ARCH_RNG_BUF_SIZE with PRNG */ + memset(prng_wa, 0, sizeof(prng_wa)); + cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, + &prng_wa, NULL, 0, seed, sizeof(seed)); + cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN, + &prng_wa, arch_rng_buf, ARCH_RNG_BUF_SIZE, NULL, 0); + arch_rng_buf_idx = ARCH_RNG_BUF_SIZE; + } + delay += (ARCH_REFILL_TICKS * arch_rng_buf_idx) / ARCH_RNG_BUF_SIZE; + spin_unlock(&arch_rng_lock); + + /* kick next check */ + queue_delayed_work(system_long_wq, &arch_rng_work, delay); +} + static int __init s390_arch_random_init(void) { - /* check if subfunction CPACF_PRNO_TRNG is available */ - if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) + /* all the needed PRNO subfunctions available ? */ + if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG) && + cpacf_query_func(CPACF_PRNO, CPACF_PRNO_SHA512_DRNG_GEN)) { + + /* alloc arch random working buffer */ + arch_rng_buf = kmalloc(ARCH_RNG_BUF_SIZE, GFP_KERNEL); + if (!arch_rng_buf) + return -ENOMEM; + + /* kick worker queue job to fill the random buffer */ + queue_delayed_work(system_long_wq, + &arch_rng_work, ARCH_REFILL_TICKS); + + /* enable arch random to the outside world */ static_branch_enable(&s390_arch_random_available); + } return 0; } diff --git a/arch/s390/include/asm/archrandom.h b/arch/s390/include/asm/archrandom.h index 09aed1095336..c67b82dfa558 100644 --- a/arch/s390/include/asm/archrandom.h +++ b/arch/s390/include/asm/archrandom.h @@ -15,16 +15,11 @@ #include #include -#include DECLARE_STATIC_KEY_FALSE(s390_arch_random_available); extern atomic64_t s390_arch_random_counter; -static void s390_arch_random_generate(u8 *buf, unsigned int nbytes) -{ - cpacf_trng(NULL, 0, buf, nbytes); - atomic64_add(nbytes, &s390_arch_random_counter); -} +bool s390_arch_random_generate(u8 *buf, unsigned int nbytes); static inline bool arch_has_random(void) { @@ -51,8 +46,7 @@ static inline bool arch_get_random_int(unsigned int *v) static inline bool arch_get_random_seed_long(unsigned long *v) { if (static_branch_likely(&s390_arch_random_available)) { - s390_arch_random_generate((u8 *)v, sizeof(*v)); - return true; + return s390_arch_random_generate((u8 *)v, sizeof(*v)); } return false; } @@ -60,8 +54,7 @@ static inline bool arch_get_random_seed_long(unsigned long *v) static inline bool arch_get_random_seed_int(unsigned int *v) { if (static_branch_likely(&s390_arch_random_available)) { - s390_arch_random_generate((u8 *)v, sizeof(*v)); - return true; + return s390_arch_random_generate((u8 *)v, sizeof(*v)); } return false; } -- cgit v1.2.3 From 2c861d89ccda2fbcea9358eff9cc5f8fae548be5 Mon Sep 17 00:00:00 2001 From: Dong Jia Shi Date: Wed, 2 May 2018 09:25:59 +0200 Subject: vfio: ccw: fix error return in vfio_ccw_sch_event If the device has not been registered, or there is work pending, we should reschedule a sch_event call again. Signed-off-by: Dong Jia Shi Message-Id: <20180502072559.50691-1-bjsdjshi@linux.vnet.ibm.com> Reviewed-by: Cornelia Huck Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_drv.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index ea6a2d0b2894..770fa9cfc310 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -177,6 +177,7 @@ static int vfio_ccw_sch_event(struct subchannel *sch, int process) { struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); unsigned long flags; + int rc = -EAGAIN; spin_lock_irqsave(sch->lock, flags); if (!device_is_registered(&sch->dev)) @@ -187,6 +188,7 @@ static int vfio_ccw_sch_event(struct subchannel *sch, int process) if (cio_update_schib(sch)) { vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_NOT_OPER); + rc = 0; goto out_unlock; } @@ -195,11 +197,12 @@ static int vfio_ccw_sch_event(struct subchannel *sch, int process) private->state = private->mdev ? VFIO_CCW_STATE_IDLE : VFIO_CCW_STATE_STANDBY; } + rc = 0; out_unlock: spin_unlock_irqrestore(sch->lock, flags); - return 0; + return rc; } static struct css_device_id vfio_ccw_sch_ids[] = { -- cgit v1.2.3 From fb9e7880af357f0244f57a3dc4dd365091970b1a Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Wed, 16 May 2018 19:33:42 +0200 Subject: vfio: ccw: push down unsupported IDA check There is at least one relevant guest OS that doesn't set the IDA flags in the ORB as we would like them, but never uses any IDA. So instead of saying -EOPNOTSUPP when observing an ORB, such that a channel program specified by it could be a not supported one, let us say -EOPNOTSUPP only if the channel program is a not supported one. Of course, the real solution would be doing proper translation for all IDA. This is possible, but given the current code not straight forward. Signed-off-by: Halil Pasic Tested-by: Jason J. Herne Message-Id: <20180516173342.15174-1-pasic@linux.ibm.com> Reviewed-by: Dong Jia Shi Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index dce92b2a895d..9a2a39df1056 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -365,6 +365,9 @@ static void cp_unpin_free(struct channel_program *cp) * This is the chain length not considering any TICs. * You need to do a new round for each TIC target. * + * The program is also validated for absence of not yet supported + * indirect data addressing scenarios. + * * Returns: the length of the ccw chain or -errno. */ static int ccwchain_calc_length(u64 iova, struct channel_program *cp) @@ -391,6 +394,14 @@ static int ccwchain_calc_length(u64 iova, struct channel_program *cp) do { cnt++; + /* + * As we don't want to fail direct addressing even if the + * orb specified one of the unsupported formats, we defer + * checking for IDAWs in unsupported formats to here. + */ + if ((!cp->orb.cmd.c64 || cp->orb.cmd.i2k) && ccw_is_idal(ccw)) + return -EOPNOTSUPP; + if ((!ccw_is_chain(ccw)) && (!ccw_is_tic(ccw))) break; @@ -656,10 +667,8 @@ int cp_init(struct channel_program *cp, struct device *mdev, union orb *orb) /* * XXX: * Only support prefetch enable mode now. - * Only support 64bit addressing idal. - * Only support 4k IDAW. */ - if (!orb->cmd.pfch || !orb->cmd.c64 || orb->cmd.i2k) + if (!orb->cmd.pfch) return -EOPNOTSUPP; INIT_LIST_HEAD(&cp->ccwchain_list); @@ -688,6 +697,10 @@ int cp_init(struct channel_program *cp, struct device *mdev, union orb *orb) ret = ccwchain_loop_tic(chain, cp); if (ret) cp_unpin_free(cp); + /* It is safe to force: if not set but idals used + * ccwchain_calc_length returns an error. + */ + cp->orb.cmd.c64 = 1; return ret; } -- cgit v1.2.3 From 80c57f7a075b0c53944113e42ce114d8bf0977e4 Mon Sep 17 00:00:00 2001 From: Dong Jia Shi Date: Wed, 23 May 2018 04:56:42 +0200 Subject: vfio: ccw: shorten kernel doc description for pfn_array_pin() The kernel doc description for usage of the struct pfn_array in pfn_array_pin() is unnecessary long. Let's shorten it by describing the contents of the struct pfn_array fields at the struct's definition instead. Suggested-by: Cornelia Huck Signed-off-by: Dong Jia Shi Message-Id: <20180523025645.8978-2-bjsdjshi@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index 9a2a39df1056..c532939c1c3f 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -23,9 +23,13 @@ #define CCWCHAIN_LEN_MAX 256 struct pfn_array { + /* Starting guest physical I/O address. */ unsigned long pa_iova; + /* Array that stores PFNs of the pages need to pin. */ unsigned long *pa_iova_pfn; + /* Array that receives PFNs of the pages pinned. */ unsigned long *pa_pfn; + /* Number of pages to pin/pinned from @pa_iova. */ int pa_nr; }; @@ -53,14 +57,8 @@ struct ccwchain { * Attempt to pin user pages in memory. * * Usage of pfn_array: - * @pa->pa_iova starting guest physical I/O address. Assigned by caller. - * @pa->pa_iova_pfn array that stores PFNs of the pages need to pin. Allocated - * by caller. - * @pa->pa_pfn array that receives PFNs of the pages pinned. Allocated by - * caller. - * @pa->pa_nr number of pages from @pa->pa_iova to pin. Assigned by - * caller. - * number of pages pinned. Assigned by callee. + * Any field in this structure should be initialized by caller. + * We expect @pa->pa_nr > 0, and its value will be assigned by callee. * * Returns: * Number of pages pinned on success. -- cgit v1.2.3 From 5c1cfb1c3948fe93a32dfcd75223dda0f1558bb7 Mon Sep 17 00:00:00 2001 From: Dong Jia Shi Date: Wed, 23 May 2018 04:56:43 +0200 Subject: vfio: ccw: refactor and improve pfn_array_alloc_pin() This refactors pfn_array_alloc_pin() and also improves it by adding defensive code in error handling so that calling pfn_array_unpin_free() after error return won't lead to problem. This mainly does: 1. Merge pfn_array_pin() into pfn_array_alloc_pin(), since there is no other user of pfn_array_pin(). As a result, also remove kernel-doc for pfn_array_pin() and add/update kernel-doc for pfn_array_alloc_pin() and struct pfn_array. 2. For a vfio_pin_pages() failure, set pa->pa_nr to zero to indicate zero pages were pinned. 3. Set pa->pa_iova_pfn to NULL right after it was freed. Suggested-by: Pierre Morel Signed-off-by: Dong Jia Shi Message-Id: <20180523025645.8978-3-bjsdjshi@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 82 +++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 46 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index c532939c1c3f..b0f20230fc72 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -29,7 +29,7 @@ struct pfn_array { unsigned long *pa_iova_pfn; /* Array that receives PFNs of the pages pinned. */ unsigned long *pa_pfn; - /* Number of pages to pin/pinned from @pa_iova. */ + /* Number of pages pinned from @pa_iova. */ int pa_nr; }; @@ -50,64 +50,33 @@ struct ccwchain { }; /* - * pfn_array_pin() - pin user pages in memory + * pfn_array_alloc_pin() - alloc memory for PFNs, then pin user pages in memory * @pa: pfn_array on which to perform the operation * @mdev: the mediated device to perform pin/unpin operations + * @iova: target guest physical address + * @len: number of bytes that should be pinned from @iova * - * Attempt to pin user pages in memory. + * Attempt to allocate memory for PFNs, and pin user pages in memory. * * Usage of pfn_array: - * Any field in this structure should be initialized by caller. - * We expect @pa->pa_nr > 0, and its value will be assigned by callee. + * We expect (pa_nr == 0) and (pa_iova_pfn == NULL), any field in + * this structure will be filled in by this function. * * Returns: * Number of pages pinned on success. - * If @pa->pa_nr is 0 or negative, returns 0. + * If @pa->pa_nr is not 0, or @pa->pa_iova_pfn is not NULL initially, + * returns -EINVAL. * If no pages were pinned, returns -errno. */ -static int pfn_array_pin(struct pfn_array *pa, struct device *mdev) -{ - int i, ret; - - if (pa->pa_nr <= 0) { - pa->pa_nr = 0; - return 0; - } - - pa->pa_iova_pfn[0] = pa->pa_iova >> PAGE_SHIFT; - for (i = 1; i < pa->pa_nr; i++) - pa->pa_iova_pfn[i] = pa->pa_iova_pfn[i - 1] + 1; - - ret = vfio_pin_pages(mdev, pa->pa_iova_pfn, pa->pa_nr, - IOMMU_READ | IOMMU_WRITE, pa->pa_pfn); - - if (ret > 0 && ret != pa->pa_nr) { - vfio_unpin_pages(mdev, pa->pa_iova_pfn, ret); - pa->pa_nr = 0; - return 0; - } - - return ret; -} - -/* Unpin the pages before releasing the memory. */ -static void pfn_array_unpin_free(struct pfn_array *pa, struct device *mdev) -{ - vfio_unpin_pages(mdev, pa->pa_iova_pfn, pa->pa_nr); - pa->pa_nr = 0; - kfree(pa->pa_iova_pfn); -} - -/* Alloc memory for PFNs, then pin pages with them. */ static int pfn_array_alloc_pin(struct pfn_array *pa, struct device *mdev, u64 iova, unsigned int len) { - int ret = 0; + int i, ret = 0; if (!len) return 0; - if (pa->pa_nr) + if (pa->pa_nr || pa->pa_iova_pfn) return -EINVAL; pa->pa_iova = iova; @@ -124,18 +93,39 @@ static int pfn_array_alloc_pin(struct pfn_array *pa, struct device *mdev, return -ENOMEM; pa->pa_pfn = pa->pa_iova_pfn + pa->pa_nr; - ret = pfn_array_pin(pa, mdev); + pa->pa_iova_pfn[0] = pa->pa_iova >> PAGE_SHIFT; + for (i = 1; i < pa->pa_nr; i++) + pa->pa_iova_pfn[i] = pa->pa_iova_pfn[i - 1] + 1; + + ret = vfio_pin_pages(mdev, pa->pa_iova_pfn, pa->pa_nr, + IOMMU_READ | IOMMU_WRITE, pa->pa_pfn); - if (ret > 0) - return ret; - else if (!ret) + if (ret < 0) { + goto err_out; + } else if (ret > 0 && ret != pa->pa_nr) { + vfio_unpin_pages(mdev, pa->pa_iova_pfn, ret); ret = -EINVAL; + goto err_out; + } + + return ret; +err_out: + pa->pa_nr = 0; kfree(pa->pa_iova_pfn); + pa->pa_iova_pfn = NULL; return ret; } +/* Unpin the pages before releasing the memory. */ +static void pfn_array_unpin_free(struct pfn_array *pa, struct device *mdev) +{ + vfio_unpin_pages(mdev, pa->pa_iova_pfn, pa->pa_nr); + pa->pa_nr = 0; + kfree(pa->pa_iova_pfn); +} + static int pfn_array_table_init(struct pfn_array_table *pat, int nr) { pat->pat_pa = kcalloc(nr, sizeof(*pat->pat_pa), GFP_KERNEL); -- cgit v1.2.3 From 6238f92132a6da64b731de1a728fa46ffaa21f62 Mon Sep 17 00:00:00 2001 From: Dong Jia Shi Date: Wed, 23 May 2018 04:56:44 +0200 Subject: vfio: ccw: set ccw->cda to NULL defensively Let's avoid free on ccw->cda that points to a guest address or an already freed memory area by setting it to NULL if memory allocation didn't happen or failed. Signed-off-by: Dong Jia Shi Message-Id: <20180523025645.8978-4-bjsdjshi@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/vfio_ccw_cp.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_cp.c b/drivers/s390/cio/vfio_ccw_cp.c index b0f20230fc72..dbe7c7ac9ac8 100644 --- a/drivers/s390/cio/vfio_ccw_cp.c +++ b/drivers/s390/cio/vfio_ccw_cp.c @@ -502,7 +502,7 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, struct ccw1 *ccw; struct pfn_array_table *pat; unsigned long *idaws; - int idaw_nr; + int ret; ccw = chain->ch_ccw + idx; @@ -522,18 +522,19 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, * needed when translating a direct ccw to a idal ccw. */ pat = chain->ch_pat + idx; - if (pfn_array_table_init(pat, 1)) - return -ENOMEM; - idaw_nr = pfn_array_alloc_pin(pat->pat_pa, cp->mdev, - ccw->cda, ccw->count); - if (idaw_nr < 0) - return idaw_nr; + ret = pfn_array_table_init(pat, 1); + if (ret) + goto out_init; + + ret = pfn_array_alloc_pin(pat->pat_pa, cp->mdev, ccw->cda, ccw->count); + if (ret < 0) + goto out_init; /* Translate this direct ccw to a idal ccw. */ - idaws = kcalloc(idaw_nr, sizeof(*idaws), GFP_DMA | GFP_KERNEL); + idaws = kcalloc(ret, sizeof(*idaws), GFP_DMA | GFP_KERNEL); if (!idaws) { - pfn_array_table_unpin_free(pat, cp->mdev); - return -ENOMEM; + ret = -ENOMEM; + goto out_unpin; } ccw->cda = (__u32) virt_to_phys(idaws); ccw->flags |= CCW_FLAG_IDA; @@ -541,6 +542,12 @@ static int ccwchain_fetch_direct(struct ccwchain *chain, pfn_array_table_idal_create_words(pat, idaws); return 0; + +out_unpin: + pfn_array_table_unpin_free(pat, cp->mdev); +out_init: + ccw->cda = 0; + return ret; } static int ccwchain_fetch_idal(struct ccwchain *chain, @@ -570,7 +577,7 @@ static int ccwchain_fetch_idal(struct ccwchain *chain, pat = chain->ch_pat + idx; ret = pfn_array_table_init(pat, idaw_nr); if (ret) - return ret; + goto out_init; /* Translate idal ccw to use new allocated idaws. */ idaws = kzalloc(idaw_len, GFP_DMA | GFP_KERNEL); @@ -602,6 +609,8 @@ out_free_idaws: kfree(idaws); out_unpin: pfn_array_table_unpin_free(pat, cp->mdev); +out_init: + ccw->cda = 0; return ret; } -- cgit v1.2.3 From 3cd90214b70f7f971496bffc3c34d23b2141feb3 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Wed, 23 May 2018 04:56:45 +0200 Subject: vfio: ccw: add tracepoints for interesting error paths Add some tracepoints so we can inspect what is not working as is should. Signed-off-by: Halil Pasic Signed-off-by: Dong Jia Shi Message-Id: <20180523025645.8978-5-bjsdjshi@linux.ibm.com> Signed-off-by: Cornelia Huck --- drivers/s390/cio/Makefile | 1 + drivers/s390/cio/vfio_ccw_fsm.c | 17 +++++++++++- drivers/s390/cio/vfio_ccw_trace.h | 54 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 drivers/s390/cio/vfio_ccw_trace.h diff --git a/drivers/s390/cio/Makefile b/drivers/s390/cio/Makefile index a070ef0efe65..f230516abb96 100644 --- a/drivers/s390/cio/Makefile +++ b/drivers/s390/cio/Makefile @@ -5,6 +5,7 @@ # The following is required for define_trace.h to find ./trace.h CFLAGS_trace.o := -I$(src) +CFLAGS_vfio_ccw_fsm.o := -I$(src) obj-y += airq.o blacklist.o chsc.o cio.o css.o chp.o idset.o isc.o \ fcx.o itcw.o crw.o ccwreq.o trace.o ioasm.o diff --git a/drivers/s390/cio/vfio_ccw_fsm.c b/drivers/s390/cio/vfio_ccw_fsm.c index 3c800642134e..797a82731159 100644 --- a/drivers/s390/cio/vfio_ccw_fsm.c +++ b/drivers/s390/cio/vfio_ccw_fsm.c @@ -13,6 +13,9 @@ #include "ioasm.h" #include "vfio_ccw_private.h" +#define CREATE_TRACE_POINTS +#include "vfio_ccw_trace.h" + static int fsm_io_helper(struct vfio_ccw_private *private) { struct subchannel *sch; @@ -110,6 +113,10 @@ static void fsm_disabled_irq(struct vfio_ccw_private *private, */ cio_disable_subchannel(sch); } +inline struct subchannel_id get_schid(struct vfio_ccw_private *p) +{ + return p->sch->schid; +} /* * Deal with the ccw command request from the userspace. @@ -121,6 +128,7 @@ static void fsm_io_request(struct vfio_ccw_private *private, union scsw *scsw = &private->scsw; struct ccw_io_region *io_region = &private->io_region; struct mdev_device *mdev = private->mdev; + char *errstr = "request"; private->state = VFIO_CCW_STATE_BOXED; @@ -132,15 +140,19 @@ static void fsm_io_request(struct vfio_ccw_private *private, /* Don't try to build a cp if transport mode is specified. */ if (orb->tm.b) { io_region->ret_code = -EOPNOTSUPP; + errstr = "transport mode"; goto err_out; } io_region->ret_code = cp_init(&private->cp, mdev_dev(mdev), orb); - if (io_region->ret_code) + if (io_region->ret_code) { + errstr = "cp init"; goto err_out; + } io_region->ret_code = cp_prefetch(&private->cp); if (io_region->ret_code) { + errstr = "cp prefetch"; cp_free(&private->cp); goto err_out; } @@ -148,6 +160,7 @@ static void fsm_io_request(struct vfio_ccw_private *private, /* Start channel program and wait for I/O interrupt. */ io_region->ret_code = fsm_io_helper(private); if (io_region->ret_code) { + errstr = "cp fsm_io_helper"; cp_free(&private->cp); goto err_out; } @@ -164,6 +177,8 @@ static void fsm_io_request(struct vfio_ccw_private *private, err_out: private->state = VFIO_CCW_STATE_IDLE; + trace_vfio_ccw_io_fctl(scsw->cmd.fctl, get_schid(private), + io_region->ret_code, errstr); } /* diff --git a/drivers/s390/cio/vfio_ccw_trace.h b/drivers/s390/cio/vfio_ccw_trace.h new file mode 100644 index 000000000000..b1da53ddec1f --- /dev/null +++ b/drivers/s390/cio/vfio_ccw_trace.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Tracepoints for vfio_ccw driver + * + * Copyright IBM Corp. 2018 + * + * Author(s): Dong Jia Shi + * Halil Pasic + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vfio_ccw + +#if !defined(_VFIO_CCW_TRACE_) || defined(TRACE_HEADER_MULTI_READ) +#define _VFIO_CCW_TRACE_ + +#include + +TRACE_EVENT(vfio_ccw_io_fctl, + TP_PROTO(int fctl, struct subchannel_id schid, int errno, char *errstr), + TP_ARGS(fctl, schid, errno, errstr), + + TP_STRUCT__entry( + __field(int, fctl) + __field_struct(struct subchannel_id, schid) + __field(int, errno) + __field(char*, errstr) + ), + + TP_fast_assign( + __entry->fctl = fctl; + __entry->schid = schid; + __entry->errno = errno; + __entry->errstr = errstr; + ), + + TP_printk("schid=%x.%x.%04x fctl=%x errno=%d info=%s", + __entry->schid.cssid, + __entry->schid.ssid, + __entry->schid.sch_no, + __entry->fctl, + __entry->errno, + __entry->errstr) +); + +#endif /* _VFIO_CCW_TRACE_ */ + +/* This part must be outside protection */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE vfio_ccw_trace + +#include -- cgit v1.2.3 From 069035c5db3459b9b5f12caf3bffed9a863fa5c4 Mon Sep 17 00:00:00 2001 From: Oleksandr Andrushchenko Date: Tue, 22 May 2018 17:13:04 +0300 Subject: drm: Fix possible race conditions while unplugging DRM device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When unplugging a hotpluggable DRM device we first unregister it with drm_dev_unregister and then set drm_device.unplugged flag which is used to mark device critical sections with drm_dev_enter()/ drm_dev_exit() preventing access to device resources that are not available after the device is gone. But drm_dev_unregister may lead to hotplug uevent(s) fired to user-space on card and/or connector removal, thus making it possible for user-space to try accessing a disconnected device. Fix this by first making sure device is properly marked as disconnected and only then unregister it. Fixes: bee330f3d672 ("drm: Use srcu to protect drm_device.unplugged") Signed-off-by: Oleksandr Andrushchenko Reported-by: Andrii Chepurnyi Cc: "Noralf Trønnes" Reviewed-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20180522141304.18646-1-andr2000@gmail.com --- drivers/gpu/drm/drm_drv.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c index f6910ebe4d0e..cc2675550e28 100644 --- a/drivers/gpu/drm/drm_drv.c +++ b/drivers/gpu/drm/drm_drv.c @@ -369,13 +369,6 @@ EXPORT_SYMBOL(drm_dev_exit); */ void drm_dev_unplug(struct drm_device *dev) { - drm_dev_unregister(dev); - - mutex_lock(&drm_global_mutex); - if (dev->open_count == 0) - drm_dev_put(dev); - mutex_unlock(&drm_global_mutex); - /* * After synchronizing any critical read section is guaranteed to see * the new value of ->unplugged, and any critical section which might @@ -384,6 +377,13 @@ void drm_dev_unplug(struct drm_device *dev) */ dev->unplugged = true; synchronize_srcu(&drm_unplug_srcu); + + drm_dev_unregister(dev); + + mutex_lock(&drm_global_mutex); + if (dev->open_count == 0) + drm_dev_put(dev); + mutex_unlock(&drm_global_mutex); } EXPORT_SYMBOL(drm_dev_unplug); -- cgit v1.2.3 From 889ad63d41eea20184b0483e7e585e5b20fb6cfe Mon Sep 17 00:00:00 2001 From: Jeremy Cline Date: Fri, 1 Jun 2018 16:05:32 -0400 Subject: drm/qxl: Call qxl_bo_unref outside atomic context "qxl_bo_unref" may sleep, but calling "qxl_release_map" causes "preempt_disable()" to be called and "preempt_enable()" isn't called until "qxl_release_unmap" is used. Move the call to "qxl_bo_unref" out from in between the two to avoid sleeping from an atomic context. This issue can be demonstrated on a kernel with CONFIG_LOCKDEP=y by creating a VM using QXL, using a desktop environment using Xorg, then moving the cursor on or off a window. Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1571128 Fixes: 9428088c90b6 ("drm/qxl: reapply cursor after resetting primary") Cc: stable@vger.kernel.org Signed-off-by: Jeremy Cline Link: http://patchwork.freedesktop.org/patch/msgid/20180601200532.13619-1-jcline@redhat.com Signed-off-by: Gerd Hoffmann --- drivers/gpu/drm/qxl/qxl_display.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/qxl/qxl_display.c b/drivers/gpu/drm/qxl/qxl_display.c index ecb35ed0eac8..61e51516fec5 100644 --- a/drivers/gpu/drm/qxl/qxl_display.c +++ b/drivers/gpu/drm/qxl/qxl_display.c @@ -630,7 +630,7 @@ static void qxl_cursor_atomic_update(struct drm_plane *plane, struct qxl_cursor_cmd *cmd; struct qxl_cursor *cursor; struct drm_gem_object *obj; - struct qxl_bo *cursor_bo = NULL, *user_bo = NULL; + struct qxl_bo *cursor_bo = NULL, *user_bo = NULL, *old_cursor_bo = NULL; int ret; void *user_ptr; int size = 64*64*4; @@ -684,7 +684,7 @@ static void qxl_cursor_atomic_update(struct drm_plane *plane, cursor_bo, 0); cmd->type = QXL_CURSOR_SET; - qxl_bo_unref(&qcrtc->cursor_bo); + old_cursor_bo = qcrtc->cursor_bo; qcrtc->cursor_bo = cursor_bo; cursor_bo = NULL; } else { @@ -704,6 +704,9 @@ static void qxl_cursor_atomic_update(struct drm_plane *plane, qxl_push_cursor_ring_release(qdev, release, QXL_CMD_CURSOR, false); qxl_release_fence_buffer_objects(release); + if (old_cursor_bo) + qxl_bo_unref(&old_cursor_bo); + qxl_bo_unref(&cursor_bo); return; -- cgit v1.2.3 From 92d34134193e5b129dc24f8d79cb9196626e8d7a Mon Sep 17 00:00:00 2001 From: Shankara Pailoor Date: Tue, 5 Jun 2018 08:33:27 -0500 Subject: jfs: Fix inconsistency between memory allocation and ea_buf->max_size The code is assuming the buffer is max_size length, but we weren't allocating enough space for it. Signed-off-by: Shankara Pailoor Signed-off-by: Dave Kleikamp --- fs/jfs/xattr.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index c60f3d32ee91..a6797986b625 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -491,15 +491,17 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) if (size > PSIZE) { /* * To keep the rest of the code simple. Allocate a - * contiguous buffer to work with + * contiguous buffer to work with. Make the buffer large + * enough to make use of the whole extent. */ - ea_buf->xattr = kmalloc(size, GFP_KERNEL); + ea_buf->max_size = (size + sb->s_blocksize - 1) & + ~(sb->s_blocksize - 1); + + ea_buf->xattr = kmalloc(ea_buf->max_size, GFP_KERNEL); if (ea_buf->xattr == NULL) return -ENOMEM; ea_buf->flag = EA_MALLOC; - ea_buf->max_size = (size + sb->s_blocksize - 1) & - ~(sb->s_blocksize - 1); if (ea_size == 0) return 0; -- cgit v1.2.3 From 41477acf092251eb0cfe83068f48dbcb2521478a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 7 Jun 2018 14:19:54 -0300 Subject: perf hists: Save the callchain_size in struct hist_entry So that we can figure out the real size of the struct and also be able to tell if callchains may be present in this histogram entry. Since we can't always guarantee that from hist_entry->hists we can use hists_to_evsel, to then look at evsel->attr.sample_type for PERF_SAMPLE_CALLCHAIN, like with the 'perf c2c' tool, that uses plain 'struct hists' instances, we need another way of deciding if a specific hist_entry instance has callchains associated with it, i.e. if its hist_entry->callchain[0] has space allocated for. Cc: Adrian Hunter Cc: David Ahern Cc: Jin Yao Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: https://lkml.kernel.org/n/tip-ptvndealxs1k7myluvu9flnq@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 6 ++++-- tools/perf/util/sort.h | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 52e8fda93a47..0441a92b855f 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -370,9 +370,11 @@ void hists__delete_entries(struct hists *hists) static int hist_entry__init(struct hist_entry *he, struct hist_entry *template, - bool sample_self) + bool sample_self, + size_t callchain_size) { *he = *template; + he->callchain_size = callchain_size; if (symbol_conf.cumulate_callchain) { he->stat_acc = malloc(sizeof(he->stat)); @@ -473,7 +475,7 @@ static struct hist_entry *hist_entry__new(struct hist_entry *template, he = ops->new(callchain_size); if (he) { - err = hist_entry__init(he, template, sample_self); + err = hist_entry__init(he, template, sample_self, callchain_size); if (err) { ops->free(he); he = NULL; diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 7cf2d5cc038e..9ab9257ed887 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -112,6 +112,8 @@ struct hist_entry { char level; u8 filtered; + + u16 callchain_size; union { /* * Since perf diff only supports the stdio output, TUI -- cgit v1.2.3 From e5654455795f2f89328f7b301dacb6926e57e2b8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 7 Jun 2018 14:27:19 -0300 Subject: perf hists: Make hist_entry__has_callchains() work with 'perf c2c' Since 'perf c2c' uses 'struct hists' not allocated together with a 'struct perf_evsel' instance, we can't go from a 'struct hist_entry' pointer to a 'struct perf_evsel' via he->hists, so, instead, check if space was set aside for hist_entry->callchain[0] at hist_entry__new() time. Reported-by: Jin Yao Reported-by: Jiri Olsa Cc: Adrian Hunter Cc: David Ahern Cc: Namhyung Kim Cc: Wang Nan Fixes: fabd37b837f6 ("perf hists: Check if a hist_entry has callchains before using them") Link: https://lkml.kernel.org/n/tip-e8ife8djvvvwmeze3s4yodii@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/sort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 9ab9257ed887..8bf302cafcec 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -155,7 +155,7 @@ struct hist_entry { static __pure inline bool hist_entry__has_callchains(struct hist_entry *he) { - return hists__has_callchains(he->hists); + return he->callchain_size != 0; } static inline bool hist_entry__has_pairs(struct hist_entry *he) -- cgit v1.2.3 From 29f9fcdd3f8edccad5809cf939ce921752460fe7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 7 Jun 2018 14:33:31 -0300 Subject: perf hists browser gtk: Use hist_entry__has_callchains() Since we can't go from struct hists to struct evsel for all cases (c2c is an exception) and we have access to the hist_entry, use hist_entry__has_callchains() in the GTK+ hists browser to figure out if callchains are available. Cc: Adrian Hunter Cc: David Ahern Cc: Jin Yao Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: https://lkml.kernel.org/n/tip-8owkgrruzzi5emvblwh4e6le@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/ui/gtk/hists.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/ui/gtk/hists.c b/tools/perf/ui/gtk/hists.c index b085f1b3e34d..4ab663ec3e5e 100644 --- a/tools/perf/ui/gtk/hists.c +++ b/tools/perf/ui/gtk/hists.c @@ -382,7 +382,7 @@ static void perf_gtk__show_hists(GtkWidget *window, struct hists *hists, gtk_tree_store_set(store, &iter, col_idx++, s, -1); } - if (hists__has_callchains(hists) && + if (hist_entry__has_callchains(h) && symbol_conf.use_callchain && hists__has(hists, sym)) { if (callchain_param.mode == CHAIN_GRAPH_REL) total = symbol_conf.cumulate_callchain ? -- cgit v1.2.3 From c9d366287042489090da0391318df528bdce9941 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 7 Jun 2018 14:42:27 -0300 Subject: perf hists: Reimplement hists__has_callchains() There are places where we have only access to struct hists and need to know if any of its hist_entries has callchains, like when drawing headers for the various output modes (stdio, TUI, etc), so, when adding a new hist_entry, check if it has callchains, storing this info for later use by hists__has_callchains(). This reimplementation is necessary because not always a 'struct hists' is allocated together with a 'struct perf evsel', so we can't go from 'hists' to 'perf_event_attr.sample_type & PERF_SAMPLE_CALLCHAIN'. Cc: Adrian Hunter Cc: David Ahern Cc: Jin Yao Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: https://lkml.kernel.org/n/tip-hg5g7yddjio3ljwyqnnaj5dt@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/hist.c | 6 ++++-- tools/perf/util/hist.h | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 0441a92b855f..828cb9794c76 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -621,9 +621,11 @@ __hists__add_entry(struct hists *hists, .raw_data = sample->raw_data, .raw_size = sample->raw_size, .ops = ops, - }; + }, *he = hists__findnew_entry(hists, &entry, al, sample_self); - return hists__findnew_entry(hists, &entry, al, sample_self); + if (!hists->has_callchains && he && he->callchain_size != 0) + hists->has_callchains = true; + return he; } struct hist_entry *hists__add_entry(struct hists *hists, diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 06607c434949..73049f7f0f60 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -85,6 +85,7 @@ struct hists { struct events_stats stats; u64 event_stream; u16 col_len[HISTC_NR_COLS]; + bool has_callchains; int socket_filter; struct perf_hpp_list *hpp_list; struct list_head hpp_formats; @@ -222,8 +223,7 @@ static inline struct hists *evsel__hists(struct perf_evsel *evsel) static __pure inline bool hists__has_callchains(struct hists *hists) { - const struct perf_evsel *evsel = hists_to_evsel(hists); - return evsel__has_callchain(evsel); + return hists->has_callchains; } int hists__init(void); -- cgit v1.2.3 From f7fa827f5f432a0b1f34e10fc49da93aeef9f817 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Jun 2018 00:15:05 +0200 Subject: perf tools: Fix error index for pmu event parser For events we provide specific error message we need to set error column index, PMU parser is missing that, adding it. Before: $ perf stat -e cycles,krava/cycles/ kill event syntax error: 'cycles,krava/cycles/' \___ Cannot find PMU `krava'. Missing kernel support? After: $ perf stat -e cycles,krava/cycles/ kill event syntax error: 'cycles,krava/cycles/' \___ Cannot find PMU `krava'. Missing kernel support? Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/20180606221513.11302-3-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.y | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 155d2570274f..da8fe57691b8 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -227,11 +227,16 @@ event_def: event_pmu | event_pmu: PE_NAME opt_pmu_config { + struct parse_events_state *parse_state = _parse_state; + struct parse_events_error *error = parse_state->error; struct list_head *list, *orig_terms, *terms; if (parse_events_copy_term_list($2, &orig_terms)) YYABORT; + if (error) + error->idx = @1.first_column; + ALLOC_LIST(list); if (parse_events_add_pmu(_parse_state, list, $1, $2, false, false)) { struct perf_pmu *pmu = NULL; -- cgit v1.2.3 From 9660e08ee8cbc94ac835f2c30576c6e51fbece8f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Jun 2018 00:15:06 +0200 Subject: perf stat: Add --interval-clear option Adding --interval-clear option to clear the screen before next interval. Committer testing: # perf stat -I 1000 --interval-clear And, as expected, it behaves almost like: # watch -n 0 perf stat -a sleep 1 Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/20180606221513.11302-4-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-stat.txt | 3 +++ tools/perf/builtin-stat.c | 11 +++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 5dfe102fb5b5..b10a90b6a718 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -178,6 +178,9 @@ Print count deltas for fixed number of times. This option should be used together with "-I" option. example: 'perf stat -I 1000 --interval-count 2 -e cycles -a' +--interval-clear:: +Clear the screen before next interval. + --timeout msecs:: Stop the 'perf stat' session and print count deltas after N milliseconds (minimum: 10 ms). This option is not supported with the "-I" option. diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 096ccb25c11f..f1532e3ac7d7 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -65,6 +65,7 @@ #include "util/tool.h" #include "util/string2.h" #include "util/metricgroup.h" +#include "util/top.h" #include "asm/bug.h" #include @@ -173,6 +174,7 @@ static struct cpu_map *aggr_map; static aggr_get_id_t aggr_get_id; static bool append_file; static bool interval_count; +static bool interval_clear; static const char *output_name; static int output_fd; static int print_free_counters_hint; @@ -1704,9 +1706,12 @@ static void print_interval(char *prefix, struct timespec *ts) FILE *output = stat_config.output; static int num_print_interval; + if (interval_clear) + puts(CONSOLE_CLEAR); + sprintf(prefix, "%6lu.%09lu%s", ts->tv_sec, ts->tv_nsec, csv_sep); - if (num_print_interval == 0 && !csv_output) { + if ((num_print_interval == 0 && !csv_output) || interval_clear) { switch (stat_config.aggr_mode) { case AGGR_SOCKET: fprintf(output, "# time socket cpus"); @@ -1738,7 +1743,7 @@ static void print_interval(char *prefix, struct timespec *ts) } } - if (num_print_interval == 0 && metric_only) + if ((num_print_interval == 0 && metric_only) || interval_clear) print_metric_headers(" ", true); if (++num_print_interval == 25) num_print_interval = 0; @@ -2057,6 +2062,8 @@ static const struct option stat_options[] = { "(overhead is possible for values <= 100ms)"), OPT_INTEGER(0, "interval-count", &stat_config.times, "print counts for fixed number of times"), + OPT_BOOLEAN(0, "interval-clear", &interval_clear, + "clear screen in between new interval"), OPT_UINTEGER(0, "timeout", &stat_config.timeout, "stop workload and print counts after a timeout period in ms (>= 10ms)"), OPT_SET_UINT(0, "per-socket", &stat_config.aggr_mode, -- cgit v1.2.3 From b37d33edbf41b532ddd156707c037c6f4784e40b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Jun 2018 00:15:07 +0200 Subject: perf stat: Use only color_fprintf call in print_metric_only We can call color_fprintf also for non color case, it's handled properly. This change simplifies following patch. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/20180606221513.11302-5-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index f1532e3ac7d7..9e7b6f108956 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1008,10 +1008,7 @@ static void print_metric_only(void *ctx, const char *color, const char *fmt, if (!valid_only_metric(unit)) return; unit = fixunit(buf, os->evsel, unit); - if (color) - n = color_fprintf(out, color, fmt, val); - else - n = fprintf(out, fmt, val); + n = color_fprintf(out, color ?: "", fmt, val); if (n > METRIC_ONLY_LEN) n = METRIC_ONLY_LEN; if (mlen < strlen(unit)) -- cgit v1.2.3 From f515572734fb323aa0efe9ea2c546cd7fee327f7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Jun 2018 00:15:08 +0200 Subject: perf stat: Fix metric column header display alignment Make the metric only display aligned. Before: # perf stat --topdown -I 1000 # time core cpus retiring bad speculation frontend bound backend bound 1.000394323 S0-C0 2 37.4% 12.0% 31.4% 19.2% 1.000394323 S0-C1 2 25.1% 9.2% 43.8% 21.9% 2.001521204 S0-C0 2 36.4% 11.4% 32.4% 19.8% 2.001521204 S0-C1 2 26.2% 9.4% 43.1% 21.3% 3.001930208 S0-C0 2 35.1% 10.7% 33.6% 20.6% 3.001930208 S0-C1 2 28.9% 10.0% 40.0% 21.1% After: # perf stat --topdown -I 1000 # time core cpus retiring bad speculation frontend bound backend bound 1.000303722 S0-C0 2 34.2% 7.6% 34.2% 24.0% 1.000303722 S0-C1 2 33.1% 6.4% 36.9% 23.6% 2.001281055 S0-C0 2 34.6% 6.7% 36.8% 21.8% 2.001281055 S0-C1 2 32.8% 7.1% 38.1% 22.0% 3.001546080 S0-C0 2 39.3% 5.5% 32.7% 22.5% 3.001546080 S0-C1 2 37.8% 6.0% 33.1% 23.1% Signed-off-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/20180606221513.11302-6-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 9e7b6f108956..8f3fdc052728 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1001,19 +1001,20 @@ static void print_metric_only(void *ctx, const char *color, const char *fmt, { struct outstate *os = ctx; FILE *out = os->fh; - int n; - char buf[1024]; + char buf[1024], str[1024]; unsigned mlen = METRIC_ONLY_LEN; if (!valid_only_metric(unit)) return; unit = fixunit(buf, os->evsel, unit); - n = color_fprintf(out, color ?: "", fmt, val); - if (n > METRIC_ONLY_LEN) - n = METRIC_ONLY_LEN; if (mlen < strlen(unit)) mlen = strlen(unit) + 1; - fprintf(out, "%*s", mlen - n, ""); + + if (color) + mlen += strlen(color) + sizeof(PERF_COLOR_RESET) - 1; + + color_snprintf(str, sizeof(str), color ?: "", fmt, val); + fprintf(out, "%*s ", mlen, str); } static void print_metric_only_csv(void *ctx, const char *color __maybe_unused, @@ -1053,7 +1054,7 @@ static void print_metric_header(void *ctx, const char *color __maybe_unused, if (csv_output) fprintf(os->fh, "%s%s", unit, csv_sep); else - fprintf(os->fh, "%-*s ", METRIC_ONLY_LEN, unit); + fprintf(os->fh, "%*s ", METRIC_ONLY_LEN, unit); } static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg) @@ -1721,7 +1722,7 @@ static void print_interval(char *prefix, struct timespec *ts) fprintf(output, " counts %*s events\n", unit_width, "unit"); break; case AGGR_NONE: - fprintf(output, "# time CPU"); + fprintf(output, "# time CPU "); if (!metric_only) fprintf(output, " counts %*s events\n", unit_width, "unit"); break; -- cgit v1.2.3 From c1a1f5d9da800dc715d8c1d8a9692c63c70c2955 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Jun 2018 00:15:09 +0200 Subject: perf stat: Allow to specify specific metric column len The following change will introduce new metrics, that doesn't need such wide hard coded spacing. Switch METRIC_ONLY_LEN macro usage with metric_only_len variable. Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/20180606221513.11302-7-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 8f3fdc052728..3fc1f5286d50 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -145,6 +145,8 @@ static struct target target = { typedef int (*aggr_get_id_t)(struct cpu_map *m, int cpu); +#define METRIC_ONLY_LEN 20 + static int run_count = 1; static bool no_inherit = false; static volatile pid_t child_pid = -1; @@ -182,6 +184,7 @@ static int print_mixed_hw_group_error; static u64 *walltime_run; static bool ru_display = false; static struct rusage ru_data; +static unsigned int metric_only_len = METRIC_ONLY_LEN; struct perf_stat { bool record; @@ -969,8 +972,6 @@ static void print_metric_csv(void *ctx, fprintf(out, "%s%s%s%s", csv_sep, vals, csv_sep, unit); } -#define METRIC_ONLY_LEN 20 - /* Filter out some columns that don't work well in metrics only mode */ static bool valid_only_metric(const char *unit) @@ -1002,7 +1003,7 @@ static void print_metric_only(void *ctx, const char *color, const char *fmt, struct outstate *os = ctx; FILE *out = os->fh; char buf[1024], str[1024]; - unsigned mlen = METRIC_ONLY_LEN; + unsigned mlen = metric_only_len; if (!valid_only_metric(unit)) return; @@ -1054,7 +1055,7 @@ static void print_metric_header(void *ctx, const char *color __maybe_unused, if (csv_output) fprintf(os->fh, "%s%s", unit, csv_sep); else - fprintf(os->fh, "%*s ", METRIC_ONLY_LEN, unit); + fprintf(os->fh, "%*s ", metric_only_len, unit); } static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg) -- cgit v1.2.3 From a5cfa6217c94a1f1cfad4481fc14f5fc399abde3 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 7 Jun 2018 00:15:10 +0200 Subject: perf stat: Add event parsing error handling to add_default_attributes Add missing error handling for parse_events calls in add_default_attributes functions. The error handler displays error details, like for transactions (-T): Before: $ perf stat -T Cannot set up transaction events After: $ perf stat -T Cannot set up transaction events event syntax error: '..cycles,cpu/cycles-t/,cpu/tx-start/,cpu/el-start/,cpu/cycles-ct/}' \___ unknown term Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: David Ahern Cc: Frederic Weisbecker Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/20180606221513.11302-8-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 3fc1f5286d50..22547a490e1f 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2442,14 +2442,13 @@ static int add_default_attributes(void) (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, }; + struct parse_events_error errinfo; /* Set attrs if no event is selected and !null_run: */ if (null_run) return 0; if (transaction_run) { - struct parse_events_error errinfo; - if (pmu_have_event("cpu", "cycles-ct") && pmu_have_event("cpu", "el-start")) err = parse_events(evsel_list, transaction_attrs, @@ -2460,6 +2459,7 @@ static int add_default_attributes(void) &errinfo); if (err) { fprintf(stderr, "Cannot set up transaction events\n"); + parse_events_print_error(&errinfo, transaction_attrs); return -1; } return 0; @@ -2485,10 +2485,11 @@ static int add_default_attributes(void) pmu_have_event("msr", "smi")) { if (!force_metric_only) metric_only = true; - err = parse_events(evsel_list, smi_cost_attrs, NULL); + err = parse_events(evsel_list, smi_cost_attrs, &errinfo); } else { fprintf(stderr, "To measure SMI cost, it needs " "msr/aperf/, msr/smi/ and cpu/cycles/ support\n"); + parse_events_print_error(&errinfo, smi_cost_attrs); return -1; } if (err) { @@ -2523,12 +2524,13 @@ static int add_default_attributes(void) if (topdown_attrs[0] && str) { if (warn) arch_topdown_group_warn(); - err = parse_events(evsel_list, str, NULL); + err = parse_events(evsel_list, str, &errinfo); if (err) { fprintf(stderr, "Cannot set up top down events %s: %d\n", str, err); free(str); + parse_events_print_error(&errinfo, str); return -1; } } else { -- cgit v1.2.3 From c7d606f560e4c698884697fef503e4abacdd8c25 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 25 May 2018 14:41:39 -0700 Subject: x86/mce: Improve error message when kernel cannot recover Since we added support to add recovery from some errors inside the kernel in: commit b2f9d678e28c ("x86/mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries") we have done a less than stellar job at reporting the cause of recoverable machine checks that occur in other parts of the kernel. The user just gets the unhelpful message: mce: [Hardware Error]: Machine check: Action required: unknown MCACOD doubly unhelpful when they check the manual for the reported IA32_MSR_STATUS.MCACOD and see that it is listed as one of the standard recoverable values. Add an extra rule to the MCE severity table to catch this case and report it as: mce: [Hardware Error]: Machine check: Data load in unrecoverable area of kernel Fixes: b2f9d678e28c ("x86/mce: Check for faults tagged in EXTABLE_CLASS_FAULT exception table entries") Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Cc: Qiuxu Zhuo Cc: Ashok Raj Cc: stable@vger.kernel.org # 4.6+ Cc: Dan Williams Cc: Borislav Petkov Link: https://lkml.kernel.org/r/4cc7c465150a9a48b8b9f45d0b840278e77eb9b5.1527283897.git.tony.luck@intel.com --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 5bbd06f38ff6..f34d89c01edc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -160,6 +160,11 @@ static struct severity { SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), USER ), + MCESEV( + PANIC, "Data load in unrecoverable area of kernel", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + KERNEL + ), #endif MCESEV( PANIC, "Action required: unknown MCACOD", -- cgit v1.2.3 From 4c5717da1d021cf368eabb3cb1adcaead56c0d1e Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 25 May 2018 14:42:09 -0700 Subject: x86/mce: Check for alternate indication of machine check recovery on Skylake Currently we just check the "CAPID0" register to see whether the CPU can recover from machine checks. But there are also some special SKUs which do not have all advanced RAS features, but do enable machine check recovery for use with NVDIMMs. Add a check for any of bits {8:5} in the "CAPID5" register (each reports some NVDIMM mode available, if any of them are set, then the system supports memory machine check recovery). Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Cc: Qiuxu Zhuo Cc: Ashok Raj Cc: stable@vger.kernel.org # 4.9 Cc: Dan Williams Cc: Borislav Petkov Link: https://lkml.kernel.org/r/03cbed6e99ddafb51c2eadf9a3b7c8d7a0cc204e.1527283897.git.tony.luck@intel.com --- arch/x86/kernel/quirks.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 697a4ce04308..736348ead421 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -645,12 +645,19 @@ static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev) /* Skylake */ static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev) { - u32 capid0; + u32 capid0, capid5; pci_read_config_dword(pdev, 0x84, &capid0); + pci_read_config_dword(pdev, 0x98, &capid5); - if ((capid0 & 0xc0) == 0xc0) + /* + * CAPID0{7:6} indicate whether this is an advanced RAS SKU + * CAPID5{8:5} indicate that various NVDIMM usage modes are + * enabled, so memory machine check recovery is also enabled. + */ + if ((capid0 & 0xc0) == 0xc0 || (capid5 & 0x1e0)) static_branch_inc(&mcsafe_key); + } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap); -- cgit v1.2.3 From 4c8205273626f27b9e5a64bdc194ab483a8cce66 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Jun 2018 02:22:11 +0200 Subject: perf c2c: Keep struct hist_entry at the end of struct c2c_hist_entry Exactly as the comment just before 'struct c2c_hist_entry" says, i.e. the last entry in struct hist_entry is a zero length array, that when allocating space for hist_entry gets extra space if callchains are in use, which, if hist_entry is not at the end of c2c_hist_entry, the members after it gets corrupted when callchains get added to the rb trees collecting them, etc. Signed-off-by: Jiri Olsa Reported-by: Arnaldo Carvalho de Melo Cc: Jin Yao Fixes: 7f834c2e84bb ("perf c2c report: Display node for cacheline address") Link: http://lkml.kernel.org/n/tip-bh0ke4fh2ygpj3yowna7o1di@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-c2c.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index 307b3594525f..6a8738f7ead3 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -56,16 +56,16 @@ struct c2c_hist_entry { struct compute_stats cstats; + unsigned long paddr; + unsigned long paddr_cnt; + bool paddr_zero; + char *nodestr; + /* * must be at the end, * because of its callchain dynamic entry */ struct hist_entry he; - - unsigned long paddr; - unsigned long paddr_cnt; - bool paddr_zero; - char *nodestr; }; static char const *coalesce_default = "pid,iaddr"; -- cgit v1.2.3 From fad76d4333fe73cf3f73704aa34d4ce523b1c458 Mon Sep 17 00:00:00 2001 From: Seeteena Thoufeek Date: Fri, 8 Jun 2018 16:32:28 +0530 Subject: perf script: Show hw-cache events 'perf script' fails to report hardware cache events (PERF_TYPE_HW_CACHE) where as 'perf report' shows the samples. Fix it. Ex, # perf record -e L1-dcache-loads ./a.out [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.008 MB perf.data (11 samples)] Before patch: # perf script | wc -l 0 After patch: # perf script | wc -l 11 Committer testing: [root@jouet ~]# perf script | head -30 | tail Timer 9803 [2] 8.963330: 1554 L1-dcache-loads: 7ffef89baae4 __vdso_clock_gettime+0xf4 ([vdso]) swapper 0 [2] 8.963343: 5626 L1-dcache-loads: ffffffffa66f4f6b cpuidle_not_av+0xb (/lib/modules/4.17.0-rc5/build/vmlinux) firefox 4853 [2] 8.964070: 18935 L1-dcache-loads: 7f0b9a00dc30 xcb_poll_for_event+0x0 (/usr/lib64/libxcb.so.1.1.0) Softwar~cTh 4928 [2] 8.964548: 15928 L1-dcache-loads: ffffffffa60d795c update_curr+0x10c (/lib/modules/4.17.0-rc5/build/vmlinux) firefox 4853 [2] 8.964675: 14978 L1-dcache-loads: ffffffffa6897018 mutex_unlock+0x18 (/lib/modules/4.17.0-rc5/build/vmlinux) gnome-shell 2026 [3] 8.964693: 50670 L1-dcache-loads: 7fa08854de6d g_source_iter_next+0x6d (/usr/lib64/libglib-2.0.so.0.5400.3) Compositor 4929 [1] 8.964784: 71772 L1-dcache-loads: 7f0b936bf078 [unknown] (/usr/lib64/firefox/libxul.so) Xwayland 2096 [2] 8.964919: 16799 L1-dcache-loads: 7f68ce2fcb8a glXGetCurrentContext+0x1a (/usr/lib64/libGLX.so.0.0.0) gnome-shell 2026 [3] 8.964997: 50670 L1-dcache-loads: 7fa08854de6d g_source_iter_next+0x6d (/usr/lib64/libglib-2.0.so.0.5400.3) [root@jouet ~]# Signed-off-by: Seeteena Thoufeek Tested-by: Arnaldo Carvalho de Melo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1528455748-20087-1-git-send-email-s1seetee@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-script.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index b3bf35512d21..a31d7082188e 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -180,6 +180,18 @@ static struct { PERF_OUTPUT_EVNAME | PERF_OUTPUT_TRACE }, + [PERF_TYPE_HW_CACHE] = { + .user_set = false, + + .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID | + PERF_OUTPUT_CPU | PERF_OUTPUT_TIME | + PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP | + PERF_OUTPUT_SYM | PERF_OUTPUT_SYMOFFSET | + PERF_OUTPUT_DSO | PERF_OUTPUT_PERIOD, + + .invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT, + }, + [PERF_TYPE_RAW] = { .user_set = false, -- cgit v1.2.3 From 3e84c7651dde7cca43c5cfd7385086599cce5a5d Mon Sep 17 00:00:00 2001 From: Nicolas Boichat Date: Fri, 8 Jun 2018 08:14:50 +0800 Subject: HID: google: Add support for whiskers Another device in the hammer class, with USB id 0x5030. Signed-off-by: Nicolas Boichat Acked-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-google-hammer.c | 2 ++ drivers/hid/hid-ids.h | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/hid/hid-google-hammer.c b/drivers/hid/hid-google-hammer.c index 7b8e17b03cb8..6bf4da7ad63a 100644 --- a/drivers/hid/hid-google-hammer.c +++ b/drivers/hid/hid-google-hammer.c @@ -124,6 +124,8 @@ static const struct hid_device_id hammer_devices[] = { USB_VENDOR_ID_GOOGLE, USB_DEVICE_ID_GOOGLE_STAFF) }, { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, USB_VENDOR_ID_GOOGLE, USB_DEVICE_ID_GOOGLE_WAND) }, + { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, + USB_VENDOR_ID_GOOGLE, USB_DEVICE_ID_GOOGLE_WHISKERS) }, { } }; MODULE_DEVICE_TABLE(hid, hammer_devices); diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index a85634fe033f..c7981ddd8776 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -452,6 +452,7 @@ #define USB_DEVICE_ID_GOOGLE_TOUCH_ROSE 0x5028 #define USB_DEVICE_ID_GOOGLE_STAFF 0x502b #define USB_DEVICE_ID_GOOGLE_WAND 0x502d +#define USB_DEVICE_ID_GOOGLE_WHISKERS 0x5030 #define USB_VENDOR_ID_GOTOP 0x08f2 #define USB_DEVICE_ID_SUPER_Q2 0x007f -- cgit v1.2.3 From 828d810550abc1fffff9b20545fec4bc150d5e82 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Thu, 7 Jun 2018 02:32:52 -0400 Subject: IB/rxe: avoid double kfree skb In rxe_send, when network_type is not RDMA_NETWORK_IPV4 or RDMA_NETWORK_IPV6, skb is freed and -EINVAL is returned. Then rxe_xmit_packet will return -EINVAL, too. In rxe_requester, this skb is double freed. In rxe_requester, kfree_skb is needed only after fill_packet fails. So kfree_skb is moved from label err to test fill_packet. Fixes: 5793b4652155 ("IB/rxe: remove unnecessary skb_clone in xmit") Reported-by: Dan Carpenter Signed-off-by: Zhu Yanjun Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_req.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index f30eeba3f772..829ecb93661f 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -709,6 +709,7 @@ next_wqe: if (fill_packet(qp, wqe, &pkt, skb, payload)) { pr_debug("qp#%d Error during fill packet\n", qp_num(qp)); + kfree_skb(skb); goto err; } @@ -740,7 +741,6 @@ next_wqe: goto next_wqe; err: - kfree_skb(skb); wqe->status = IB_WC_LOC_PROT_ERR; wqe->state = wqe_state_error; __rxe_do_task(&qp->comp.task); -- cgit v1.2.3 From 299eafee39a22a9d9a7c19ae592b230bd199f259 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 Jun 2018 14:19:15 -0500 Subject: IB/mlx5: Fix memory leak in mlx5_ib_create_flow In case memory resources for *ucmd* were allocated, release them before return. Addresses-Coverity-ID: 1469857 ("Resource leak") Fixes: 3b3233fbf02e ("IB/mlx5: Add flow counters binding support") Signed-off-by: Gustavo A. R. Silva Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 3544150f3469..f93b30060ca7 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3546,29 +3546,35 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, return ERR_PTR(-ENOMEM); err = ib_copy_from_udata(ucmd, udata, required_ucmd_sz); - if (err) { - kfree(ucmd); - return ERR_PTR(err); - } + if (err) + goto free_ucmd; } - if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) - return ERR_PTR(-ENOMEM); + if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) { + err = -ENOMEM; + goto free_ucmd; + } if (domain != IB_FLOW_DOMAIN_USER || flow_attr->port > dev->num_ports || (flow_attr->flags & ~(IB_FLOW_ATTR_FLAGS_DONT_TRAP | - IB_FLOW_ATTR_FLAGS_EGRESS))) - return ERR_PTR(-EINVAL); + IB_FLOW_ATTR_FLAGS_EGRESS))) { + err = -EINVAL; + goto free_ucmd; + } if (is_egress && (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || - flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) - return ERR_PTR(-EINVAL); + flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) { + err = -EINVAL; + goto free_ucmd; + } dst = kzalloc(sizeof(*dst), GFP_KERNEL); - if (!dst) - return ERR_PTR(-ENOMEM); + if (!dst) { + err = -ENOMEM; + goto free_ucmd; + } mutex_lock(&dev->flow_db->lock); @@ -3637,8 +3643,8 @@ destroy_ft: unlock: mutex_unlock(&dev->flow_db->lock); kfree(dst); +free_ucmd: kfree(ucmd); - kfree(handler); return ERR_PTR(err); } -- cgit v1.2.3 From e31abf76f4d4d3202ca16b9668b11178df23d473 Mon Sep 17 00:00:00 2001 From: "weiyongjun (A)" Date: Thu, 7 Jun 2018 01:47:41 +0000 Subject: IB/mlx5: Fix return value check in flow_counters_set_data() In case of error, the function mlx5_fc_create() returns ERR_PTR() and never returns NULL. The NULL test in the return value check should be replaced with IS_ERR(). Fixes: 3b3233fbf02e ("IB/mlx5: Add flow counters binding support") Signed-off-by: Wei Yongjun Acked-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index f93b30060ca7..645fc69997bc 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3199,8 +3199,8 @@ static int flow_counters_set_data(struct ib_counters *ibcounters, if (!mcounters->hw_cntrs_hndl) { mcounters->hw_cntrs_hndl = mlx5_fc_create( to_mdev(ibcounters->device)->mdev, false); - if (!mcounters->hw_cntrs_hndl) { - ret = -ENOMEM; + if (IS_ERR(mcounters->hw_cntrs_hndl)) { + ret = PTR_ERR(mcounters->hw_cntrs_hndl); goto free; } hw_hndl = true; -- cgit v1.2.3 From 425cf5c1350a98b81f3ddda160b99c3be613a213 Mon Sep 17 00:00:00 2001 From: "Kalderon, Michal" Date: Mon, 11 Jun 2018 10:20:20 +0300 Subject: RDMA/qedr: Fix NULL pointer dereference when running over iWARP without RDMA-CM Some RoCE specific code in qedr_modify_qp was run over an iWARP device when running perftest benchmarks without the -R option. The commit 3e44e0ee0893 ("IB/providers: Avoid null netdev check for RoCE") exposed this. Dropping the check for NULL pointer on ndev in qedr_modify_qp lead to a null pointer dereference when running over iWARP. Before the code would identify ndev as being NULL and return an error. Fixes: 3e44e0ee0893 ("IB/providers: Avoid null netdev check for RoCE") Signed-off-by: Ariel Elior Signed-off-by: Michal Kalderon Reviewed-by: Parav Pandit Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qedr/verbs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 614a954d0757..f9b198455fc9 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -1957,6 +1957,9 @@ int qedr_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, } if (attr_mask & (IB_QP_AV | IB_QP_PATH_MTU)) { + if (rdma_protocol_iwarp(&dev->ibdev, 1)) + return -EINVAL; + if (attr_mask & IB_QP_PATH_MTU) { if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) { -- cgit v1.2.3 From 3dc7c7badb7502ec3e3aa817a8bdd9e53aa54c52 Mon Sep 17 00:00:00 2001 From: Christophe Jaillet Date: Mon, 11 Jun 2018 20:15:11 +0200 Subject: IB/mlx4: Fix an error handling path in 'mlx4_ib_rereg_user_mr()' Before returning -EPERM we should release some resources, as already done in the other error handling path of the function. Fixes: d8f9cc328c88 ("IB/mlx4: Mark user MR as writable if actual virtual memory is writable") Signed-off-by: Christophe JAILLET Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/mr.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index ed1f253faf97..c7c85c22e4e3 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -486,8 +486,11 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, } if (flags & IB_MR_REREG_ACCESS) { - if (ib_access_writable(mr_access_flags) && !mmr->umem->writable) - return -EPERM; + if (ib_access_writable(mr_access_flags) && + !mmr->umem->writable) { + err = -EPERM; + goto release_mpt_entry; + } err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry, convert_access(mr_access_flags)); -- cgit v1.2.3 From 5d902372ba5f416261c79123f02e49c664c7118f Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Tue, 12 Jun 2018 12:02:56 +0200 Subject: xsk: re-add queue id check for XDP_SKB path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 173d3adb6f43 ("xsk: add zero-copy support for Rx") introduced a regression on the XDP_SKB receive path, when the queue id checks were removed. Now, they are back again. Fixes: 173d3adb6f43 ("xsk: add zero-copy support for Rx") Reported-by: Qi Zhang Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 36919a254ba3..3b3410ada097 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -118,6 +118,9 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) u64 addr; int err; + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) + return -EINVAL; + if (!xskq_peek_addr(xs->umem->fq, &addr) || len > xs->umem->chunk_size_nohr) { xs->rx_dropped++; -- cgit v1.2.3 From 7352c5469307395875f4b62f366a369ae1c46e83 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Wed, 23 May 2018 16:47:36 +0200 Subject: s390/cio: sanitize css_general_characteristics definition Change css_general_characteristics such that the bitfields don't straddle storage-unit boundaries of the base types. This does not change the offsets of the structs members but now we do as documented and also fix the following sparse complaint: drivers/s390/cio/chsc.c:926:56: warning: invalid access past the end of 'css_general_characteristics' (16 18) Signed-off-by: Sebastian Ott Reviewed-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/css_chars.h | 62 ++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/arch/s390/include/asm/css_chars.h b/arch/s390/include/asm/css_chars.h index 0563fd3e8458..480bb02ccacd 100644 --- a/arch/s390/include/asm/css_chars.h +++ b/arch/s390/include/asm/css_chars.h @@ -6,36 +6,38 @@ struct css_general_char { u64 : 12; - u32 dynio : 1; /* bit 12 */ - u32 : 4; - u32 eadm : 1; /* bit 17 */ - u32 : 23; - u32 aif : 1; /* bit 41 */ - u32 : 3; - u32 mcss : 1; /* bit 45 */ - u32 fcs : 1; /* bit 46 */ - u32 : 1; - u32 ext_mb : 1; /* bit 48 */ - u32 : 7; - u32 aif_tdd : 1; /* bit 56 */ - u32 : 1; - u32 qebsm : 1; /* bit 58 */ - u32 : 2; - u32 aiv : 1; /* bit 61 */ - u32 : 5; - u32 aif_osa : 1; /* bit 67 */ - u32 : 12; - u32 eadm_rf : 1; /* bit 80 */ - u32 : 1; - u32 cib : 1; /* bit 82 */ - u32 : 5; - u32 fcx : 1; /* bit 88 */ - u32 : 19; - u32 alt_ssi : 1; /* bit 108 */ - u32 : 1; - u32 narf : 1; /* bit 110 */ - u32 : 12; - u32 util_str : 1;/* bit 123 */ + u64 dynio : 1; /* bit 12 */ + u64 : 4; + u64 eadm : 1; /* bit 17 */ + u64 : 23; + u64 aif : 1; /* bit 41 */ + u64 : 3; + u64 mcss : 1; /* bit 45 */ + u64 fcs : 1; /* bit 46 */ + u64 : 1; + u64 ext_mb : 1; /* bit 48 */ + u64 : 7; + u64 aif_tdd : 1; /* bit 56 */ + u64 : 1; + u64 qebsm : 1; /* bit 58 */ + u64 : 2; + u64 aiv : 1; /* bit 61 */ + u64 : 2; + + u64 : 3; + u64 aif_osa : 1; /* bit 67 */ + u64 : 12; + u64 eadm_rf : 1; /* bit 80 */ + u64 : 1; + u64 cib : 1; /* bit 82 */ + u64 : 5; + u64 fcx : 1; /* bit 88 */ + u64 : 19; + u64 alt_ssi : 1; /* bit 108 */ + u64 : 1; + u64 narf : 1; /* bit 110 */ + u64 : 12; + u64 util_str : 1;/* bit 123 */ } __packed; extern struct css_general_char css_general_characteristics; -- cgit v1.2.3 From 5c618c0cf451f1d9746296b0d30c84af1bce3604 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 24 May 2018 12:18:58 +0200 Subject: s390/dasd: simplify locking in process_final_queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify locking in __dasd_device_process_final_queue to fix the following sparse warning: drivers/s390/block/dasd.c:1902:9: warning: context imbalance in '__dasd_device_process_final_queue' - different lock contexts for basic block Signed-off-by: Sebastian Ott Reviewed-by: Stefan Haberland Reviewed-by: Jan Höppner Signed-off-by: Martin Schwidefsky --- drivers/s390/block/dasd.c | 59 +++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 73cce3ecb97f..790b1fa3fec0 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1885,6 +1885,33 @@ static void __dasd_device_process_ccw_queue(struct dasd_device *device, } } +static void __dasd_process_cqr(struct dasd_device *device, + struct dasd_ccw_req *cqr) +{ + char errorstring[ERRORLENGTH]; + + switch (cqr->status) { + case DASD_CQR_SUCCESS: + cqr->status = DASD_CQR_DONE; + break; + case DASD_CQR_ERROR: + cqr->status = DASD_CQR_NEED_ERP; + break; + case DASD_CQR_CLEARED: + cqr->status = DASD_CQR_TERMINATED; + break; + default: + /* internal error 12 - wrong cqr status*/ + snprintf(errorstring, ERRORLENGTH, "12 %p %x02", cqr, cqr->status); + dev_err(&device->cdev->dev, + "An error occurred in the DASD device driver, " + "reason=%s\n", errorstring); + BUG(); + } + if (cqr->callback) + cqr->callback(cqr, cqr->callback_data); +} + /* * the cqrs from the final queue are returned to the upper layer * by setting a dasd_block state and calling the callback function @@ -1895,40 +1922,18 @@ static void __dasd_device_process_final_queue(struct dasd_device *device, struct list_head *l, *n; struct dasd_ccw_req *cqr; struct dasd_block *block; - void (*callback)(struct dasd_ccw_req *, void *data); - void *callback_data; - char errorstring[ERRORLENGTH]; list_for_each_safe(l, n, final_queue) { cqr = list_entry(l, struct dasd_ccw_req, devlist); list_del_init(&cqr->devlist); block = cqr->block; - callback = cqr->callback; - callback_data = cqr->callback_data; - if (block) + if (!block) { + __dasd_process_cqr(device, cqr); + } else { spin_lock_bh(&block->queue_lock); - switch (cqr->status) { - case DASD_CQR_SUCCESS: - cqr->status = DASD_CQR_DONE; - break; - case DASD_CQR_ERROR: - cqr->status = DASD_CQR_NEED_ERP; - break; - case DASD_CQR_CLEARED: - cqr->status = DASD_CQR_TERMINATED; - break; - default: - /* internal error 12 - wrong cqr status*/ - snprintf(errorstring, ERRORLENGTH, "12 %p %x02", cqr, cqr->status); - dev_err(&device->cdev->dev, - "An error occurred in the DASD device driver, " - "reason=%s\n", errorstring); - BUG(); - } - if (cqr->callback != NULL) - (callback)(cqr, callback_data); - if (block) + __dasd_process_cqr(device, cqr); spin_unlock_bh(&block->queue_lock); + } } } -- cgit v1.2.3 From c5205f2ff2bec6acf398211aed66b3e6ac44eee6 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Mon, 4 Jun 2018 19:07:39 +0200 Subject: s390/dasd: move dasd_ccw_req to per request data Let the block layer allocate per request data to store struct dasd_ccw_req. We still need extra preallocated memory for usage by ccw programs (which vary in length) and for requests which don't originate from the block layer. Link: https://lkml.kernel.org/r/20180530074130.GA6927@infradead.org Signed-off-by: Sebastian Ott Reviewed-by: Stefan Haberland Signed-off-by: Martin Schwidefsky --- drivers/s390/block/dasd.c | 46 ++++++++++++++++++++++-------------------- drivers/s390/block/dasd_diag.c | 3 ++- drivers/s390/block/dasd_eckd.c | 46 +++++++++++++++++++++++------------------- drivers/s390/block/dasd_fba.c | 6 ++++-- drivers/s390/block/dasd_int.h | 3 ++- 5 files changed, 57 insertions(+), 47 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 790b1fa3fec0..01a1d1dabb43 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1267,35 +1267,37 @@ struct dasd_ccw_req *dasd_kmalloc_request(int magic, int cplength, } EXPORT_SYMBOL(dasd_kmalloc_request); -struct dasd_ccw_req *dasd_smalloc_request(int magic, int cplength, - int datasize, - struct dasd_device *device) +struct dasd_ccw_req *dasd_smalloc_request(int magic, int cplength, int datasize, + struct dasd_device *device, + struct dasd_ccw_req *cqr) { unsigned long flags; - struct dasd_ccw_req *cqr; - char *data; - int size; + char *data, *chunk; + int size = 0; - size = (sizeof(struct dasd_ccw_req) + 7L) & -8L; if (cplength > 0) size += cplength * sizeof(struct ccw1); if (datasize > 0) size += datasize; + if (!cqr) + size += (sizeof(*cqr) + 7L) & -8L; + spin_lock_irqsave(&device->mem_lock, flags); - cqr = (struct dasd_ccw_req *) - dasd_alloc_chunk(&device->ccw_chunks, size); + data = chunk = dasd_alloc_chunk(&device->ccw_chunks, size); spin_unlock_irqrestore(&device->mem_lock, flags); - if (cqr == NULL) + if (!chunk) return ERR_PTR(-ENOMEM); - memset(cqr, 0, sizeof(struct dasd_ccw_req)); - data = (char *) cqr + ((sizeof(struct dasd_ccw_req) + 7L) & -8L); - cqr->cpaddr = NULL; + if (!cqr) { + cqr = (void *) data; + data += (sizeof(*cqr) + 7L) & -8L; + } + memset(cqr, 0, sizeof(*cqr)); + cqr->mem_chunk = chunk; if (cplength > 0) { - cqr->cpaddr = (struct ccw1 *) data; - data += cplength*sizeof(struct ccw1); - memset(cqr->cpaddr, 0, cplength*sizeof(struct ccw1)); + cqr->cpaddr = data; + data += cplength * sizeof(struct ccw1); + memset(cqr->cpaddr, 0, cplength * sizeof(struct ccw1)); } - cqr->data = NULL; if (datasize > 0) { cqr->data = data; memset(cqr->data, 0, datasize); @@ -1333,7 +1335,7 @@ void dasd_sfree_request(struct dasd_ccw_req *cqr, struct dasd_device *device) unsigned long flags; spin_lock_irqsave(&device->mem_lock, flags); - dasd_free_chunk(&device->ccw_chunks, cqr); + dasd_free_chunk(&device->ccw_chunks, cqr->mem_chunk); spin_unlock_irqrestore(&device->mem_lock, flags); dasd_put_device(device); } @@ -3046,7 +3048,6 @@ static blk_status_t do_dasd_request(struct blk_mq_hw_ctx *hctx, cqr->callback_data = req; cqr->status = DASD_CQR_FILLED; cqr->dq = dq; - *((struct dasd_ccw_req **) blk_mq_rq_to_pdu(req)) = cqr; blk_mq_start_request(req); spin_lock(&block->queue_lock); @@ -3077,7 +3078,7 @@ enum blk_eh_timer_return dasd_times_out(struct request *req, bool reserved) unsigned long flags; int rc = 0; - cqr = *((struct dasd_ccw_req **) blk_mq_rq_to_pdu(req)); + cqr = blk_mq_rq_to_pdu(req); if (!cqr) return BLK_EH_DONE; @@ -3179,7 +3180,7 @@ static int dasd_alloc_queue(struct dasd_block *block) int rc; block->tag_set.ops = &dasd_mq_ops; - block->tag_set.cmd_size = sizeof(struct dasd_ccw_req *); + block->tag_set.cmd_size = sizeof(struct dasd_ccw_req); block->tag_set.nr_hw_queues = DASD_NR_HW_QUEUES; block->tag_set.queue_depth = DASD_MAX_LCU_DEV * DASD_REQ_PER_DEV; block->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; @@ -4043,7 +4044,8 @@ static struct dasd_ccw_req *dasd_generic_build_rdc(struct dasd_device *device, struct ccw1 *ccw; unsigned long *idaw; - cqr = dasd_smalloc_request(magic, 1 /* RDC */, rdc_buffer_size, device); + cqr = dasd_smalloc_request(magic, 1 /* RDC */, rdc_buffer_size, device, + NULL); if (IS_ERR(cqr)) { /* internal error 13 - Allocating the RDC request failed*/ diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c index 131f1989f6f3..e1fe02477ea8 100644 --- a/drivers/s390/block/dasd_diag.c +++ b/drivers/s390/block/dasd_diag.c @@ -536,7 +536,8 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev, /* Build the request */ datasize = sizeof(struct dasd_diag_req) + count*sizeof(struct dasd_diag_bio); - cqr = dasd_smalloc_request(DASD_DIAG_MAGIC, 0, datasize, memdev); + cqr = dasd_smalloc_request(DASD_DIAG_MAGIC, 0, datasize, memdev, + blk_mq_rq_to_pdu(req)); if (IS_ERR(cqr)) return cqr; diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index be208e7adcb4..bbf95b78ef5d 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -886,7 +886,7 @@ static int dasd_eckd_read_conf_lpm(struct dasd_device *device, } cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1 /* RCD */, 0, /* use rcd_buf as data ara */ - device); + device, NULL); if (IS_ERR(cqr)) { DBF_DEV_EVENT(DBF_WARNING, device, "%s", "Could not allocate RCD request"); @@ -1442,7 +1442,7 @@ static int dasd_eckd_read_features(struct dasd_device *device) cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1 /* PSF */ + 1 /* RSSD */, (sizeof(struct dasd_psf_prssd_data) + sizeof(struct dasd_rssd_features)), - device); + device, NULL); if (IS_ERR(cqr)) { DBF_EVENT_DEVID(DBF_WARNING, device->cdev, "%s", "Could not " "allocate initialization request"); @@ -1504,7 +1504,7 @@ static struct dasd_ccw_req *dasd_eckd_build_psf_ssc(struct dasd_device *device, cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1 /* PSF */ , sizeof(struct dasd_psf_ssc_data), - device); + device, NULL); if (IS_ERR(cqr)) { DBF_DEV_EVENT(DBF_WARNING, device, "%s", @@ -1815,7 +1815,8 @@ dasd_eckd_analysis_ccw(struct dasd_device *device) cplength = 8; datasize = sizeof(struct DE_eckd_data) + 2*sizeof(struct LO_eckd_data); - cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, cplength, datasize, device); + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, cplength, datasize, device, + NULL); if (IS_ERR(cqr)) return cqr; ccw = cqr->cpaddr; @@ -2092,7 +2093,8 @@ dasd_eckd_build_check_tcw(struct dasd_device *base, struct format_data_t *fdata, */ itcw_size = itcw_calc_size(0, count, 0); - cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 0, itcw_size, startdev); + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 0, itcw_size, startdev, + NULL); if (IS_ERR(cqr)) return cqr; @@ -2186,7 +2188,7 @@ dasd_eckd_build_check(struct dasd_device *base, struct format_data_t *fdata, cplength += count; cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, cplength, datasize, - startdev); + startdev, NULL); if (IS_ERR(cqr)) return cqr; @@ -2332,7 +2334,7 @@ dasd_eckd_build_format(struct dasd_device *base, } /* Allocate the format ccw request. */ fcp = dasd_smalloc_request(DASD_ECKD_MAGIC, cplength, - datasize, startdev); + datasize, startdev, NULL); if (IS_ERR(fcp)) return fcp; @@ -3103,7 +3105,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_single( } /* Allocate the ccw request. */ cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, cplength, datasize, - startdev); + startdev, blk_mq_rq_to_pdu(req)); if (IS_ERR(cqr)) return cqr; ccw = cqr->cpaddr; @@ -3262,7 +3264,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_cmd_track( /* Allocate the ccw request. */ cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, cplength, datasize, - startdev); + startdev, blk_mq_rq_to_pdu(req)); if (IS_ERR(cqr)) return cqr; ccw = cqr->cpaddr; @@ -3595,7 +3597,8 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_tpm_track( /* Allocate the ccw request. */ itcw_size = itcw_calc_size(0, ctidaw, 0); - cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 0, itcw_size, startdev); + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 0, itcw_size, startdev, + blk_mq_rq_to_pdu(req)); if (IS_ERR(cqr)) return cqr; @@ -3862,7 +3865,7 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_raw(struct dasd_device *startdev, /* Allocate the ccw request. */ cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, cplength, - datasize, startdev); + datasize, startdev, blk_mq_rq_to_pdu(req)); if (IS_ERR(cqr)) return cqr; @@ -4102,7 +4105,7 @@ dasd_eckd_release(struct dasd_device *device) return -EACCES; useglobal = 0; - cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1, 32, device); + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1, 32, device, NULL); if (IS_ERR(cqr)) { mutex_lock(&dasd_reserve_mutex); useglobal = 1; @@ -4157,7 +4160,7 @@ dasd_eckd_reserve(struct dasd_device *device) return -EACCES; useglobal = 0; - cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1, 32, device); + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1, 32, device, NULL); if (IS_ERR(cqr)) { mutex_lock(&dasd_reserve_mutex); useglobal = 1; @@ -4211,7 +4214,7 @@ dasd_eckd_steal_lock(struct dasd_device *device) return -EACCES; useglobal = 0; - cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1, 32, device); + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1, 32, device, NULL); if (IS_ERR(cqr)) { mutex_lock(&dasd_reserve_mutex); useglobal = 1; @@ -4271,7 +4274,8 @@ static int dasd_eckd_snid(struct dasd_device *device, useglobal = 0; cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1, - sizeof(struct dasd_snid_data), device); + sizeof(struct dasd_snid_data), device, + NULL); if (IS_ERR(cqr)) { mutex_lock(&dasd_reserve_mutex); useglobal = 1; @@ -4331,7 +4335,7 @@ dasd_eckd_performance(struct dasd_device *device, void __user *argp) cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1 /* PSF */ + 1 /* RSSD */, (sizeof(struct dasd_psf_prssd_data) + sizeof(struct dasd_rssd_perf_stats_t)), - device); + device, NULL); if (IS_ERR(cqr)) { DBF_DEV_EVENT(DBF_WARNING, device, "%s", "Could not allocate initialization request"); @@ -4477,7 +4481,7 @@ static int dasd_symm_io(struct dasd_device *device, void __user *argp) psf1 = psf_data[1]; /* setup CCWs for PSF + RSSD */ - cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 2 , 0, device); + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 2, 0, device, NULL); if (IS_ERR(cqr)) { DBF_DEV_EVENT(DBF_WARNING, device, "%s", "Could not allocate initialization request"); @@ -5037,7 +5041,7 @@ static int dasd_eckd_read_message_buffer(struct dasd_device *device, cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1 /* PSF */ + 1 /* RSSD */, (sizeof(struct dasd_psf_prssd_data) + sizeof(struct dasd_rssd_messages)), - device); + device, NULL); if (IS_ERR(cqr)) { DBF_EVENT_DEVID(DBF_WARNING, device->cdev, "%s", "Could not allocate read message buffer request"); @@ -5126,7 +5130,7 @@ static int dasd_eckd_query_host_access(struct dasd_device *device, cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1 /* PSF */ + 1 /* RSSD */, sizeof(struct dasd_psf_prssd_data) + 1, - device); + device, NULL); if (IS_ERR(cqr)) { DBF_EVENT_DEVID(DBF_WARNING, device->cdev, "%s", "Could not allocate read message buffer request"); @@ -5284,8 +5288,8 @@ dasd_eckd_psf_cuir_response(struct dasd_device *device, int response, int rc; cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1 /* PSF */ , - sizeof(struct dasd_psf_cuir_response), - device); + sizeof(struct dasd_psf_cuir_response), + device, NULL); if (IS_ERR(cqr)) { DBF_DEV_EVENT(DBF_WARNING, device, "%s", diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index a6b132f7e869..56007a3e7f11 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -356,7 +356,8 @@ static struct dasd_ccw_req *dasd_fba_build_cp_discard( datasize = sizeof(struct DE_fba_data) + nr_ccws * (sizeof(struct LO_fba_data) + sizeof(struct ccw1)); - cqr = dasd_smalloc_request(DASD_FBA_MAGIC, cplength, datasize, memdev); + cqr = dasd_smalloc_request(DASD_FBA_MAGIC, cplength, datasize, memdev, + blk_mq_rq_to_pdu(req)); if (IS_ERR(cqr)) return cqr; @@ -490,7 +491,8 @@ static struct dasd_ccw_req *dasd_fba_build_cp_regular( datasize += (count - 1)*sizeof(struct LO_fba_data); } /* Allocate the ccw request. */ - cqr = dasd_smalloc_request(DASD_FBA_MAGIC, cplength, datasize, memdev); + cqr = dasd_smalloc_request(DASD_FBA_MAGIC, cplength, datasize, memdev, + blk_mq_rq_to_pdu(req)); if (IS_ERR(cqr)) return cqr; ccw = cqr->cpaddr; diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 96709b1a7bf8..0844e5e2f566 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -184,6 +184,7 @@ struct dasd_ccw_req { struct irb irb; /* device status in case of an error */ struct dasd_ccw_req *refers; /* ERP-chain queueing. */ void *function; /* originating ERP action */ + void *mem_chunk; /* these are for statistics only */ unsigned long buildclk; /* TOD-clock of request generation */ @@ -716,7 +717,7 @@ extern struct kmem_cache *dasd_page_cache; struct dasd_ccw_req * dasd_kmalloc_request(int , int, int, struct dasd_device *); struct dasd_ccw_req * -dasd_smalloc_request(int , int, int, struct dasd_device *); +dasd_smalloc_request(int, int, int, struct dasd_device *, struct dasd_ccw_req *); void dasd_kfree_request(struct dasd_ccw_req *, struct dasd_device *); void dasd_sfree_request(struct dasd_ccw_req *, struct dasd_device *); void dasd_wakeup_cb(struct dasd_ccw_req *, void *); -- cgit v1.2.3 From d8a72d414baf217a2eea9c73f3aac11052161015 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 5 Jun 2018 13:34:42 +0200 Subject: s390/dasd: remove dasd_kmalloc_set_cda There is no user of this function. Just remove it. Signed-off-by: Sebastian Ott Reviewed-by: Stefan Haberland Signed-off-by: Martin Schwidefsky --- drivers/s390/block/dasd_int.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 0844e5e2f566..885e7416c368 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -722,12 +722,6 @@ void dasd_kfree_request(struct dasd_ccw_req *, struct dasd_device *); void dasd_sfree_request(struct dasd_ccw_req *, struct dasd_device *); void dasd_wakeup_cb(struct dasd_ccw_req *, void *); -static inline int -dasd_kmalloc_set_cda(struct ccw1 *ccw, void *cda, struct dasd_device *device) -{ - return set_normalized_cda(ccw, cda); -} - struct dasd_device *dasd_alloc_device(void); void dasd_free_device(struct dasd_device *); -- cgit v1.2.3 From 61d388321032be9097935bbc5efdd6ac42691ed4 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Mon, 4 Jun 2018 19:39:38 +0200 Subject: s390/dasd: reshuffle struct dasd_ccw_req Move some members of struct dasd_ccw_req to get rid of padding bytes. This saves 16 bytes per dasd request. Signed-off-by: Sebastian Ott Reviewed-by: Stefan Haberland Signed-off-by: Martin Schwidefsky --- drivers/s390/block/dasd_int.h | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 885e7416c368..55bcbbed1b1d 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -158,41 +158,33 @@ do { \ struct dasd_ccw_req { unsigned int magic; /* Eye catcher */ + int intrc; /* internal error, e.g. from start_IO */ struct list_head devlist; /* for dasd_device request queue */ struct list_head blocklist; /* for dasd_block request queue */ - - /* Where to execute what... */ struct dasd_block *block; /* the originating block device */ struct dasd_device *memdev; /* the device used to allocate this */ struct dasd_device *startdev; /* device the request is started on */ struct dasd_device *basedev; /* base device if no block->base */ void *cpaddr; /* address of ccw or tcw */ + short retries; /* A retry counter */ unsigned char cpmode; /* 0 = cmd mode, 1 = itcw */ char status; /* status of this request */ - short retries; /* A retry counter */ + char lpm; /* logical path mask */ unsigned long flags; /* flags of this request */ struct dasd_queue *dq; - - /* ... and how */ unsigned long starttime; /* jiffies time of request start */ unsigned long expires; /* expiration period in jiffies */ - char lpm; /* logical path mask */ void *data; /* pointer to data area */ - - /* these are important for recovering erroneous requests */ - int intrc; /* internal error, e.g. from start_IO */ struct irb irb; /* device status in case of an error */ struct dasd_ccw_req *refers; /* ERP-chain queueing. */ void *function; /* originating ERP action */ void *mem_chunk; - /* these are for statistics only */ unsigned long buildclk; /* TOD-clock of request generation */ unsigned long startclk; /* TOD-clock of request start */ unsigned long stopclk; /* TOD-clock of request interrupt */ unsigned long endclk; /* TOD-clock of request termination */ - /* Callback that is called after reaching final status. */ void (*callback)(struct dasd_ccw_req *, void *data); void *callback_data; }; -- cgit v1.2.3 From ec530174c43798099d305fbd6511e5d7fc7616d4 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Mon, 4 Jun 2018 19:18:03 +0200 Subject: s390/dasd: only use preallocated requests Change the remaining users of dasd_kmalloc_request to use preallocated memory and remove this function. Signed-off-by: Sebastian Ott Reviewed-by: Stefan Haberland Signed-off-by: Martin Schwidefsky --- drivers/s390/block/dasd.c | 66 ----------------------------------------- drivers/s390/block/dasd_alias.c | 6 ++-- drivers/s390/block/dasd_eer.c | 10 +++---- drivers/s390/block/dasd_int.h | 3 -- 4 files changed, 8 insertions(+), 77 deletions(-) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 01a1d1dabb43..d3a38c421503 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1222,51 +1222,6 @@ static void dasd_hosts_init(struct dentry *base_dentry, device->hosts_dentry = pde; } -/* - * Allocate memory for a channel program with 'cplength' channel - * command words and 'datasize' additional space. There are two - * variantes: 1) dasd_kmalloc_request uses kmalloc to get the needed - * memory and 2) dasd_smalloc_request uses the static ccw memory - * that gets allocated for each device. - */ -struct dasd_ccw_req *dasd_kmalloc_request(int magic, int cplength, - int datasize, - struct dasd_device *device) -{ - struct dasd_ccw_req *cqr; - - /* Sanity checks */ - BUG_ON(datasize > PAGE_SIZE || - (cplength*sizeof(struct ccw1)) > PAGE_SIZE); - - cqr = kzalloc(sizeof(struct dasd_ccw_req), GFP_ATOMIC); - if (cqr == NULL) - return ERR_PTR(-ENOMEM); - cqr->cpaddr = NULL; - if (cplength > 0) { - cqr->cpaddr = kcalloc(cplength, sizeof(struct ccw1), - GFP_ATOMIC | GFP_DMA); - if (cqr->cpaddr == NULL) { - kfree(cqr); - return ERR_PTR(-ENOMEM); - } - } - cqr->data = NULL; - if (datasize > 0) { - cqr->data = kzalloc(datasize, GFP_ATOMIC | GFP_DMA); - if (cqr->data == NULL) { - kfree(cqr->cpaddr); - kfree(cqr); - return ERR_PTR(-ENOMEM); - } - } - cqr->magic = magic; - set_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags); - dasd_get_device(device); - return cqr; -} -EXPORT_SYMBOL(dasd_kmalloc_request); - struct dasd_ccw_req *dasd_smalloc_request(int magic, int cplength, int datasize, struct dasd_device *device, struct dasd_ccw_req *cqr) @@ -1309,27 +1264,6 @@ struct dasd_ccw_req *dasd_smalloc_request(int magic, int cplength, int datasize, } EXPORT_SYMBOL(dasd_smalloc_request); -/* - * Free memory of a channel program. This function needs to free all the - * idal lists that might have been created by dasd_set_cda and the - * struct dasd_ccw_req itself. - */ -void dasd_kfree_request(struct dasd_ccw_req *cqr, struct dasd_device *device) -{ - struct ccw1 *ccw; - - /* Clear any idals used for the request. */ - ccw = cqr->cpaddr; - do { - clear_normalized_cda(ccw); - } while (ccw++->flags & (CCW_FLAG_CC | CCW_FLAG_DC)); - kfree(cqr->cpaddr); - kfree(cqr->data); - kfree(cqr); - dasd_put_device(device); -} -EXPORT_SYMBOL(dasd_kfree_request); - void dasd_sfree_request(struct dasd_ccw_req *cqr, struct dasd_device *device) { unsigned long flags; diff --git a/drivers/s390/block/dasd_alias.c b/drivers/s390/block/dasd_alias.c index 5e963fe0e38d..e36a114354fc 100644 --- a/drivers/s390/block/dasd_alias.c +++ b/drivers/s390/block/dasd_alias.c @@ -407,9 +407,9 @@ static int read_unit_address_configuration(struct dasd_device *device, int rc; unsigned long flags; - cqr = dasd_kmalloc_request(DASD_ECKD_MAGIC, 1 /* PSF */ + 1 /* RSSD */, + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1 /* PSF */ + 1 /* RSSD */, (sizeof(struct dasd_psf_prssd_data)), - device); + device, NULL); if (IS_ERR(cqr)) return PTR_ERR(cqr); cqr->startdev = device; @@ -457,7 +457,7 @@ static int read_unit_address_configuration(struct dasd_device *device, lcu->flags |= NEED_UAC_UPDATE; spin_unlock_irqrestore(&lcu->lock, flags); } - dasd_kfree_request(cqr, cqr->memdev); + dasd_sfree_request(cqr, cqr->memdev); return rc; } diff --git a/drivers/s390/block/dasd_eer.c b/drivers/s390/block/dasd_eer.c index fb2c3599d95c..6545342bd43f 100644 --- a/drivers/s390/block/dasd_eer.c +++ b/drivers/s390/block/dasd_eer.c @@ -447,7 +447,7 @@ static void dasd_eer_snss_cb(struct dasd_ccw_req *cqr, void *data) * is a new ccw in device->eer_cqr. Free the "old" * snss request now. */ - dasd_kfree_request(cqr, device); + dasd_sfree_request(cqr, device); } /* @@ -472,8 +472,8 @@ int dasd_eer_enable(struct dasd_device *device) if (rc) goto out; - cqr = dasd_kmalloc_request(DASD_ECKD_MAGIC, 1 /* SNSS */, - SNSS_DATA_SIZE, device); + cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1 /* SNSS */, + SNSS_DATA_SIZE, device, NULL); if (IS_ERR(cqr)) { rc = -ENOMEM; cqr = NULL; @@ -505,7 +505,7 @@ out: spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags); if (cqr) - dasd_kfree_request(cqr, device); + dasd_sfree_request(cqr, device); return rc; } @@ -528,7 +528,7 @@ void dasd_eer_disable(struct dasd_device *device) in_use = test_and_clear_bit(DASD_FLAG_EER_IN_USE, &device->flags); spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags); if (cqr && !in_use) - dasd_kfree_request(cqr, device); + dasd_sfree_request(cqr, device); } /* diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 55bcbbed1b1d..976b6bd4fb05 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -706,11 +706,8 @@ extern const struct block_device_operations dasd_device_operations; extern struct kmem_cache *dasd_page_cache; -struct dasd_ccw_req * -dasd_kmalloc_request(int , int, int, struct dasd_device *); struct dasd_ccw_req * dasd_smalloc_request(int, int, int, struct dasd_device *, struct dasd_ccw_req *); -void dasd_kfree_request(struct dasd_ccw_req *, struct dasd_device *); void dasd_sfree_request(struct dasd_ccw_req *, struct dasd_device *); void dasd_wakeup_cb(struct dasd_ccw_req *, void *); -- cgit v1.2.3 From c60c32a5775615d5456a09527aaa12e4a109c3da Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 4 May 2018 17:25:48 +0200 Subject: posix-cpu-timers: Remove lockdep_assert_irqs_disabled() The lockdep_assert_irqs_disabled() was a BUG_ON() statement in the beginning and it was added just before the "spin_lock(siglock)" statement to ensure this lock was taken with disabled interrupts. This is no longer the case: the siglock is acquired via lock_task_sighand() and this function already disables the interrupts. The lock is also acquired before this "lockdep_assert_irqs_disabled" so it is best to remove it. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Cc: Frederic Weisbecker Link: https://lkml.kernel.org/r20180504152548.7166-1-bigeasy@linutronix.de --- kernel/time/posix-cpu-timers.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 5a6251ac6f7a..9cdf54b04ca8 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -604,7 +604,6 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, /* * Disarm any old timer after extracting its expiry time. */ - lockdep_assert_irqs_disabled(); ret = 0; old_incr = timer->it.cpu.incr; @@ -1049,7 +1048,6 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) /* * Now re-arm for the new expiry time. */ - lockdep_assert_irqs_disabled(); arm_timer(timer); unlock: unlock_task_sighand(p, &flags); -- cgit v1.2.3 From 1eb9364ce81d9445ad6f9d44921a91d2a6597156 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Jun 2018 09:40:23 -0600 Subject: IB/uverbs: Fix ordering of ucontext check in ib_uverbs_write During disassociation the ucontext will become NULL, however due to how the SRCU locking works the ucontext must only be examined after looking at the ib_dev, which governs the RCU control flow. With the wrong ordering userspace will see EINVAL instead of EIO for a disassociated uverbs FD, which breaks rdma-core. Cc: stable@vger.kernel.org Fixes: 491d5c6a3023 ("RDMA/uverbs: Move uncontext check before SRCU read lock") Reported-by: Mark Bloch Signed-off-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky --- drivers/infiniband/core/uverbs_main.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 3ae2339dd27a..2094d136513d 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -736,10 +736,6 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, if (ret) return ret; - if (!file->ucontext && - (command != IB_USER_VERBS_CMD_GET_CONTEXT || extended)) - return -EINVAL; - if (extended) { if (count < (sizeof(hdr) + sizeof(ex_hdr))) return -EINVAL; @@ -759,6 +755,16 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, goto out; } + /* + * Must be after the ib_dev check, as once the RCU clears ib_dev == + * NULL means ucontext == NULL + */ + if (!file->ucontext && + (command != IB_USER_VERBS_CMD_GET_CONTEXT || extended)) { + ret = -EINVAL; + goto out; + } + if (!verify_command_mask(ib_dev, command, extended)) { ret = -EOPNOTSUPP; goto out; -- cgit v1.2.3 From f2ae67941138a1e53cb1bc6a1b5878a8bdc74d26 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 12 Jun 2018 18:16:19 +0200 Subject: alpha: Remove custom dec_and_lock() implementation Alpha provides a custom implementation of dec_and_lock(). The functions is split into two parts: - atomic_add_unless() + return 0 (fast path in assembly) - remaining part including locking (slow path in C) Comparing the result of the alpha implementation with the generic implementation compiled by gcc it looks like the fast path is optimized by avoiding a stack frame (and reloading the GP), register store and all this. This is only done in the slowpath. After marking the slowpath (atomic_dec_and_lock_1()) as "noinline" and doing the slowpath in C (the atomic_add_unless(atomic, -1, 1) part) I noticed differences in the resulting assembly: - the GP is still reloaded - atomic_add_unless() adds more memory barriers compared to the custom assembly - the custom assembly here does "load, sub, beq" while atomic_add_unless() does "load, cmpeq, add, bne". This is okay because it compares against zero after subtraction while the generic code compares against 1 before. I'm not sure if avoiding the stack frame (and GP reloading) brings a lot in terms of performance. Regarding the different barriers, Peter Zijlstra says: |refcount decrement needs to be a RELEASE operation, such that all the |load/stores to the object happen before we decrement the refcount. | |Otherwise things like: | | obj->foo = 5; | refcnt_dec(&obj->ref); | |can be re-ordered, which then allows fun scenarios like: | | CPU0 CPU1 | | refcnt_dec(&obj->ref); | if (dec_and_test(&obj->ref)) | free(obj); | obj->foo = 5; // oops UaF | | |This means (for alpha) that there should be a memory barrier _before_ |the decrement, however the dec_and_lock asm thing only has one _after_, |which, per the above, is too late. | |The generic version using add_unless will result in memory barrier |before and after (because that is the rule for atomic ops with a return |value) which is strictly too many barriers for the refcount story, but |who knows what other ordering requirements code has. Remove the custom alpha implementation of dec_and_lock() and if it is an issue (performance wise) then the fast path could still be inlined. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: linux-alpha@vger.kernel.org Link: https://lkml.kernel.org/r/20180606115918.GG12198@hirez.programming.kicks-ass.net Link: https://lkml.kernel.org/r20180612161621.22645-2-bigeasy@linutronix.de --- arch/alpha/Kconfig | 5 ----- arch/alpha/lib/Makefile | 2 -- arch/alpha/lib/dec_and_lock.c | 44 ------------------------------------------- lib/Makefile | 6 +----- 4 files changed, 1 insertion(+), 56 deletions(-) delete mode 100644 arch/alpha/lib/dec_and_lock.c diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 0c4805a572c8..04a4a138ed13 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -555,11 +555,6 @@ config SMP If you don't know what to do here, say N. -config HAVE_DEC_LOCK - bool - depends on SMP - default y - config NR_CPUS int "Maximum number of CPUs (2-32)" range 2 32 diff --git a/arch/alpha/lib/Makefile b/arch/alpha/lib/Makefile index 04f9729de57c..854d5e79979e 100644 --- a/arch/alpha/lib/Makefile +++ b/arch/alpha/lib/Makefile @@ -35,8 +35,6 @@ lib-y = __divqu.o __remqu.o __divlu.o __remlu.o \ callback_srm.o srm_puts.o srm_printk.o \ fls.o -lib-$(CONFIG_SMP) += dec_and_lock.o - # The division routines are built from single source, with different defines. AFLAGS___divqu.o = -DDIV AFLAGS___remqu.o = -DREM diff --git a/arch/alpha/lib/dec_and_lock.c b/arch/alpha/lib/dec_and_lock.c deleted file mode 100644 index a117707f57fe..000000000000 --- a/arch/alpha/lib/dec_and_lock.c +++ /dev/null @@ -1,44 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * arch/alpha/lib/dec_and_lock.c - * - * ll/sc version of atomic_dec_and_lock() - * - */ - -#include -#include -#include - - asm (".text \n\ - .global _atomic_dec_and_lock \n\ - .ent _atomic_dec_and_lock \n\ - .align 4 \n\ -_atomic_dec_and_lock: \n\ - .prologue 0 \n\ -1: ldl_l $1, 0($16) \n\ - subl $1, 1, $1 \n\ - beq $1, 2f \n\ - stl_c $1, 0($16) \n\ - beq $1, 4f \n\ - mb \n\ - clr $0 \n\ - ret \n\ -2: br $29, 3f \n\ -3: ldgp $29, 0($29) \n\ - br $atomic_dec_and_lock_1..ng \n\ - .subsection 2 \n\ -4: br 1b \n\ - .previous \n\ - .end _atomic_dec_and_lock"); - -static int __used atomic_dec_and_lock_1(atomic_t *atomic, spinlock_t *lock) -{ - /* Slow path */ - spin_lock(lock); - if (atomic_dec_and_test(atomic)) - return 1; - spin_unlock(lock); - return 0; -} -EXPORT_SYMBOL(_atomic_dec_and_lock); diff --git a/lib/Makefile b/lib/Makefile index 84c6dcb31fbb..8b59f4a7c0e2 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -23,7 +23,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ sha1.o chacha20.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o seq_buf.o siphash.o \ + earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ nmi_backtrace.o nodemask.o win_minmax.o lib-$(CONFIG_PRINTK) += dump_stack.o @@ -98,10 +98,6 @@ obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o obj-$(CONFIG_DEBUG_LIST) += list_debug.o obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o -ifneq ($(CONFIG_HAVE_DEC_LOCK),y) - lib-y += dec_and_lock.o -endif - obj-$(CONFIG_BITREVERSE) += bitrev.o obj-$(CONFIG_RATIONAL) += rational.o obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o -- cgit v1.2.3 From ccfbb5bed407053b27492a9adc06064d949a9aa6 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Tue, 12 Jun 2018 18:16:20 +0200 Subject: atomic: Add irqsave variant of atomic_dec_and_lock() There are in-tree users of atomic_dec_and_lock() which must acquire the spin lock with interrupts disabled. To workaround the lack of an irqsave variant of atomic_dec_and_lock() they use local_irq_save() at the call site. This causes extra code and creates in some places unneeded long interrupt disabled times. These places need also extra treatment for PREEMPT_RT due to the disconnect of the irq disabling and the lock function. Implement the missing irqsave variant of the function. Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r20180612161621.22645-3-bigeasy@linutronix.de --- include/linux/spinlock.h | 5 +++++ lib/dec_and_lock.c | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 1e8a46435838..fd57888d4942 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -427,6 +427,11 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); #define atomic_dec_and_lock(atomic, lock) \ __cond_lock(lock, _atomic_dec_and_lock(atomic, lock)) +extern int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, + unsigned long *flags); +#define atomic_dec_and_lock_irqsave(atomic, lock, flags) \ + __cond_lock(lock, _atomic_dec_and_lock_irqsave(atomic, lock, &(flags))) + int alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask, size_t max_size, unsigned int cpu_mult, gfp_t gfp); diff --git a/lib/dec_and_lock.c b/lib/dec_and_lock.c index 347fa7ac2e8a..9555b68bb774 100644 --- a/lib/dec_and_lock.c +++ b/lib/dec_and_lock.c @@ -33,3 +33,19 @@ int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) } EXPORT_SYMBOL(_atomic_dec_and_lock); + +int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock, + unsigned long *flags) +{ + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ + if (atomic_add_unless(atomic, -1, 1)) + return 0; + + /* Otherwise do it the slow way */ + spin_lock_irqsave(lock, *flags); + if (atomic_dec_and_test(atomic)) + return 1; + spin_unlock_irqrestore(lock, *flags); + return 0; +} +EXPORT_SYMBOL(_atomic_dec_and_lock_irqsave); -- cgit v1.2.3 From 7ea959c45769612aa92557fb6464679f5fec7d9e Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Tue, 12 Jun 2018 18:16:21 +0200 Subject: locking/refcounts: Implement refcount_dec_and_lock_irqsave() There are in-tree users of refcount_dec_and_lock() which must acquire the spin lock with interrupts disabled. To workaround the lack of an irqsave variant of refcount_dec_and_lock() they use local_irq_save() at the call site. This causes extra code and creates in some places unneeded long interrupt disabled times. These places need also extra treatment for PREEMPT_RT due to the disconnect of the irq disabling and the lock function. Implement the missing irqsave variant of the function. Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r20180612161621.22645-4-bigeasy@linutronix.de [bigeasy: s@atomic_dec_and_lock@refcount_dec_and_lock@g] --- include/linux/refcount.h | 4 +++- lib/refcount.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 4193c41e383a..a685da2c4522 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -98,5 +98,7 @@ extern __must_check bool refcount_dec_if_one(refcount_t *r); extern __must_check bool refcount_dec_not_one(refcount_t *r); extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock); extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock); - +extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r, + spinlock_t *lock, + unsigned long *flags); #endif /* _LINUX_REFCOUNT_H */ diff --git a/lib/refcount.c b/lib/refcount.c index 0eb48353abe3..d3b81cefce91 100644 --- a/lib/refcount.c +++ b/lib/refcount.c @@ -350,3 +350,31 @@ bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) } EXPORT_SYMBOL(refcount_dec_and_lock); +/** + * refcount_dec_and_lock_irqsave - return holding spinlock with disabled + * interrupts if able to decrement refcount to 0 + * @r: the refcount + * @lock: the spinlock to be locked + * @flags: saved IRQ-flags if the is acquired + * + * Same as refcount_dec_and_lock() above except that the spinlock is acquired + * with disabled interupts. + * + * Return: true and hold spinlock if able to decrement refcount to 0, false + * otherwise + */ +bool refcount_dec_and_lock_irqsave(refcount_t *r, spinlock_t *lock, + unsigned long *flags) +{ + if (refcount_dec_not_one(r)) + return false; + + spin_lock_irqsave(lock, *flags); + if (!refcount_dec_and_test(r)) { + spin_unlock_irqrestore(lock, *flags); + return false; + } + + return true; +} +EXPORT_SYMBOL(refcount_dec_and_lock_irqsave); -- cgit v1.2.3 From a26ed66c20f080c510fcf5bd448bce204f2c19d7 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 10 Jun 2018 16:24:15 +0200 Subject: clocksource/drivers/stm32: Fix error return code Return an error code on failure. Problem found using Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: Thomas Gleixner Cc: Daniel Lezcano Cc: kernel-janitors@vger.kernel.org Cc: Maxime Coquelin Cc: Alexandre Torgue Cc: linux-arm-kernel@lists.infradead.org Link: https://lkml.kernel.org/r1528640655-18948-3-git-send-email-Julia.Lawall@lip6.fr --- drivers/clocksource/timer-stm32.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/timer-stm32.c b/drivers/clocksource/timer-stm32.c index e5cdc3af684c..2717f88c7904 100644 --- a/drivers/clocksource/timer-stm32.c +++ b/drivers/clocksource/timer-stm32.c @@ -304,8 +304,10 @@ static int __init stm32_timer_init(struct device_node *node) to->private_data = kzalloc(sizeof(struct stm32_timer_private), GFP_KERNEL); - if (!to->private_data) + if (!to->private_data) { + ret = -ENOMEM; goto deinit; + } rstc = of_reset_control_get(node, NULL); if (!IS_ERR(rstc)) { -- cgit v1.2.3 From 73df93c57c0b18195a2fe5429747e00018b3e863 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 11 Jun 2018 22:35:48 -0700 Subject: tools/bpftool: fix a bug in bpftool perf Commit b04df400c302 ("tools/bpftool: add perf subcommand") introduced bpftool subcommand perf to query bpf program kuprobe and tracepoint attachments. The perf subcommand will first test whether bpf subcommand BPF_TASK_FD_QUERY is supported in kernel or not. It does it by opening a file with argv[0] and feeds the file descriptor and current task pid to the kernel for querying. Such an approach won't work if the argv[0] cannot be opened successfully in the current directory. This is especially true when bpftool is accessible through PATH env variable. The error below reflects the open failure for file argv[0] at home directory. [yhs@localhost ~]$ which bpftool /usr/local/sbin/bpftool [yhs@localhost ~]$ bpftool perf Error: perf_query_support: No such file or directory To fix the issue, let us open root directory ("/") which exists in every linux system. With the fix, the error message will correctly reflect the permission issue. [yhs@localhost ~]$ which bpftool /usr/local/sbin/bpftool [yhs@localhost ~]$ bpftool perf Error: perf_query_support: Operation not permitted HINT: non root or kernel doesn't support TASK_FD_QUERY Fixes: b04df400c302 ("tools/bpftool: add perf subcommand") Reported-by: Alexei Starovoitov Signed-off-by: Yonghong Song Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/perf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/perf.c b/tools/bpf/bpftool/perf.c index ac6b1a12c9b7..b76b77dcfd1f 100644 --- a/tools/bpf/bpftool/perf.c +++ b/tools/bpf/bpftool/perf.c @@ -29,9 +29,10 @@ static bool has_perf_query_support(void) if (perf_query_supported) goto out; - fd = open(bin_name, O_RDONLY); + fd = open("/", O_RDONLY); if (fd < 0) { - p_err("perf_query_support: %s", strerror(errno)); + p_err("perf_query_support: cannot open directory \"/\" (%s)", + strerror(errno)); goto out; } -- cgit v1.2.3 From 3bce593ac06b4f18710274cfb084369b3d7909eb Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Tue, 12 Jun 2018 13:05:10 +0200 Subject: selftests: bpf: config: add config fragments Tests test_tunnel.sh fails due to config fragments ins't enabled. Fixes: 933a741e3b82 ("selftests/bpf: bpf tunnel test.") Signed-off-by: Anders Roxell Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/config | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 1eefe211a4a8..7eb613ffef55 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -7,3 +7,13 @@ CONFIG_CGROUP_BPF=y CONFIG_NETDEVSIM=m CONFIG_NET_CLS_ACT=y CONFIG_NET_SCH_INGRESS=y +CONFIG_NET_IPIP=y +CONFIG_IPV6=y +CONFIG_NET_IPGRE_DEMUX=y +CONFIG_NET_IPGRE=y +CONFIG_IPV6_GRE=y +CONFIG_CRYPTO_USER_API_HASH=m +CONFIG_CRYPTO_HMAC=m +CONFIG_CRYPTO_SHA256=m +CONFIG_VXLAN=y +CONFIG_GENEVE=y -- cgit v1.2.3 From 8efaac07d7e6694f39521f9fb8a5c848b712ecee Mon Sep 17 00:00:00 2001 From: Andrzej Hajda Date: Fri, 8 Jun 2018 08:04:57 +0200 Subject: drm/bridge/sii8620: simplify hardware reset procedure There is no need to flip reset pin twice. Also delays can be changed to values present in vendor's code. Signed-off-by: Andrzej Hajda Tested-by: Marek Szyprowski Reviewed-by: Maciej Purski Link: https://patchwork.freedesktop.org/patch/msgid/20180608060457.18357-1-a.hajda@samsung.com --- drivers/gpu/drm/bridge/sil-sii8620.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/bridge/sil-sii8620.c b/drivers/gpu/drm/bridge/sil-sii8620.c index 7ab36042a822..d1e780fba4b6 100644 --- a/drivers/gpu/drm/bridge/sil-sii8620.c +++ b/drivers/gpu/drm/bridge/sil-sii8620.c @@ -971,8 +971,17 @@ static int sii8620_hw_on(struct sii8620 *ctx) ret = regulator_bulk_enable(ARRAY_SIZE(ctx->supplies), ctx->supplies); if (ret) return ret; + usleep_range(10000, 20000); - return clk_prepare_enable(ctx->clk_xtal); + ret = clk_prepare_enable(ctx->clk_xtal); + if (ret) + return ret; + + msleep(100); + gpiod_set_value(ctx->gpio_reset, 0); + msleep(100); + + return 0; } static int sii8620_hw_off(struct sii8620 *ctx) @@ -982,17 +991,6 @@ static int sii8620_hw_off(struct sii8620 *ctx) return regulator_bulk_disable(ARRAY_SIZE(ctx->supplies), ctx->supplies); } -static void sii8620_hw_reset(struct sii8620 *ctx) -{ - usleep_range(10000, 20000); - gpiod_set_value(ctx->gpio_reset, 0); - usleep_range(5000, 20000); - gpiod_set_value(ctx->gpio_reset, 1); - usleep_range(10000, 20000); - gpiod_set_value(ctx->gpio_reset, 0); - msleep(300); -} - static void sii8620_cbus_reset(struct sii8620 *ctx) { sii8620_write(ctx, REG_PWD_SRST, BIT_PWD_SRST_CBUS_RST @@ -2112,7 +2110,6 @@ static void sii8620_cable_in(struct sii8620 *ctx) dev_err(dev, "Error powering on, %d.\n", ret); return; } - sii8620_hw_reset(ctx); sii8620_read_buf(ctx, REG_VND_IDL, ver, ARRAY_SIZE(ver)); ret = sii8620_clear_error(ctx); -- cgit v1.2.3 From 8e627a1b1ce8feb3e1da4428b71b9b4905f04888 Mon Sep 17 00:00:00 2001 From: Andrzej Hajda Date: Mon, 15 Jan 2018 18:33:57 +0100 Subject: drm/bridge/sii8620: fix loops in EDID fetch logic Function should constantly check if cable is connected and finish in finite time. Signed-off-by: Andrzej Hajda Tested-by: Marek Szyprowski Reviewed-by: Maciej Purski Link: https://patchwork.freedesktop.org/patch/msgid/20180115173357.31067-4-a.hajda@samsung.com --- drivers/gpu/drm/bridge/sil-sii8620.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/bridge/sil-sii8620.c b/drivers/gpu/drm/bridge/sil-sii8620.c index d1e780fba4b6..720bc7c325e0 100644 --- a/drivers/gpu/drm/bridge/sil-sii8620.c +++ b/drivers/gpu/drm/bridge/sil-sii8620.c @@ -807,6 +807,7 @@ static void sii8620_burst_rx_all(struct sii8620 *ctx) static void sii8620_fetch_edid(struct sii8620 *ctx) { u8 lm_ddc, ddc_cmd, int3, cbus; + unsigned long timeout; int fetched, i; int edid_len = EDID_LENGTH; u8 *edid; @@ -856,23 +857,31 @@ static void sii8620_fetch_edid(struct sii8620 *ctx) REG_DDC_CMD, ddc_cmd | VAL_DDC_CMD_ENH_DDC_READ_NO_ACK ); - do { - int3 = sii8620_readb(ctx, REG_INTR3); + int3 = 0; + timeout = jiffies + msecs_to_jiffies(200); + for (;;) { cbus = sii8620_readb(ctx, REG_CBUS_STATUS); - - if (int3 & BIT_DDC_CMD_DONE) - break; - - if (!(cbus & BIT_CBUS_STATUS_CBUS_CONNECTED)) { + if (~cbus & BIT_CBUS_STATUS_CBUS_CONNECTED) { + kfree(edid); + edid = NULL; + goto end; + } + if (int3 & BIT_DDC_CMD_DONE) { + if (sii8620_readb(ctx, REG_DDC_DOUT_CNT) + >= FETCH_SIZE) + break; + } else { + int3 = sii8620_readb(ctx, REG_INTR3); + } + if (time_is_before_jiffies(timeout)) { + ctx->error = -ETIMEDOUT; + dev_err(ctx->dev, "timeout during EDID read\n"); kfree(edid); edid = NULL; goto end; } - } while (1); - - sii8620_readb(ctx, REG_DDC_STATUS); - while (sii8620_readb(ctx, REG_DDC_DOUT_CNT) < FETCH_SIZE) usleep_range(10, 20); + } sii8620_read_buf(ctx, REG_DDC_DATA, edid + fetched, FETCH_SIZE); if (fetched + FETCH_SIZE == EDID_LENGTH) { -- cgit v1.2.3 From ecba7cfa3afbe489288f2c819158b7402afd7ee9 Mon Sep 17 00:00:00 2001 From: Maciej Purski Date: Fri, 2 Feb 2018 11:54:25 +0100 Subject: drm/bridge/sii8620: fix display modes validation Current implementation of mode_valid() and mode_fixup() callbacks handle packed pixel modes improperly. Fix it by using proper maximum clock values from the documentation. Signed-off-by: Maciej Purski Signed-off-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/1517568865-25219-1-git-send-email-m.purski@samsung.com --- drivers/gpu/drm/bridge/sil-sii8620.c | 80 ++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 39 deletions(-) diff --git a/drivers/gpu/drm/bridge/sil-sii8620.c b/drivers/gpu/drm/bridge/sil-sii8620.c index 720bc7c325e0..5267dc6551af 100644 --- a/drivers/gpu/drm/bridge/sil-sii8620.c +++ b/drivers/gpu/drm/bridge/sil-sii8620.c @@ -36,8 +36,11 @@ #define SII8620_BURST_BUF_LEN 288 #define VAL_RX_HDMI_CTRL2_DEFVAL VAL_RX_HDMI_CTRL2_IDLE_CNT(3) -#define MHL1_MAX_LCLK 225000 -#define MHL3_MAX_LCLK 600000 + +#define MHL1_MAX_PCLK 75000 +#define MHL1_MAX_PCLK_PP_MODE 150000 +#define MHL3_MAX_PCLK 200000 +#define MHL3_MAX_PCLK_PP_MODE 300000 enum sii8620_mode { CM_DISCONNECTED, @@ -2274,17 +2277,43 @@ static void sii8620_detach(struct drm_bridge *bridge) rc_unregister_device(ctx->rc_dev); } +static int sii8620_is_packing_required(struct sii8620 *ctx, + const struct drm_display_mode *mode) +{ + int max_pclk, max_pclk_pp_mode; + + if (sii8620_is_mhl3(ctx)) { + max_pclk = MHL3_MAX_PCLK; + max_pclk_pp_mode = MHL3_MAX_PCLK_PP_MODE; + } else { + max_pclk = MHL1_MAX_PCLK; + max_pclk_pp_mode = MHL1_MAX_PCLK_PP_MODE; + } + + if (mode->clock < max_pclk) + return 0; + else if (mode->clock < max_pclk_pp_mode) + return 1; + else + return -1; +} + static enum drm_mode_status sii8620_mode_valid(struct drm_bridge *bridge, const struct drm_display_mode *mode) { struct sii8620 *ctx = bridge_to_sii8620(bridge); + int pack_required = sii8620_is_packing_required(ctx, mode); bool can_pack = ctx->devcap[MHL_DCAP_VID_LINK_MODE] & MHL_DCAP_VID_LINK_PPIXEL; - unsigned int max_pclk = sii8620_is_mhl3(ctx) ? MHL3_MAX_LCLK : - MHL1_MAX_LCLK; - max_pclk /= can_pack ? 2 : 3; - return (mode->clock > max_pclk) ? MODE_CLOCK_HIGH : MODE_OK; + switch (pack_required) { + case 0: + return MODE_OK; + case 1: + return (can_pack) ? MODE_OK : MODE_CLOCK_HIGH; + default: + return MODE_CLOCK_HIGH; + } } static bool sii8620_mode_fixup(struct drm_bridge *bridge, @@ -2292,43 +2321,16 @@ static bool sii8620_mode_fixup(struct drm_bridge *bridge, struct drm_display_mode *adjusted_mode) { struct sii8620 *ctx = bridge_to_sii8620(bridge); - int max_lclk; - bool ret = true; mutex_lock(&ctx->lock); - max_lclk = sii8620_is_mhl3(ctx) ? MHL3_MAX_LCLK : MHL1_MAX_LCLK; - if (max_lclk > 3 * adjusted_mode->clock) { - ctx->use_packed_pixel = 0; - goto end; - } - if ((ctx->devcap[MHL_DCAP_VID_LINK_MODE] & MHL_DCAP_VID_LINK_PPIXEL) && - max_lclk > 2 * adjusted_mode->clock) { - ctx->use_packed_pixel = 1; - goto end; - } - ret = false; -end: - if (ret) { - u8 vic = drm_match_cea_mode(adjusted_mode); - - if (!vic) { - union hdmi_infoframe frm; - u8 mhl_vic[] = { 0, 95, 94, 93, 98 }; - - /* FIXME: We need the connector here */ - drm_hdmi_vendor_infoframe_from_display_mode( - &frm.vendor.hdmi, NULL, adjusted_mode); - vic = frm.vendor.hdmi.vic; - if (vic >= ARRAY_SIZE(mhl_vic)) - vic = 0; - vic = mhl_vic[vic]; - } - ctx->video_code = vic; - ctx->pixel_clock = adjusted_mode->clock; - } + ctx->use_packed_pixel = sii8620_is_packing_required(ctx, adjusted_mode); + ctx->video_code = drm_match_cea_mode(adjusted_mode); + ctx->pixel_clock = adjusted_mode->clock; + mutex_unlock(&ctx->lock); - return ret; + + return true; } static const struct drm_bridge_funcs sii8620_bridge_funcs = { -- cgit v1.2.3 From 9378cecb1ce5d618b8aff4d65113ddcf72fc1011 Mon Sep 17 00:00:00 2001 From: Maciej Purski Date: Wed, 22 Nov 2017 10:08:38 +0100 Subject: drm/bridge/sii8620: fix potential buffer overflow Buffer overflow error should not occur, as mode_fixup() callback filters pixel clock value and it should never exceed 600000. However, current implementation is not obviously safe and relies on implementation of mode_fixup(). Make 'i' variable never reach unsafe value in order to avoid buffer overflow error. Reported-by: Dan Carpenter Fixes: bf1722ca ("drm/bridge/sii8620: rewrite hdmi start sequence") Signed-off-by: Maciej Purski Signed-off-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/1511341718-6974-1-git-send-email-m.purski@samsung.com --- drivers/gpu/drm/bridge/sil-sii8620.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/bridge/sil-sii8620.c b/drivers/gpu/drm/bridge/sil-sii8620.c index 5267dc6551af..61fd3e0a4ba6 100644 --- a/drivers/gpu/drm/bridge/sil-sii8620.c +++ b/drivers/gpu/drm/bridge/sil-sii8620.c @@ -1226,7 +1226,7 @@ static void sii8620_start_video(struct sii8620 *ctx) int clk = ctx->pixel_clock * (ctx->use_packed_pixel ? 2 : 3); int i; - for (i = 0; i < ARRAY_SIZE(clk_spec); ++i) + for (i = 0; i < ARRAY_SIZE(clk_spec) - 1; ++i) if (clk < clk_spec[i].max_clk) break; -- cgit v1.2.3 From bbc05e172fad9affa388be35b78b9e5e5da76648 Mon Sep 17 00:00:00 2001 From: Maciej Purski Date: Wed, 29 Nov 2017 12:48:50 +0100 Subject: drm/bridge/sii8620: start MHL transmission after HDMI signal detection The vendor code waits for infoframe to detect video mode set by source. We do not need to follow this pattern, because video mode information is provided by drm core. As a result most of the infoframe handling code can be removed. Start transmission immediately after detecting stream on HDMI lines in irq_scdt() function without waiting for infoframe interrupt. Signed-off-by: Maciej Purski Signed-off-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/1511956130-24482-1-git-send-email-m.purski@samsung.com --- drivers/gpu/drm/bridge/sil-sii8620.c | 53 ++---------------------------------- 1 file changed, 2 insertions(+), 51 deletions(-) diff --git a/drivers/gpu/drm/bridge/sil-sii8620.c b/drivers/gpu/drm/bridge/sil-sii8620.c index 61fd3e0a4ba6..853c4f97c7c9 100644 --- a/drivers/gpu/drm/bridge/sil-sii8620.c +++ b/drivers/gpu/drm/bridge/sil-sii8620.c @@ -1941,14 +1941,6 @@ static void sii8620_irq_edid(struct sii8620 *ctx) ctx->mt_state = MT_STATE_DONE; } -static void sii8620_scdt_high(struct sii8620 *ctx) -{ - sii8620_write_seq_static(ctx, - REG_INTR8_MASK, BIT_CEA_NEW_AVI | BIT_CEA_NEW_VSI, - REG_TPI_SC, BIT_TPI_SC_TPI_OUTPUT_MODE_0_HDMI, - ); -} - static void sii8620_irq_scdt(struct sii8620 *ctx) { u8 stat = sii8620_readb(ctx, REG_INTR5); @@ -1956,53 +1948,13 @@ static void sii8620_irq_scdt(struct sii8620 *ctx) if (stat & BIT_INTR_SCDT_CHANGE) { u8 cstat = sii8620_readb(ctx, REG_TMDS_CSTAT_P3); - if (cstat & BIT_TMDS_CSTAT_P3_SCDT) { - if (ctx->sink_type == SINK_HDMI) - /* enable infoframe interrupt */ - sii8620_scdt_high(ctx); - else - sii8620_start_video(ctx); - } + if (cstat & BIT_TMDS_CSTAT_P3_SCDT) + sii8620_start_video(ctx); } sii8620_write(ctx, REG_INTR5, stat); } -static void sii8620_new_vsi(struct sii8620 *ctx) -{ - u8 vsif[11]; - - sii8620_write(ctx, REG_RX_HDMI_CTRL2, - VAL_RX_HDMI_CTRL2_DEFVAL | - BIT_RX_HDMI_CTRL2_VSI_MON_SEL_VSI); - sii8620_read_buf(ctx, REG_RX_HDMI_MON_PKT_HEADER1, vsif, - ARRAY_SIZE(vsif)); -} - -static void sii8620_new_avi(struct sii8620 *ctx) -{ - sii8620_write(ctx, REG_RX_HDMI_CTRL2, VAL_RX_HDMI_CTRL2_DEFVAL); - sii8620_read_buf(ctx, REG_RX_HDMI_MON_PKT_HEADER1, ctx->avif, - ARRAY_SIZE(ctx->avif)); -} - -static void sii8620_irq_infr(struct sii8620 *ctx) -{ - u8 stat = sii8620_readb(ctx, REG_INTR8) - & (BIT_CEA_NEW_VSI | BIT_CEA_NEW_AVI); - - sii8620_write(ctx, REG_INTR8, stat); - - if (stat & BIT_CEA_NEW_VSI) - sii8620_new_vsi(ctx); - - if (stat & BIT_CEA_NEW_AVI) - sii8620_new_avi(ctx); - - if (stat & (BIT_CEA_NEW_VSI | BIT_CEA_NEW_AVI)) - sii8620_start_video(ctx); -} - static void sii8620_got_xdevcap(struct sii8620 *ctx, int ret) { if (ret < 0) @@ -2084,7 +2036,6 @@ static irqreturn_t sii8620_irq_thread(int irq, void *data) { BIT_FAST_INTR_STAT_EDID, sii8620_irq_edid }, { BIT_FAST_INTR_STAT_DDC, sii8620_irq_ddc }, { BIT_FAST_INTR_STAT_SCDT, sii8620_irq_scdt }, - { BIT_FAST_INTR_STAT_INFR, sii8620_irq_infr }, }; struct sii8620 *ctx = data; u8 stats[LEN_FAST_INTR_STAT]; -- cgit v1.2.3 From 95e8522588c81fdef1b10777a874c4a1fe14bf88 Mon Sep 17 00:00:00 2001 From: Maciej Purski Date: Wed, 29 Nov 2017 16:12:47 +0100 Subject: drm/bridge/sii8620: remove HSIC initialization HSIC initialization was taken from the vendor code. HSIC in MHL circuit is not connected, so it is not possible to test it. Tests prove that without HSIC the device works well. Therefore it can be removed. Signed-off-by: Maciej Purski Signed-off-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/1511968368-30884-1-git-send-email-m.purski@samsung.com --- drivers/gpu/drm/bridge/sil-sii8620.c | 38 ------------------------------------ 1 file changed, 38 deletions(-) diff --git a/drivers/gpu/drm/bridge/sil-sii8620.c b/drivers/gpu/drm/bridge/sil-sii8620.c index 853c4f97c7c9..ff041bec0cf4 100644 --- a/drivers/gpu/drm/bridge/sil-sii8620.c +++ b/drivers/gpu/drm/bridge/sil-sii8620.c @@ -511,50 +511,12 @@ static void sii8620_sink_detected(struct sii8620 *ctx, int ret) sink_str[ctx->sink_type], sink_name); } -static void sii8620_hsic_init(struct sii8620 *ctx) -{ - if (!sii8620_is_mhl3(ctx)) - return; - - sii8620_write(ctx, REG_FCGC, - BIT_FCGC_HSIC_HOSTMODE | BIT_FCGC_HSIC_ENABLE); - sii8620_setbits(ctx, REG_HRXCTRL3, - BIT_HRXCTRL3_HRX_STAY_RESET | BIT_HRXCTRL3_STATUS_EN, ~0); - sii8620_setbits(ctx, REG_TTXNUMB, MSK_TTXNUMB_TTX_NUMBPS, 4); - sii8620_setbits(ctx, REG_TRXCTRL, BIT_TRXCTRL_TRX_FROM_SE_COC, ~0); - sii8620_setbits(ctx, REG_HTXCTRL, BIT_HTXCTRL_HTX_DRVCONN1, 0); - sii8620_setbits(ctx, REG_KEEPER, MSK_KEEPER_MODE, VAL_KEEPER_MODE_HOST); - sii8620_write_seq_static(ctx, - REG_TDMLLCTL, 0, - REG_UTSRST, BIT_UTSRST_HRX_SRST | BIT_UTSRST_HTX_SRST | - BIT_UTSRST_KEEPER_SRST | BIT_UTSRST_FC_SRST, - REG_UTSRST, BIT_UTSRST_HRX_SRST | BIT_UTSRST_HTX_SRST, - REG_HRXINTL, 0xff, - REG_HRXINTH, 0xff, - REG_TTXINTL, 0xff, - REG_TTXINTH, 0xff, - REG_TRXINTL, 0xff, - REG_TRXINTH, 0xff, - REG_HTXINTL, 0xff, - REG_HTXINTH, 0xff, - REG_FCINTR0, 0xff, - REG_FCINTR1, 0xff, - REG_FCINTR2, 0xff, - REG_FCINTR3, 0xff, - REG_FCINTR4, 0xff, - REG_FCINTR5, 0xff, - REG_FCINTR6, 0xff, - REG_FCINTR7, 0xff - ); -} - static void sii8620_edid_read(struct sii8620 *ctx, int ret) { if (ret < 0) return; sii8620_set_upstream_edid(ctx); - sii8620_hsic_init(ctx); sii8620_enable_hpd(ctx); } -- cgit v1.2.3 From c7d6d511eb56495f065600c90336da0f0fe1b174 Mon Sep 17 00:00:00 2001 From: Maciej Purski Date: Tue, 23 Jan 2018 12:13:16 +0100 Subject: drm/bridge/sii8620: fix HDMI cable connection to dongle MHL bridge is usually connected to TV via MHL dongle. Currently plugging HDMI cable to dongle is handled improperly. Fix it by splitting connecting of a dongle and a HDMI cable. The driver should now handle unplugging a sink from a dongle and plugging a different sink with new edid. Tested on MHL1, MHL2 and MHL3 using various vendors' dongles both in DVI and HDMI mode. Signed-off-by: Maciej Purski Signed-off-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/1516705996-8928-1-git-send-email-m.purski@samsung.com --- drivers/gpu/drm/bridge/sil-sii8620.c | 65 +++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/bridge/sil-sii8620.c b/drivers/gpu/drm/bridge/sil-sii8620.c index ff041bec0cf4..4a3deeda065c 100644 --- a/drivers/gpu/drm/bridge/sil-sii8620.c +++ b/drivers/gpu/drm/bridge/sil-sii8620.c @@ -83,6 +83,9 @@ struct sii8620 { u8 devcap[MHL_DCAP_SIZE]; u8 xdevcap[MHL_XDC_SIZE]; u8 avif[HDMI_INFOFRAME_SIZE(AVI)]; + bool feature_complete; + bool devcap_read; + bool sink_detected; struct edid *edid; unsigned int gen2_write_burst:1; enum sii8620_mt_state mt_state; @@ -479,7 +482,7 @@ static void sii8620_update_array(u8 *dst, u8 *src, int count) } } -static void sii8620_sink_detected(struct sii8620 *ctx, int ret) +static void sii8620_identify_sink(struct sii8620 *ctx) { static const char * const sink_str[] = { [SINK_NONE] = "NONE", @@ -490,7 +493,7 @@ static void sii8620_sink_detected(struct sii8620 *ctx, int ret) char sink_name[20]; struct device *dev = ctx->dev; - if (ret < 0) + if (!ctx->sink_detected || !ctx->devcap_read) return; sii8620_fetch_edid(ctx); @@ -499,6 +502,7 @@ static void sii8620_sink_detected(struct sii8620 *ctx, int ret) sii8620_mhl_disconnected(ctx); return; } + sii8620_set_upstream_edid(ctx); if (drm_detect_hdmi_monitor(ctx->edid)) ctx->sink_type = SINK_HDMI; @@ -511,15 +515,6 @@ static void sii8620_sink_detected(struct sii8620 *ctx, int ret) sink_str[ctx->sink_type], sink_name); } -static void sii8620_edid_read(struct sii8620 *ctx, int ret) -{ - if (ret < 0) - return; - - sii8620_set_upstream_edid(ctx); - sii8620_enable_hpd(ctx); -} - static void sii8620_mr_devcap(struct sii8620 *ctx) { u8 dcap[MHL_DCAP_SIZE]; @@ -535,6 +530,8 @@ static void sii8620_mr_devcap(struct sii8620 *ctx) dcap[MHL_DCAP_ADOPTER_ID_H], dcap[MHL_DCAP_ADOPTER_ID_L], dcap[MHL_DCAP_DEVICE_ID_H], dcap[MHL_DCAP_DEVICE_ID_L]); sii8620_update_array(ctx->devcap, dcap, MHL_DCAP_SIZE); + ctx->devcap_read = true; + sii8620_identify_sink(ctx); } static void sii8620_mr_xdevcap(struct sii8620 *ctx) @@ -1506,6 +1503,16 @@ static void sii8620_set_mode(struct sii8620 *ctx, enum sii8620_mode mode) ); } +static void sii8620_hpd_unplugged(struct sii8620 *ctx) +{ + sii8620_disable_hpd(ctx); + ctx->sink_type = SINK_NONE; + ctx->sink_detected = false; + ctx->feature_complete = false; + kfree(ctx->edid); + ctx->edid = NULL; +} + static void sii8620_disconnect(struct sii8620 *ctx) { sii8620_disable_gen2_write_burst(ctx); @@ -1533,7 +1540,7 @@ static void sii8620_disconnect(struct sii8620 *ctx) REG_MHL_DP_CTL6, 0x2A, REG_MHL_DP_CTL7, 0x03 ); - sii8620_disable_hpd(ctx); + sii8620_hpd_unplugged(ctx); sii8620_write_seq_static(ctx, REG_M3_CTRL, VAL_M3_CTRL_MHL3_VALUE, REG_MHL_COC_CTL1, 0x07, @@ -1581,10 +1588,8 @@ static void sii8620_disconnect(struct sii8620 *ctx) memset(ctx->xstat, 0, sizeof(ctx->xstat)); memset(ctx->devcap, 0, sizeof(ctx->devcap)); memset(ctx->xdevcap, 0, sizeof(ctx->xdevcap)); + ctx->devcap_read = false; ctx->cbus_status = 0; - ctx->sink_type = SINK_NONE; - kfree(ctx->edid); - ctx->edid = NULL; sii8620_mt_cleanup(ctx); } @@ -1675,9 +1680,6 @@ static void sii8620_status_changed_path(struct sii8620 *ctx) sii8620_mt_write_stat(ctx, MHL_DST_REG(LINK_MODE), MHL_DST_LM_CLK_MODE_NORMAL | MHL_DST_LM_PATH_ENABLED); - if (!sii8620_is_mhl3(ctx)) - sii8620_mt_read_devcap(ctx, false); - sii8620_mt_set_cont(ctx, sii8620_sink_detected); } else { sii8620_mt_write_stat(ctx, MHL_DST_REG(LINK_MODE), MHL_DST_LM_CLK_MODE_NORMAL); @@ -1694,9 +1696,14 @@ static void sii8620_msc_mr_write_stat(struct sii8620 *ctx) sii8620_update_array(ctx->stat, st, MHL_DST_SIZE); sii8620_update_array(ctx->xstat, xst, MHL_XDS_SIZE); - if (ctx->stat[MHL_DST_CONNECTED_RDY] & MHL_DST_CONN_DCAP_RDY) + if (ctx->stat[MHL_DST_CONNECTED_RDY] & st[MHL_DST_CONNECTED_RDY] & + MHL_DST_CONN_DCAP_RDY) { sii8620_status_dcap_ready(ctx); + if (!sii8620_is_mhl3(ctx)) + sii8620_mt_read_devcap(ctx, false); + } + if (st[MHL_DST_LINK_MODE] & MHL_DST_LM_PATH_ENABLED) sii8620_status_changed_path(ctx); } @@ -1780,8 +1787,11 @@ static void sii8620_msc_mr_set_int(struct sii8620 *ctx) } if (ints[MHL_INT_RCHANGE] & MHL_INT_RC_FEAT_REQ) sii8620_send_features(ctx); - if (ints[MHL_INT_RCHANGE] & MHL_INT_RC_FEAT_COMPLETE) - sii8620_edid_read(ctx, 0); + if (ints[MHL_INT_RCHANGE] & MHL_INT_RC_FEAT_COMPLETE) { + ctx->feature_complete = true; + if (ctx->edid) + sii8620_enable_hpd(ctx); + } } static struct sii8620_mt_msg *sii8620_msc_msg_first(struct sii8620 *ctx) @@ -1856,6 +1866,15 @@ static void sii8620_irq_msc(struct sii8620 *ctx) if (stat & BIT_CBUS_MSC_MR_WRITE_STAT) sii8620_msc_mr_write_stat(ctx); + if (stat & BIT_CBUS_HPD_CHG) { + if (ctx->cbus_status & BIT_CBUS_STATUS_CBUS_HPD) { + ctx->sink_detected = true; + sii8620_identify_sink(ctx); + } else { + sii8620_hpd_unplugged(ctx); + } + } + if (stat & BIT_CBUS_MSC_MR_SET_INT) sii8620_msc_mr_set_int(ctx); @@ -1967,11 +1986,11 @@ static void sii8620_irq_ddc(struct sii8620 *ctx) if (stat & BIT_DDC_CMD_DONE) { sii8620_write(ctx, REG_INTR3_MASK, 0); - if (sii8620_is_mhl3(ctx)) + if (sii8620_is_mhl3(ctx) && !ctx->feature_complete) sii8620_mt_set_int(ctx, MHL_INT_REG(RCHANGE), MHL_INT_RC_FEAT_REQ); else - sii8620_edid_read(ctx, 0); + sii8620_enable_hpd(ctx); } sii8620_write(ctx, REG_INTR3, stat); } -- cgit v1.2.3 From e37460c1ca08cf9d3b82eb3b6f205888d8d01182 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Jun 2018 08:49:34 +0200 Subject: dma-mapping: use obj-y instead of lib-y for generic dma ops We already have exact config symbols to select the direct, non-coherent, or virt dma ops. So use the normal obj- scheme to select them. Signed-off-by: Christoph Hellwig --- lib/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/Makefile b/lib/Makefile index 956b320292fe..5e0e160c9242 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -29,9 +29,9 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ lib-$(CONFIG_PRINTK) += dump_stack.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o -lib-$(CONFIG_DMA_DIRECT_OPS) += dma-direct.o -lib-$(CONFIG_DMA_NONCOHERENT_OPS) += dma-noncoherent.o -lib-$(CONFIG_DMA_VIRT_OPS) += dma-virt.o +obj-$(CONFIG_DMA_DIRECT_OPS) += dma-direct.o +obj-$(CONFIG_DMA_NONCOHERENT_OPS) += dma-noncoherent.o +obj-$(CONFIG_DMA_VIRT_OPS) += dma-virt.o lib-y += kobject.o klist.o obj-y += lockref.o -- cgit v1.2.3 From cf65a0f6f6ff7631ba0ac0513a14ca5b65320d80 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Jun 2018 19:01:45 +0200 Subject: dma-mapping: move all DMA mapping code to kernel/dma Currently the code is split over various files with dma- prefixes in the lib/ and drives/base directories, and the number of files keeps growing. Move them into a single directory to keep the code together and remove the file name prefixes. To match the irq infrastructure this directory is placed under the kernel/ directory. Signed-off-by: Christoph Hellwig --- Documentation/driver-api/infrastructure.rst | 4 +- MAINTAINERS | 9 +- drivers/base/Makefile | 3 - drivers/base/dma-coherent.c | 434 ------- drivers/base/dma-contiguous.c | 278 ----- drivers/base/dma-mapping.c | 345 ------ include/linux/dma-contiguous.h | 2 +- init/Kconfig | 4 - kernel/Makefile | 1 + kernel/dma/Kconfig | 50 + kernel/dma/Makefile | 11 + kernel/dma/coherent.c | 434 +++++++ kernel/dma/contiguous.c | 278 +++++ kernel/dma/debug.c | 1773 +++++++++++++++++++++++++++ kernel/dma/direct.c | 204 +++ kernel/dma/mapping.c | 345 ++++++ kernel/dma/noncoherent.c | 102 ++ kernel/dma/swiotlb.c | 1087 ++++++++++++++++ kernel/dma/virt.c | 59 + lib/Kconfig | 47 +- lib/Makefile | 6 - lib/dma-debug.c | 1773 --------------------------- lib/dma-direct.c | 204 --- lib/dma-noncoherent.c | 102 -- lib/dma-virt.c | 61 - lib/swiotlb.c | 1087 ---------------- 26 files changed, 4350 insertions(+), 4353 deletions(-) delete mode 100644 drivers/base/dma-coherent.c delete mode 100644 drivers/base/dma-contiguous.c delete mode 100644 drivers/base/dma-mapping.c create mode 100644 kernel/dma/Kconfig create mode 100644 kernel/dma/Makefile create mode 100644 kernel/dma/coherent.c create mode 100644 kernel/dma/contiguous.c create mode 100644 kernel/dma/debug.c create mode 100644 kernel/dma/direct.c create mode 100644 kernel/dma/mapping.c create mode 100644 kernel/dma/noncoherent.c create mode 100644 kernel/dma/swiotlb.c create mode 100644 kernel/dma/virt.c delete mode 100644 lib/dma-debug.c delete mode 100644 lib/dma-direct.c delete mode 100644 lib/dma-noncoherent.c delete mode 100644 lib/dma-virt.c delete mode 100644 lib/swiotlb.c diff --git a/Documentation/driver-api/infrastructure.rst b/Documentation/driver-api/infrastructure.rst index bee1b9a1702f..6172f3cc3d0b 100644 --- a/Documentation/driver-api/infrastructure.rst +++ b/Documentation/driver-api/infrastructure.rst @@ -49,10 +49,10 @@ Device Drivers Base Device Drivers DMA Management ----------------------------- -.. kernel-doc:: drivers/base/dma-coherent.c +.. kernel-doc:: kernel/dma/coherent.c :export: -.. kernel-doc:: drivers/base/dma-mapping.c +.. kernel-doc:: kernel/dma/mapping.c :export: Device drivers PnP support diff --git a/MAINTAINERS b/MAINTAINERS index c13b9fb3be0b..a6844a9e2f64 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4359,12 +4359,7 @@ L: iommu@lists.linux-foundation.org T: git git://git.infradead.org/users/hch/dma-mapping.git W: http://git.infradead.org/users/hch/dma-mapping.git S: Supported -F: lib/dma-debug.c -F: lib/dma-direct.c -F: lib/dma-noncoherent.c -F: lib/dma-virt.c -F: drivers/base/dma-mapping.c -F: drivers/base/dma-coherent.c +F: kernel/dma/ F: include/asm-generic/dma-mapping.h F: include/linux/dma-direct.h F: include/linux/dma-mapping.h @@ -13642,7 +13637,7 @@ M: Konrad Rzeszutek Wilk L: iommu@lists.linux-foundation.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/konrad/swiotlb.git S: Supported -F: lib/swiotlb.c +F: kernel/dma/swiotlb.c F: arch/*/kernel/pci-swiotlb.c F: include/linux/swiotlb.h diff --git a/drivers/base/Makefile b/drivers/base/Makefile index b074f242a435..704f44295810 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile @@ -8,10 +8,7 @@ obj-y := component.o core.o bus.o dd.o syscore.o \ topology.o container.o property.o cacheinfo.o \ devcon.o obj-$(CONFIG_DEVTMPFS) += devtmpfs.o -obj-$(CONFIG_DMA_CMA) += dma-contiguous.o obj-y += power/ -obj-$(CONFIG_HAS_DMA) += dma-mapping.o -obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_ISA_BUS_API) += isa.o obj-y += firmware_loader/ obj-$(CONFIG_NUMA) += node.o diff --git a/drivers/base/dma-coherent.c b/drivers/base/dma-coherent.c deleted file mode 100644 index 597d40893862..000000000000 --- a/drivers/base/dma-coherent.c +++ /dev/null @@ -1,434 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Coherent per-device memory handling. - * Borrowed from i386 - */ -#include -#include -#include -#include -#include - -struct dma_coherent_mem { - void *virt_base; - dma_addr_t device_base; - unsigned long pfn_base; - int size; - int flags; - unsigned long *bitmap; - spinlock_t spinlock; - bool use_dev_dma_pfn_offset; -}; - -static struct dma_coherent_mem *dma_coherent_default_memory __ro_after_init; - -static inline struct dma_coherent_mem *dev_get_coherent_memory(struct device *dev) -{ - if (dev && dev->dma_mem) - return dev->dma_mem; - return NULL; -} - -static inline dma_addr_t dma_get_device_base(struct device *dev, - struct dma_coherent_mem * mem) -{ - if (mem->use_dev_dma_pfn_offset) - return (mem->pfn_base - dev->dma_pfn_offset) << PAGE_SHIFT; - else - return mem->device_base; -} - -static int dma_init_coherent_memory( - phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags, - struct dma_coherent_mem **mem) -{ - struct dma_coherent_mem *dma_mem = NULL; - void __iomem *mem_base = NULL; - int pages = size >> PAGE_SHIFT; - int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); - int ret; - - if (!size) { - ret = -EINVAL; - goto out; - } - - mem_base = memremap(phys_addr, size, MEMREMAP_WC); - if (!mem_base) { - ret = -EINVAL; - goto out; - } - dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); - if (!dma_mem) { - ret = -ENOMEM; - goto out; - } - dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!dma_mem->bitmap) { - ret = -ENOMEM; - goto out; - } - - dma_mem->virt_base = mem_base; - dma_mem->device_base = device_addr; - dma_mem->pfn_base = PFN_DOWN(phys_addr); - dma_mem->size = pages; - dma_mem->flags = flags; - spin_lock_init(&dma_mem->spinlock); - - *mem = dma_mem; - return 0; - -out: - kfree(dma_mem); - if (mem_base) - memunmap(mem_base); - return ret; -} - -static void dma_release_coherent_memory(struct dma_coherent_mem *mem) -{ - if (!mem) - return; - - memunmap(mem->virt_base); - kfree(mem->bitmap); - kfree(mem); -} - -static int dma_assign_coherent_memory(struct device *dev, - struct dma_coherent_mem *mem) -{ - if (!dev) - return -ENODEV; - - if (dev->dma_mem) - return -EBUSY; - - dev->dma_mem = mem; - return 0; -} - -int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, - dma_addr_t device_addr, size_t size, int flags) -{ - struct dma_coherent_mem *mem; - int ret; - - ret = dma_init_coherent_memory(phys_addr, device_addr, size, flags, &mem); - if (ret) - return ret; - - ret = dma_assign_coherent_memory(dev, mem); - if (ret) - dma_release_coherent_memory(mem); - return ret; -} -EXPORT_SYMBOL(dma_declare_coherent_memory); - -void dma_release_declared_memory(struct device *dev) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - - if (!mem) - return; - dma_release_coherent_memory(mem); - dev->dma_mem = NULL; -} -EXPORT_SYMBOL(dma_release_declared_memory); - -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - unsigned long flags; - int pos, err; - - size += device_addr & ~PAGE_MASK; - - if (!mem) - return ERR_PTR(-EINVAL); - - spin_lock_irqsave(&mem->spinlock, flags); - pos = PFN_DOWN(device_addr - dma_get_device_base(dev, mem)); - err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); - spin_unlock_irqrestore(&mem->spinlock, flags); - - if (err != 0) - return ERR_PTR(err); - return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); - -static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, - ssize_t size, dma_addr_t *dma_handle) -{ - int order = get_order(size); - unsigned long flags; - int pageno; - void *ret; - - spin_lock_irqsave(&mem->spinlock, flags); - - if (unlikely(size > (mem->size << PAGE_SHIFT))) - goto err; - - pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); - if (unlikely(pageno < 0)) - goto err; - - /* - * Memory was found in the coherent area. - */ - *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); - ret = mem->virt_base + (pageno << PAGE_SHIFT); - spin_unlock_irqrestore(&mem->spinlock, flags); - memset(ret, 0, size); - return ret; -err: - spin_unlock_irqrestore(&mem->spinlock, flags); - return NULL; -} - -/** - * dma_alloc_from_dev_coherent() - allocate memory from device coherent pool - * @dev: device from which we allocate memory - * @size: size of requested memory area - * @dma_handle: This will be filled with the correct dma handle - * @ret: This pointer will be filled with the virtual address - * to allocated area. - * - * This function should be only called from per-arch dma_alloc_coherent() - * to support allocation from per-device coherent memory pools. - * - * Returns 0 if dma_alloc_coherent should continue with allocating from - * generic memory areas, or !0 if dma_alloc_coherent should return @ret. - */ -int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, - dma_addr_t *dma_handle, void **ret) -{ - struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); - - if (!mem) - return 0; - - *ret = __dma_alloc_from_coherent(mem, size, dma_handle); - if (*ret) - return 1; - - /* - * In the case where the allocation can not be satisfied from the - * per-device area, try to fall back to generic memory if the - * constraints allow it. - */ - return mem->flags & DMA_MEMORY_EXCLUSIVE; -} -EXPORT_SYMBOL(dma_alloc_from_dev_coherent); - -void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle) -{ - if (!dma_coherent_default_memory) - return NULL; - - return __dma_alloc_from_coherent(dma_coherent_default_memory, size, - dma_handle); -} - -static int __dma_release_from_coherent(struct dma_coherent_mem *mem, - int order, void *vaddr) -{ - if (mem && vaddr >= mem->virt_base && vaddr < - (mem->virt_base + (mem->size << PAGE_SHIFT))) { - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; - unsigned long flags; - - spin_lock_irqsave(&mem->spinlock, flags); - bitmap_release_region(mem->bitmap, page, order); - spin_unlock_irqrestore(&mem->spinlock, flags); - return 1; - } - return 0; -} - -/** - * dma_release_from_dev_coherent() - free memory to device coherent memory pool - * @dev: device from which the memory was allocated - * @order: the order of pages allocated - * @vaddr: virtual address of allocated pages - * - * This checks whether the memory was allocated from the per-device - * coherent memory pool and if so, releases that memory. - * - * Returns 1 if we correctly released the memory, or 0 if the caller should - * proceed with releasing memory from generic pools. - */ -int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr) -{ - struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); - - return __dma_release_from_coherent(mem, order, vaddr); -} -EXPORT_SYMBOL(dma_release_from_dev_coherent); - -int dma_release_from_global_coherent(int order, void *vaddr) -{ - if (!dma_coherent_default_memory) - return 0; - - return __dma_release_from_coherent(dma_coherent_default_memory, order, - vaddr); -} - -static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem, - struct vm_area_struct *vma, void *vaddr, size_t size, int *ret) -{ - if (mem && vaddr >= mem->virt_base && vaddr + size <= - (mem->virt_base + (mem->size << PAGE_SHIFT))) { - unsigned long off = vma->vm_pgoff; - int start = (vaddr - mem->virt_base) >> PAGE_SHIFT; - int user_count = vma_pages(vma); - int count = PAGE_ALIGN(size) >> PAGE_SHIFT; - - *ret = -ENXIO; - if (off < count && user_count <= count - off) { - unsigned long pfn = mem->pfn_base + start + off; - *ret = remap_pfn_range(vma, vma->vm_start, pfn, - user_count << PAGE_SHIFT, - vma->vm_page_prot); - } - return 1; - } - return 0; -} - -/** - * dma_mmap_from_dev_coherent() - mmap memory from the device coherent pool - * @dev: device from which the memory was allocated - * @vma: vm_area for the userspace memory - * @vaddr: cpu address returned by dma_alloc_from_dev_coherent - * @size: size of the memory buffer allocated - * @ret: result from remap_pfn_range() - * - * This checks whether the memory was allocated from the per-device - * coherent memory pool and if so, maps that memory to the provided vma. - * - * Returns 1 if @vaddr belongs to the device coherent pool and the caller - * should return @ret, or 0 if they should proceed with mapping memory from - * generic areas. - */ -int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma, - void *vaddr, size_t size, int *ret) -{ - struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); - - return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret); -} -EXPORT_SYMBOL(dma_mmap_from_dev_coherent); - -int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr, - size_t size, int *ret) -{ - if (!dma_coherent_default_memory) - return 0; - - return __dma_mmap_from_coherent(dma_coherent_default_memory, vma, - vaddr, size, ret); -} - -/* - * Support for reserved memory regions defined in device tree - */ -#ifdef CONFIG_OF_RESERVED_MEM -#include -#include -#include - -static struct reserved_mem *dma_reserved_default_memory __initdata; - -static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) -{ - struct dma_coherent_mem *mem = rmem->priv; - int ret; - - if (!mem) { - ret = dma_init_coherent_memory(rmem->base, rmem->base, - rmem->size, - DMA_MEMORY_EXCLUSIVE, &mem); - if (ret) { - pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n", - &rmem->base, (unsigned long)rmem->size / SZ_1M); - return ret; - } - } - mem->use_dev_dma_pfn_offset = true; - rmem->priv = mem; - dma_assign_coherent_memory(dev, mem); - return 0; -} - -static void rmem_dma_device_release(struct reserved_mem *rmem, - struct device *dev) -{ - if (dev) - dev->dma_mem = NULL; -} - -static const struct reserved_mem_ops rmem_dma_ops = { - .device_init = rmem_dma_device_init, - .device_release = rmem_dma_device_release, -}; - -static int __init rmem_dma_setup(struct reserved_mem *rmem) -{ - unsigned long node = rmem->fdt_node; - - if (of_get_flat_dt_prop(node, "reusable", NULL)) - return -EINVAL; - -#ifdef CONFIG_ARM - if (!of_get_flat_dt_prop(node, "no-map", NULL)) { - pr_err("Reserved memory: regions without no-map are not yet supported\n"); - return -EINVAL; - } - - if (of_get_flat_dt_prop(node, "linux,dma-default", NULL)) { - WARN(dma_reserved_default_memory, - "Reserved memory: region for default DMA coherent area is redefined\n"); - dma_reserved_default_memory = rmem; - } -#endif - - rmem->ops = &rmem_dma_ops; - pr_info("Reserved memory: created DMA memory pool at %pa, size %ld MiB\n", - &rmem->base, (unsigned long)rmem->size / SZ_1M); - return 0; -} - -static int __init dma_init_reserved_memory(void) -{ - const struct reserved_mem_ops *ops; - int ret; - - if (!dma_reserved_default_memory) - return -ENOMEM; - - ops = dma_reserved_default_memory->ops; - - /* - * We rely on rmem_dma_device_init() does not propagate error of - * dma_assign_coherent_memory() for "NULL" device. - */ - ret = ops->device_init(dma_reserved_default_memory, NULL); - - if (!ret) { - dma_coherent_default_memory = dma_reserved_default_memory->priv; - pr_info("DMA: default coherent area is set\n"); - } - - return ret; -} - -core_initcall(dma_init_reserved_memory); - -RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", rmem_dma_setup); -#endif diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c deleted file mode 100644 index d987dcd1bd56..000000000000 --- a/drivers/base/dma-contiguous.c +++ /dev/null @@ -1,278 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Contiguous Memory Allocator for DMA mapping framework - * Copyright (c) 2010-2011 by Samsung Electronics. - * Written by: - * Marek Szyprowski - * Michal Nazarewicz - */ - -#define pr_fmt(fmt) "cma: " fmt - -#ifdef CONFIG_CMA_DEBUG -#ifndef DEBUG -# define DEBUG -#endif -#endif - -#include -#include - -#include -#include -#include -#include -#include - -#ifdef CONFIG_CMA_SIZE_MBYTES -#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES -#else -#define CMA_SIZE_MBYTES 0 -#endif - -struct cma *dma_contiguous_default_area; - -/* - * Default global CMA area size can be defined in kernel's .config. - * This is useful mainly for distro maintainers to create a kernel - * that works correctly for most supported systems. - * The size can be set in bytes or as a percentage of the total memory - * in the system. - * - * Users, who want to set the size of global CMA area for their system - * should use cma= kernel parameter. - */ -static const phys_addr_t size_bytes = (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M; -static phys_addr_t size_cmdline = -1; -static phys_addr_t base_cmdline; -static phys_addr_t limit_cmdline; - -static int __init early_cma(char *p) -{ - pr_debug("%s(%s)\n", __func__, p); - size_cmdline = memparse(p, &p); - if (*p != '@') - return 0; - base_cmdline = memparse(p + 1, &p); - if (*p != '-') { - limit_cmdline = base_cmdline + size_cmdline; - return 0; - } - limit_cmdline = memparse(p + 1, &p); - - return 0; -} -early_param("cma", early_cma); - -#ifdef CONFIG_CMA_SIZE_PERCENTAGE - -static phys_addr_t __init __maybe_unused cma_early_percent_memory(void) -{ - struct memblock_region *reg; - unsigned long total_pages = 0; - - /* - * We cannot use memblock_phys_mem_size() here, because - * memblock_analyze() has not been called yet. - */ - for_each_memblock(memory, reg) - total_pages += memblock_region_memory_end_pfn(reg) - - memblock_region_memory_base_pfn(reg); - - return (total_pages * CONFIG_CMA_SIZE_PERCENTAGE / 100) << PAGE_SHIFT; -} - -#else - -static inline __maybe_unused phys_addr_t cma_early_percent_memory(void) -{ - return 0; -} - -#endif - -/** - * dma_contiguous_reserve() - reserve area(s) for contiguous memory handling - * @limit: End address of the reserved memory (optional, 0 for any). - * - * This function reserves memory from early allocator. It should be - * called by arch specific code once the early allocator (memblock or bootmem) - * has been activated and all other subsystems have already allocated/reserved - * memory. - */ -void __init dma_contiguous_reserve(phys_addr_t limit) -{ - phys_addr_t selected_size = 0; - phys_addr_t selected_base = 0; - phys_addr_t selected_limit = limit; - bool fixed = false; - - pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit); - - if (size_cmdline != -1) { - selected_size = size_cmdline; - selected_base = base_cmdline; - selected_limit = min_not_zero(limit_cmdline, limit); - if (base_cmdline + size_cmdline == limit_cmdline) - fixed = true; - } else { -#ifdef CONFIG_CMA_SIZE_SEL_MBYTES - selected_size = size_bytes; -#elif defined(CONFIG_CMA_SIZE_SEL_PERCENTAGE) - selected_size = cma_early_percent_memory(); -#elif defined(CONFIG_CMA_SIZE_SEL_MIN) - selected_size = min(size_bytes, cma_early_percent_memory()); -#elif defined(CONFIG_CMA_SIZE_SEL_MAX) - selected_size = max(size_bytes, cma_early_percent_memory()); -#endif - } - - if (selected_size && !dma_contiguous_default_area) { - pr_debug("%s: reserving %ld MiB for global area\n", __func__, - (unsigned long)selected_size / SZ_1M); - - dma_contiguous_reserve_area(selected_size, selected_base, - selected_limit, - &dma_contiguous_default_area, - fixed); - } -} - -/** - * dma_contiguous_reserve_area() - reserve custom contiguous area - * @size: Size of the reserved area (in bytes), - * @base: Base address of the reserved area optional, use 0 for any - * @limit: End address of the reserved memory (optional, 0 for any). - * @res_cma: Pointer to store the created cma region. - * @fixed: hint about where to place the reserved area - * - * This function reserves memory from early allocator. It should be - * called by arch specific code once the early allocator (memblock or bootmem) - * has been activated and all other subsystems have already allocated/reserved - * memory. This function allows to create custom reserved areas for specific - * devices. - * - * If @fixed is true, reserve contiguous area at exactly @base. If false, - * reserve in range from @base to @limit. - */ -int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, - phys_addr_t limit, struct cma **res_cma, - bool fixed) -{ - int ret; - - ret = cma_declare_contiguous(base, size, limit, 0, 0, fixed, - "reserved", res_cma); - if (ret) - return ret; - - /* Architecture specific contiguous memory fixup. */ - dma_contiguous_early_fixup(cma_get_base(*res_cma), - cma_get_size(*res_cma)); - - return 0; -} - -/** - * dma_alloc_from_contiguous() - allocate pages from contiguous area - * @dev: Pointer to device for which the allocation is performed. - * @count: Requested number of pages. - * @align: Requested alignment of pages (in PAGE_SIZE order). - * @gfp_mask: GFP flags to use for this allocation. - * - * This function allocates memory buffer for specified device. It uses - * device specific contiguous memory area if available or the default - * global one. Requires architecture specific dev_get_cma_area() helper - * function. - */ -struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, - unsigned int align, gfp_t gfp_mask) -{ - if (align > CONFIG_CMA_ALIGNMENT) - align = CONFIG_CMA_ALIGNMENT; - - return cma_alloc(dev_get_cma_area(dev), count, align, gfp_mask); -} - -/** - * dma_release_from_contiguous() - release allocated pages - * @dev: Pointer to device for which the pages were allocated. - * @pages: Allocated pages. - * @count: Number of allocated pages. - * - * This function releases memory allocated by dma_alloc_from_contiguous(). - * It returns false when provided pages do not belong to contiguous area and - * true otherwise. - */ -bool dma_release_from_contiguous(struct device *dev, struct page *pages, - int count) -{ - return cma_release(dev_get_cma_area(dev), pages, count); -} - -/* - * Support for reserved memory regions defined in device tree - */ -#ifdef CONFIG_OF_RESERVED_MEM -#include -#include -#include - -#undef pr_fmt -#define pr_fmt(fmt) fmt - -static int rmem_cma_device_init(struct reserved_mem *rmem, struct device *dev) -{ - dev_set_cma_area(dev, rmem->priv); - return 0; -} - -static void rmem_cma_device_release(struct reserved_mem *rmem, - struct device *dev) -{ - dev_set_cma_area(dev, NULL); -} - -static const struct reserved_mem_ops rmem_cma_ops = { - .device_init = rmem_cma_device_init, - .device_release = rmem_cma_device_release, -}; - -static int __init rmem_cma_setup(struct reserved_mem *rmem) -{ - phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); - phys_addr_t mask = align - 1; - unsigned long node = rmem->fdt_node; - struct cma *cma; - int err; - - if (!of_get_flat_dt_prop(node, "reusable", NULL) || - of_get_flat_dt_prop(node, "no-map", NULL)) - return -EINVAL; - - if ((rmem->base & mask) || (rmem->size & mask)) { - pr_err("Reserved memory: incorrect alignment of CMA region\n"); - return -EINVAL; - } - - err = cma_init_reserved_mem(rmem->base, rmem->size, 0, rmem->name, &cma); - if (err) { - pr_err("Reserved memory: unable to setup CMA region\n"); - return err; - } - /* Architecture specific contiguous memory fixup. */ - dma_contiguous_early_fixup(rmem->base, rmem->size); - - if (of_get_flat_dt_prop(node, "linux,cma-default", NULL)) - dma_contiguous_set_default(cma); - - rmem->ops = &rmem_cma_ops; - rmem->priv = cma; - - pr_info("Reserved memory: created CMA memory pool at %pa, size %ld MiB\n", - &rmem->base, (unsigned long)rmem->size / SZ_1M); - - return 0; -} -RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", rmem_cma_setup); -#endif diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c deleted file mode 100644 index f831a582209c..000000000000 --- a/drivers/base/dma-mapping.c +++ /dev/null @@ -1,345 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * drivers/base/dma-mapping.c - arch-independent dma-mapping routines - * - * Copyright (c) 2006 SUSE Linux Products GmbH - * Copyright (c) 2006 Tejun Heo - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * Managed DMA API - */ -struct dma_devres { - size_t size; - void *vaddr; - dma_addr_t dma_handle; - unsigned long attrs; -}; - -static void dmam_release(struct device *dev, void *res) -{ - struct dma_devres *this = res; - - dma_free_attrs(dev, this->size, this->vaddr, this->dma_handle, - this->attrs); -} - -static int dmam_match(struct device *dev, void *res, void *match_data) -{ - struct dma_devres *this = res, *match = match_data; - - if (this->vaddr == match->vaddr) { - WARN_ON(this->size != match->size || - this->dma_handle != match->dma_handle); - return 1; - } - return 0; -} - -/** - * dmam_alloc_coherent - Managed dma_alloc_coherent() - * @dev: Device to allocate coherent memory for - * @size: Size of allocation - * @dma_handle: Out argument for allocated DMA handle - * @gfp: Allocation flags - * - * Managed dma_alloc_coherent(). Memory allocated using this function - * will be automatically released on driver detach. - * - * RETURNS: - * Pointer to allocated memory on success, NULL on failure. - */ -void *dmam_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) -{ - struct dma_devres *dr; - void *vaddr; - - dr = devres_alloc(dmam_release, sizeof(*dr), gfp); - if (!dr) - return NULL; - - vaddr = dma_alloc_coherent(dev, size, dma_handle, gfp); - if (!vaddr) { - devres_free(dr); - return NULL; - } - - dr->vaddr = vaddr; - dr->dma_handle = *dma_handle; - dr->size = size; - - devres_add(dev, dr); - - return vaddr; -} -EXPORT_SYMBOL(dmam_alloc_coherent); - -/** - * dmam_free_coherent - Managed dma_free_coherent() - * @dev: Device to free coherent memory for - * @size: Size of allocation - * @vaddr: Virtual address of the memory to free - * @dma_handle: DMA handle of the memory to free - * - * Managed dma_free_coherent(). - */ -void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle) -{ - struct dma_devres match_data = { size, vaddr, dma_handle }; - - dma_free_coherent(dev, size, vaddr, dma_handle); - WARN_ON(devres_destroy(dev, dmam_release, dmam_match, &match_data)); -} -EXPORT_SYMBOL(dmam_free_coherent); - -/** - * dmam_alloc_attrs - Managed dma_alloc_attrs() - * @dev: Device to allocate non_coherent memory for - * @size: Size of allocation - * @dma_handle: Out argument for allocated DMA handle - * @gfp: Allocation flags - * @attrs: Flags in the DMA_ATTR_* namespace. - * - * Managed dma_alloc_attrs(). Memory allocated using this function will be - * automatically released on driver detach. - * - * RETURNS: - * Pointer to allocated memory on success, NULL on failure. - */ -void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, unsigned long attrs) -{ - struct dma_devres *dr; - void *vaddr; - - dr = devres_alloc(dmam_release, sizeof(*dr), gfp); - if (!dr) - return NULL; - - vaddr = dma_alloc_attrs(dev, size, dma_handle, gfp, attrs); - if (!vaddr) { - devres_free(dr); - return NULL; - } - - dr->vaddr = vaddr; - dr->dma_handle = *dma_handle; - dr->size = size; - dr->attrs = attrs; - - devres_add(dev, dr); - - return vaddr; -} -EXPORT_SYMBOL(dmam_alloc_attrs); - -#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT - -static void dmam_coherent_decl_release(struct device *dev, void *res) -{ - dma_release_declared_memory(dev); -} - -/** - * dmam_declare_coherent_memory - Managed dma_declare_coherent_memory() - * @dev: Device to declare coherent memory for - * @phys_addr: Physical address of coherent memory to be declared - * @device_addr: Device address of coherent memory to be declared - * @size: Size of coherent memory to be declared - * @flags: Flags - * - * Managed dma_declare_coherent_memory(). - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, - dma_addr_t device_addr, size_t size, int flags) -{ - void *res; - int rc; - - res = devres_alloc(dmam_coherent_decl_release, 0, GFP_KERNEL); - if (!res) - return -ENOMEM; - - rc = dma_declare_coherent_memory(dev, phys_addr, device_addr, size, - flags); - if (!rc) - devres_add(dev, res); - else - devres_free(res); - - return rc; -} -EXPORT_SYMBOL(dmam_declare_coherent_memory); - -/** - * dmam_release_declared_memory - Managed dma_release_declared_memory(). - * @dev: Device to release declared coherent memory for - * - * Managed dmam_release_declared_memory(). - */ -void dmam_release_declared_memory(struct device *dev) -{ - WARN_ON(devres_destroy(dev, dmam_coherent_decl_release, NULL, NULL)); -} -EXPORT_SYMBOL(dmam_release_declared_memory); - -#endif - -/* - * Create scatter-list for the already allocated DMA buffer. - */ -int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t handle, size_t size) -{ - struct page *page = virt_to_page(cpu_addr); - int ret; - - ret = sg_alloc_table(sgt, 1, GFP_KERNEL); - if (unlikely(ret)) - return ret; - - sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); - return 0; -} -EXPORT_SYMBOL(dma_common_get_sgtable); - -/* - * Create userspace mapping for the DMA-coherent memory. - */ -int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size) -{ - int ret = -ENXIO; -#ifndef CONFIG_ARCH_NO_COHERENT_DMA_MMAP - unsigned long user_count = vma_pages(vma); - unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; - unsigned long off = vma->vm_pgoff; - - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) - return ret; - - if (off < count && user_count <= (count - off)) - ret = remap_pfn_range(vma, vma->vm_start, - page_to_pfn(virt_to_page(cpu_addr)) + off, - user_count << PAGE_SHIFT, - vma->vm_page_prot); -#endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */ - - return ret; -} -EXPORT_SYMBOL(dma_common_mmap); - -#ifdef CONFIG_MMU -static struct vm_struct *__dma_common_pages_remap(struct page **pages, - size_t size, unsigned long vm_flags, pgprot_t prot, - const void *caller) -{ - struct vm_struct *area; - - area = get_vm_area_caller(size, vm_flags, caller); - if (!area) - return NULL; - - if (map_vm_area(area, prot, pages)) { - vunmap(area->addr); - return NULL; - } - - return area; -} - -/* - * remaps an array of PAGE_SIZE pages into another vm_area - * Cannot be used in non-sleeping contexts - */ -void *dma_common_pages_remap(struct page **pages, size_t size, - unsigned long vm_flags, pgprot_t prot, - const void *caller) -{ - struct vm_struct *area; - - area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller); - if (!area) - return NULL; - - area->pages = pages; - - return area->addr; -} - -/* - * remaps an allocated contiguous region into another vm_area. - * Cannot be used in non-sleeping contexts - */ - -void *dma_common_contiguous_remap(struct page *page, size_t size, - unsigned long vm_flags, - pgprot_t prot, const void *caller) -{ - int i; - struct page **pages; - struct vm_struct *area; - - pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL); - if (!pages) - return NULL; - - for (i = 0; i < (size >> PAGE_SHIFT); i++) - pages[i] = nth_page(page, i); - - area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller); - - kfree(pages); - - if (!area) - return NULL; - return area->addr; -} - -/* - * unmaps a range previously mapped by dma_common_*_remap - */ -void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags) -{ - struct vm_struct *area = find_vm_area(cpu_addr); - - if (!area || (area->flags & vm_flags) != vm_flags) { - WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr); - return; - } - - unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size)); - vunmap(cpu_addr); -} -#endif - -/* - * enables DMA API use for a device - */ -int dma_configure(struct device *dev) -{ - if (dev->bus->dma_configure) - return dev->bus->dma_configure(dev); - return 0; -} - -void dma_deconfigure(struct device *dev) -{ - of_dma_deconfigure(dev); - acpi_dma_deconfigure(dev); -} diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h index b67bf6ac907d..3c5a4cb3eb95 100644 --- a/include/linux/dma-contiguous.h +++ b/include/linux/dma-contiguous.h @@ -48,7 +48,7 @@ * CMA should not be used by the device drivers directly. It is * only a helper framework for dma-mapping subsystem. * - * For more information, see kernel-docs in drivers/base/dma-contiguous.c + * For more information, see kernel-docs in kernel/dma/contiguous.c */ #ifdef __KERNEL__ diff --git a/init/Kconfig b/init/Kconfig index 5a52f07259a2..fde3d09e8b27 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1719,10 +1719,6 @@ source "arch/Kconfig" endmenu # General setup -config HAVE_GENERIC_DMA_COHERENT - bool - default n - config RT_MUTEXES bool diff --git a/kernel/Makefile b/kernel/Makefile index d2001624fe7a..04bc07c2b42a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -41,6 +41,7 @@ obj-y += printk/ obj-y += irq/ obj-y += rcu/ obj-y += livepatch/ +obj-y += dma/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig new file mode 100644 index 000000000000..9bd54304446f --- /dev/null +++ b/kernel/dma/Kconfig @@ -0,0 +1,50 @@ + +config HAS_DMA + bool + depends on !NO_DMA + default y + +config NEED_SG_DMA_LENGTH + bool + +config NEED_DMA_MAP_STATE + bool + +config ARCH_DMA_ADDR_T_64BIT + def_bool 64BIT || PHYS_ADDR_T_64BIT + +config HAVE_GENERIC_DMA_COHERENT + bool + +config ARCH_HAS_SYNC_DMA_FOR_DEVICE + bool + +config ARCH_HAS_SYNC_DMA_FOR_CPU + bool + select NEED_DMA_MAP_STATE + +config DMA_DIRECT_OPS + bool + depends on HAS_DMA + +config DMA_NONCOHERENT_OPS + bool + depends on HAS_DMA + select DMA_DIRECT_OPS + +config DMA_NONCOHERENT_MMAP + bool + depends on DMA_NONCOHERENT_OPS + +config DMA_NONCOHERENT_CACHE_SYNC + bool + depends on DMA_NONCOHERENT_OPS + +config DMA_VIRT_OPS + bool + depends on HAS_DMA + +config SWIOTLB + bool + select DMA_DIRECT_OPS + select NEED_DMA_MAP_STATE diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile new file mode 100644 index 000000000000..6de44e4eb454 --- /dev/null +++ b/kernel/dma/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_HAS_DMA) += mapping.o +obj-$(CONFIG_DMA_CMA) += contiguous.o +obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += coherent.o +obj-$(CONFIG_DMA_DIRECT_OPS) += direct.o +obj-$(CONFIG_DMA_NONCOHERENT_OPS) += noncoherent.o +obj-$(CONFIG_DMA_VIRT_OPS) += virt.o +obj-$(CONFIG_DMA_API_DEBUG) += debug.o +obj-$(CONFIG_SWIOTLB) += swiotlb.o + diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c new file mode 100644 index 000000000000..597d40893862 --- /dev/null +++ b/kernel/dma/coherent.c @@ -0,0 +1,434 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Coherent per-device memory handling. + * Borrowed from i386 + */ +#include +#include +#include +#include +#include + +struct dma_coherent_mem { + void *virt_base; + dma_addr_t device_base; + unsigned long pfn_base; + int size; + int flags; + unsigned long *bitmap; + spinlock_t spinlock; + bool use_dev_dma_pfn_offset; +}; + +static struct dma_coherent_mem *dma_coherent_default_memory __ro_after_init; + +static inline struct dma_coherent_mem *dev_get_coherent_memory(struct device *dev) +{ + if (dev && dev->dma_mem) + return dev->dma_mem; + return NULL; +} + +static inline dma_addr_t dma_get_device_base(struct device *dev, + struct dma_coherent_mem * mem) +{ + if (mem->use_dev_dma_pfn_offset) + return (mem->pfn_base - dev->dma_pfn_offset) << PAGE_SHIFT; + else + return mem->device_base; +} + +static int dma_init_coherent_memory( + phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags, + struct dma_coherent_mem **mem) +{ + struct dma_coherent_mem *dma_mem = NULL; + void __iomem *mem_base = NULL; + int pages = size >> PAGE_SHIFT; + int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); + int ret; + + if (!size) { + ret = -EINVAL; + goto out; + } + + mem_base = memremap(phys_addr, size, MEMREMAP_WC); + if (!mem_base) { + ret = -EINVAL; + goto out; + } + dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); + if (!dma_mem) { + ret = -ENOMEM; + goto out; + } + dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!dma_mem->bitmap) { + ret = -ENOMEM; + goto out; + } + + dma_mem->virt_base = mem_base; + dma_mem->device_base = device_addr; + dma_mem->pfn_base = PFN_DOWN(phys_addr); + dma_mem->size = pages; + dma_mem->flags = flags; + spin_lock_init(&dma_mem->spinlock); + + *mem = dma_mem; + return 0; + +out: + kfree(dma_mem); + if (mem_base) + memunmap(mem_base); + return ret; +} + +static void dma_release_coherent_memory(struct dma_coherent_mem *mem) +{ + if (!mem) + return; + + memunmap(mem->virt_base); + kfree(mem->bitmap); + kfree(mem); +} + +static int dma_assign_coherent_memory(struct device *dev, + struct dma_coherent_mem *mem) +{ + if (!dev) + return -ENODEV; + + if (dev->dma_mem) + return -EBUSY; + + dev->dma_mem = mem; + return 0; +} + +int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, + dma_addr_t device_addr, size_t size, int flags) +{ + struct dma_coherent_mem *mem; + int ret; + + ret = dma_init_coherent_memory(phys_addr, device_addr, size, flags, &mem); + if (ret) + return ret; + + ret = dma_assign_coherent_memory(dev, mem); + if (ret) + dma_release_coherent_memory(mem); + return ret; +} +EXPORT_SYMBOL(dma_declare_coherent_memory); + +void dma_release_declared_memory(struct device *dev) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + + if (!mem) + return; + dma_release_coherent_memory(mem); + dev->dma_mem = NULL; +} +EXPORT_SYMBOL(dma_release_declared_memory); + +void *dma_mark_declared_memory_occupied(struct device *dev, + dma_addr_t device_addr, size_t size) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + unsigned long flags; + int pos, err; + + size += device_addr & ~PAGE_MASK; + + if (!mem) + return ERR_PTR(-EINVAL); + + spin_lock_irqsave(&mem->spinlock, flags); + pos = PFN_DOWN(device_addr - dma_get_device_base(dev, mem)); + err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); + spin_unlock_irqrestore(&mem->spinlock, flags); + + if (err != 0) + return ERR_PTR(err); + return mem->virt_base + (pos << PAGE_SHIFT); +} +EXPORT_SYMBOL(dma_mark_declared_memory_occupied); + +static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, + ssize_t size, dma_addr_t *dma_handle) +{ + int order = get_order(size); + unsigned long flags; + int pageno; + void *ret; + + spin_lock_irqsave(&mem->spinlock, flags); + + if (unlikely(size > (mem->size << PAGE_SHIFT))) + goto err; + + pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); + if (unlikely(pageno < 0)) + goto err; + + /* + * Memory was found in the coherent area. + */ + *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); + ret = mem->virt_base + (pageno << PAGE_SHIFT); + spin_unlock_irqrestore(&mem->spinlock, flags); + memset(ret, 0, size); + return ret; +err: + spin_unlock_irqrestore(&mem->spinlock, flags); + return NULL; +} + +/** + * dma_alloc_from_dev_coherent() - allocate memory from device coherent pool + * @dev: device from which we allocate memory + * @size: size of requested memory area + * @dma_handle: This will be filled with the correct dma handle + * @ret: This pointer will be filled with the virtual address + * to allocated area. + * + * This function should be only called from per-arch dma_alloc_coherent() + * to support allocation from per-device coherent memory pools. + * + * Returns 0 if dma_alloc_coherent should continue with allocating from + * generic memory areas, or !0 if dma_alloc_coherent should return @ret. + */ +int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, + dma_addr_t *dma_handle, void **ret) +{ + struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); + + if (!mem) + return 0; + + *ret = __dma_alloc_from_coherent(mem, size, dma_handle); + if (*ret) + return 1; + + /* + * In the case where the allocation can not be satisfied from the + * per-device area, try to fall back to generic memory if the + * constraints allow it. + */ + return mem->flags & DMA_MEMORY_EXCLUSIVE; +} +EXPORT_SYMBOL(dma_alloc_from_dev_coherent); + +void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle) +{ + if (!dma_coherent_default_memory) + return NULL; + + return __dma_alloc_from_coherent(dma_coherent_default_memory, size, + dma_handle); +} + +static int __dma_release_from_coherent(struct dma_coherent_mem *mem, + int order, void *vaddr) +{ + if (mem && vaddr >= mem->virt_base && vaddr < + (mem->virt_base + (mem->size << PAGE_SHIFT))) { + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; + unsigned long flags; + + spin_lock_irqsave(&mem->spinlock, flags); + bitmap_release_region(mem->bitmap, page, order); + spin_unlock_irqrestore(&mem->spinlock, flags); + return 1; + } + return 0; +} + +/** + * dma_release_from_dev_coherent() - free memory to device coherent memory pool + * @dev: device from which the memory was allocated + * @order: the order of pages allocated + * @vaddr: virtual address of allocated pages + * + * This checks whether the memory was allocated from the per-device + * coherent memory pool and if so, releases that memory. + * + * Returns 1 if we correctly released the memory, or 0 if the caller should + * proceed with releasing memory from generic pools. + */ +int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr) +{ + struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); + + return __dma_release_from_coherent(mem, order, vaddr); +} +EXPORT_SYMBOL(dma_release_from_dev_coherent); + +int dma_release_from_global_coherent(int order, void *vaddr) +{ + if (!dma_coherent_default_memory) + return 0; + + return __dma_release_from_coherent(dma_coherent_default_memory, order, + vaddr); +} + +static int __dma_mmap_from_coherent(struct dma_coherent_mem *mem, + struct vm_area_struct *vma, void *vaddr, size_t size, int *ret) +{ + if (mem && vaddr >= mem->virt_base && vaddr + size <= + (mem->virt_base + (mem->size << PAGE_SHIFT))) { + unsigned long off = vma->vm_pgoff; + int start = (vaddr - mem->virt_base) >> PAGE_SHIFT; + int user_count = vma_pages(vma); + int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + + *ret = -ENXIO; + if (off < count && user_count <= count - off) { + unsigned long pfn = mem->pfn_base + start + off; + *ret = remap_pfn_range(vma, vma->vm_start, pfn, + user_count << PAGE_SHIFT, + vma->vm_page_prot); + } + return 1; + } + return 0; +} + +/** + * dma_mmap_from_dev_coherent() - mmap memory from the device coherent pool + * @dev: device from which the memory was allocated + * @vma: vm_area for the userspace memory + * @vaddr: cpu address returned by dma_alloc_from_dev_coherent + * @size: size of the memory buffer allocated + * @ret: result from remap_pfn_range() + * + * This checks whether the memory was allocated from the per-device + * coherent memory pool and if so, maps that memory to the provided vma. + * + * Returns 1 if @vaddr belongs to the device coherent pool and the caller + * should return @ret, or 0 if they should proceed with mapping memory from + * generic areas. + */ +int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma, + void *vaddr, size_t size, int *ret) +{ + struct dma_coherent_mem *mem = dev_get_coherent_memory(dev); + + return __dma_mmap_from_coherent(mem, vma, vaddr, size, ret); +} +EXPORT_SYMBOL(dma_mmap_from_dev_coherent); + +int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *vaddr, + size_t size, int *ret) +{ + if (!dma_coherent_default_memory) + return 0; + + return __dma_mmap_from_coherent(dma_coherent_default_memory, vma, + vaddr, size, ret); +} + +/* + * Support for reserved memory regions defined in device tree + */ +#ifdef CONFIG_OF_RESERVED_MEM +#include +#include +#include + +static struct reserved_mem *dma_reserved_default_memory __initdata; + +static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) +{ + struct dma_coherent_mem *mem = rmem->priv; + int ret; + + if (!mem) { + ret = dma_init_coherent_memory(rmem->base, rmem->base, + rmem->size, + DMA_MEMORY_EXCLUSIVE, &mem); + if (ret) { + pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n", + &rmem->base, (unsigned long)rmem->size / SZ_1M); + return ret; + } + } + mem->use_dev_dma_pfn_offset = true; + rmem->priv = mem; + dma_assign_coherent_memory(dev, mem); + return 0; +} + +static void rmem_dma_device_release(struct reserved_mem *rmem, + struct device *dev) +{ + if (dev) + dev->dma_mem = NULL; +} + +static const struct reserved_mem_ops rmem_dma_ops = { + .device_init = rmem_dma_device_init, + .device_release = rmem_dma_device_release, +}; + +static int __init rmem_dma_setup(struct reserved_mem *rmem) +{ + unsigned long node = rmem->fdt_node; + + if (of_get_flat_dt_prop(node, "reusable", NULL)) + return -EINVAL; + +#ifdef CONFIG_ARM + if (!of_get_flat_dt_prop(node, "no-map", NULL)) { + pr_err("Reserved memory: regions without no-map are not yet supported\n"); + return -EINVAL; + } + + if (of_get_flat_dt_prop(node, "linux,dma-default", NULL)) { + WARN(dma_reserved_default_memory, + "Reserved memory: region for default DMA coherent area is redefined\n"); + dma_reserved_default_memory = rmem; + } +#endif + + rmem->ops = &rmem_dma_ops; + pr_info("Reserved memory: created DMA memory pool at %pa, size %ld MiB\n", + &rmem->base, (unsigned long)rmem->size / SZ_1M); + return 0; +} + +static int __init dma_init_reserved_memory(void) +{ + const struct reserved_mem_ops *ops; + int ret; + + if (!dma_reserved_default_memory) + return -ENOMEM; + + ops = dma_reserved_default_memory->ops; + + /* + * We rely on rmem_dma_device_init() does not propagate error of + * dma_assign_coherent_memory() for "NULL" device. + */ + ret = ops->device_init(dma_reserved_default_memory, NULL); + + if (!ret) { + dma_coherent_default_memory = dma_reserved_default_memory->priv; + pr_info("DMA: default coherent area is set\n"); + } + + return ret; +} + +core_initcall(dma_init_reserved_memory); + +RESERVEDMEM_OF_DECLARE(dma, "shared-dma-pool", rmem_dma_setup); +#endif diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c new file mode 100644 index 000000000000..d987dcd1bd56 --- /dev/null +++ b/kernel/dma/contiguous.c @@ -0,0 +1,278 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Contiguous Memory Allocator for DMA mapping framework + * Copyright (c) 2010-2011 by Samsung Electronics. + * Written by: + * Marek Szyprowski + * Michal Nazarewicz + */ + +#define pr_fmt(fmt) "cma: " fmt + +#ifdef CONFIG_CMA_DEBUG +#ifndef DEBUG +# define DEBUG +#endif +#endif + +#include +#include + +#include +#include +#include +#include +#include + +#ifdef CONFIG_CMA_SIZE_MBYTES +#define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES +#else +#define CMA_SIZE_MBYTES 0 +#endif + +struct cma *dma_contiguous_default_area; + +/* + * Default global CMA area size can be defined in kernel's .config. + * This is useful mainly for distro maintainers to create a kernel + * that works correctly for most supported systems. + * The size can be set in bytes or as a percentage of the total memory + * in the system. + * + * Users, who want to set the size of global CMA area for their system + * should use cma= kernel parameter. + */ +static const phys_addr_t size_bytes = (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M; +static phys_addr_t size_cmdline = -1; +static phys_addr_t base_cmdline; +static phys_addr_t limit_cmdline; + +static int __init early_cma(char *p) +{ + pr_debug("%s(%s)\n", __func__, p); + size_cmdline = memparse(p, &p); + if (*p != '@') + return 0; + base_cmdline = memparse(p + 1, &p); + if (*p != '-') { + limit_cmdline = base_cmdline + size_cmdline; + return 0; + } + limit_cmdline = memparse(p + 1, &p); + + return 0; +} +early_param("cma", early_cma); + +#ifdef CONFIG_CMA_SIZE_PERCENTAGE + +static phys_addr_t __init __maybe_unused cma_early_percent_memory(void) +{ + struct memblock_region *reg; + unsigned long total_pages = 0; + + /* + * We cannot use memblock_phys_mem_size() here, because + * memblock_analyze() has not been called yet. + */ + for_each_memblock(memory, reg) + total_pages += memblock_region_memory_end_pfn(reg) - + memblock_region_memory_base_pfn(reg); + + return (total_pages * CONFIG_CMA_SIZE_PERCENTAGE / 100) << PAGE_SHIFT; +} + +#else + +static inline __maybe_unused phys_addr_t cma_early_percent_memory(void) +{ + return 0; +} + +#endif + +/** + * dma_contiguous_reserve() - reserve area(s) for contiguous memory handling + * @limit: End address of the reserved memory (optional, 0 for any). + * + * This function reserves memory from early allocator. It should be + * called by arch specific code once the early allocator (memblock or bootmem) + * has been activated and all other subsystems have already allocated/reserved + * memory. + */ +void __init dma_contiguous_reserve(phys_addr_t limit) +{ + phys_addr_t selected_size = 0; + phys_addr_t selected_base = 0; + phys_addr_t selected_limit = limit; + bool fixed = false; + + pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit); + + if (size_cmdline != -1) { + selected_size = size_cmdline; + selected_base = base_cmdline; + selected_limit = min_not_zero(limit_cmdline, limit); + if (base_cmdline + size_cmdline == limit_cmdline) + fixed = true; + } else { +#ifdef CONFIG_CMA_SIZE_SEL_MBYTES + selected_size = size_bytes; +#elif defined(CONFIG_CMA_SIZE_SEL_PERCENTAGE) + selected_size = cma_early_percent_memory(); +#elif defined(CONFIG_CMA_SIZE_SEL_MIN) + selected_size = min(size_bytes, cma_early_percent_memory()); +#elif defined(CONFIG_CMA_SIZE_SEL_MAX) + selected_size = max(size_bytes, cma_early_percent_memory()); +#endif + } + + if (selected_size && !dma_contiguous_default_area) { + pr_debug("%s: reserving %ld MiB for global area\n", __func__, + (unsigned long)selected_size / SZ_1M); + + dma_contiguous_reserve_area(selected_size, selected_base, + selected_limit, + &dma_contiguous_default_area, + fixed); + } +} + +/** + * dma_contiguous_reserve_area() - reserve custom contiguous area + * @size: Size of the reserved area (in bytes), + * @base: Base address of the reserved area optional, use 0 for any + * @limit: End address of the reserved memory (optional, 0 for any). + * @res_cma: Pointer to store the created cma region. + * @fixed: hint about where to place the reserved area + * + * This function reserves memory from early allocator. It should be + * called by arch specific code once the early allocator (memblock or bootmem) + * has been activated and all other subsystems have already allocated/reserved + * memory. This function allows to create custom reserved areas for specific + * devices. + * + * If @fixed is true, reserve contiguous area at exactly @base. If false, + * reserve in range from @base to @limit. + */ +int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, + phys_addr_t limit, struct cma **res_cma, + bool fixed) +{ + int ret; + + ret = cma_declare_contiguous(base, size, limit, 0, 0, fixed, + "reserved", res_cma); + if (ret) + return ret; + + /* Architecture specific contiguous memory fixup. */ + dma_contiguous_early_fixup(cma_get_base(*res_cma), + cma_get_size(*res_cma)); + + return 0; +} + +/** + * dma_alloc_from_contiguous() - allocate pages from contiguous area + * @dev: Pointer to device for which the allocation is performed. + * @count: Requested number of pages. + * @align: Requested alignment of pages (in PAGE_SIZE order). + * @gfp_mask: GFP flags to use for this allocation. + * + * This function allocates memory buffer for specified device. It uses + * device specific contiguous memory area if available or the default + * global one. Requires architecture specific dev_get_cma_area() helper + * function. + */ +struct page *dma_alloc_from_contiguous(struct device *dev, size_t count, + unsigned int align, gfp_t gfp_mask) +{ + if (align > CONFIG_CMA_ALIGNMENT) + align = CONFIG_CMA_ALIGNMENT; + + return cma_alloc(dev_get_cma_area(dev), count, align, gfp_mask); +} + +/** + * dma_release_from_contiguous() - release allocated pages + * @dev: Pointer to device for which the pages were allocated. + * @pages: Allocated pages. + * @count: Number of allocated pages. + * + * This function releases memory allocated by dma_alloc_from_contiguous(). + * It returns false when provided pages do not belong to contiguous area and + * true otherwise. + */ +bool dma_release_from_contiguous(struct device *dev, struct page *pages, + int count) +{ + return cma_release(dev_get_cma_area(dev), pages, count); +} + +/* + * Support for reserved memory regions defined in device tree + */ +#ifdef CONFIG_OF_RESERVED_MEM +#include +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) fmt + +static int rmem_cma_device_init(struct reserved_mem *rmem, struct device *dev) +{ + dev_set_cma_area(dev, rmem->priv); + return 0; +} + +static void rmem_cma_device_release(struct reserved_mem *rmem, + struct device *dev) +{ + dev_set_cma_area(dev, NULL); +} + +static const struct reserved_mem_ops rmem_cma_ops = { + .device_init = rmem_cma_device_init, + .device_release = rmem_cma_device_release, +}; + +static int __init rmem_cma_setup(struct reserved_mem *rmem) +{ + phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); + phys_addr_t mask = align - 1; + unsigned long node = rmem->fdt_node; + struct cma *cma; + int err; + + if (!of_get_flat_dt_prop(node, "reusable", NULL) || + of_get_flat_dt_prop(node, "no-map", NULL)) + return -EINVAL; + + if ((rmem->base & mask) || (rmem->size & mask)) { + pr_err("Reserved memory: incorrect alignment of CMA region\n"); + return -EINVAL; + } + + err = cma_init_reserved_mem(rmem->base, rmem->size, 0, rmem->name, &cma); + if (err) { + pr_err("Reserved memory: unable to setup CMA region\n"); + return err; + } + /* Architecture specific contiguous memory fixup. */ + dma_contiguous_early_fixup(rmem->base, rmem->size); + + if (of_get_flat_dt_prop(node, "linux,cma-default", NULL)) + dma_contiguous_set_default(cma); + + rmem->ops = &rmem_cma_ops; + rmem->priv = cma; + + pr_info("Reserved memory: created CMA memory pool at %pa, size %ld MiB\n", + &rmem->base, (unsigned long)rmem->size / SZ_1M); + + return 0; +} +RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", rmem_cma_setup); +#endif diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c new file mode 100644 index 000000000000..c007d25bee09 --- /dev/null +++ b/kernel/dma/debug.c @@ -0,0 +1,1773 @@ +/* + * Copyright (C) 2008 Advanced Micro Devices, Inc. + * + * Author: Joerg Roedel + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define HASH_SIZE 1024ULL +#define HASH_FN_SHIFT 13 +#define HASH_FN_MASK (HASH_SIZE - 1) + +/* allow architectures to override this if absolutely required */ +#ifndef PREALLOC_DMA_DEBUG_ENTRIES +#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) +#endif + +enum { + dma_debug_single, + dma_debug_page, + dma_debug_sg, + dma_debug_coherent, + dma_debug_resource, +}; + +enum map_err_types { + MAP_ERR_CHECK_NOT_APPLICABLE, + MAP_ERR_NOT_CHECKED, + MAP_ERR_CHECKED, +}; + +#define DMA_DEBUG_STACKTRACE_ENTRIES 5 + +/** + * struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping + * @list: node on pre-allocated free_entries list + * @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent + * @type: single, page, sg, coherent + * @pfn: page frame of the start address + * @offset: offset of mapping relative to pfn + * @size: length of the mapping + * @direction: enum dma_data_direction + * @sg_call_ents: 'nents' from dma_map_sg + * @sg_mapped_ents: 'mapped_ents' from dma_map_sg + * @map_err_type: track whether dma_mapping_error() was checked + * @stacktrace: support backtraces when a violation is detected + */ +struct dma_debug_entry { + struct list_head list; + struct device *dev; + int type; + unsigned long pfn; + size_t offset; + u64 dev_addr; + u64 size; + int direction; + int sg_call_ents; + int sg_mapped_ents; + enum map_err_types map_err_type; +#ifdef CONFIG_STACKTRACE + struct stack_trace stacktrace; + unsigned long st_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; +#endif +}; + +typedef bool (*match_fn)(struct dma_debug_entry *, struct dma_debug_entry *); + +struct hash_bucket { + struct list_head list; + spinlock_t lock; +} ____cacheline_aligned_in_smp; + +/* Hash list to save the allocated dma addresses */ +static struct hash_bucket dma_entry_hash[HASH_SIZE]; +/* List of pre-allocated dma_debug_entry's */ +static LIST_HEAD(free_entries); +/* Lock for the list above */ +static DEFINE_SPINLOCK(free_entries_lock); + +/* Global disable flag - will be set in case of an error */ +static bool global_disable __read_mostly; + +/* Early initialization disable flag, set at the end of dma_debug_init */ +static bool dma_debug_initialized __read_mostly; + +static inline bool dma_debug_disabled(void) +{ + return global_disable || !dma_debug_initialized; +} + +/* Global error count */ +static u32 error_count; + +/* Global error show enable*/ +static u32 show_all_errors __read_mostly; +/* Number of errors to show */ +static u32 show_num_errors = 1; + +static u32 num_free_entries; +static u32 min_free_entries; +static u32 nr_total_entries; + +/* number of preallocated entries requested by kernel cmdline */ +static u32 nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES; + +/* debugfs dentry's for the stuff above */ +static struct dentry *dma_debug_dent __read_mostly; +static struct dentry *global_disable_dent __read_mostly; +static struct dentry *error_count_dent __read_mostly; +static struct dentry *show_all_errors_dent __read_mostly; +static struct dentry *show_num_errors_dent __read_mostly; +static struct dentry *num_free_entries_dent __read_mostly; +static struct dentry *min_free_entries_dent __read_mostly; +static struct dentry *filter_dent __read_mostly; + +/* per-driver filter related state */ + +#define NAME_MAX_LEN 64 + +static char current_driver_name[NAME_MAX_LEN] __read_mostly; +static struct device_driver *current_driver __read_mostly; + +static DEFINE_RWLOCK(driver_name_lock); + +static const char *const maperr2str[] = { + [MAP_ERR_CHECK_NOT_APPLICABLE] = "dma map error check not applicable", + [MAP_ERR_NOT_CHECKED] = "dma map error not checked", + [MAP_ERR_CHECKED] = "dma map error checked", +}; + +static const char *type2name[5] = { "single", "page", + "scather-gather", "coherent", + "resource" }; + +static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", + "DMA_FROM_DEVICE", "DMA_NONE" }; + +/* + * The access to some variables in this macro is racy. We can't use atomic_t + * here because all these variables are exported to debugfs. Some of them even + * writeable. This is also the reason why a lock won't help much. But anyway, + * the races are no big deal. Here is why: + * + * error_count: the addition is racy, but the worst thing that can happen is + * that we don't count some errors + * show_num_errors: the subtraction is racy. Also no big deal because in + * worst case this will result in one warning more in the + * system log than the user configured. This variable is + * writeable via debugfs. + */ +static inline void dump_entry_trace(struct dma_debug_entry *entry) +{ +#ifdef CONFIG_STACKTRACE + if (entry) { + pr_warning("Mapped at:\n"); + print_stack_trace(&entry->stacktrace, 0); + } +#endif +} + +static bool driver_filter(struct device *dev) +{ + struct device_driver *drv; + unsigned long flags; + bool ret; + + /* driver filter off */ + if (likely(!current_driver_name[0])) + return true; + + /* driver filter on and initialized */ + if (current_driver && dev && dev->driver == current_driver) + return true; + + /* driver filter on, but we can't filter on a NULL device... */ + if (!dev) + return false; + + if (current_driver || !current_driver_name[0]) + return false; + + /* driver filter on but not yet initialized */ + drv = dev->driver; + if (!drv) + return false; + + /* lock to protect against change of current_driver_name */ + read_lock_irqsave(&driver_name_lock, flags); + + ret = false; + if (drv->name && + strncmp(current_driver_name, drv->name, NAME_MAX_LEN - 1) == 0) { + current_driver = drv; + ret = true; + } + + read_unlock_irqrestore(&driver_name_lock, flags); + + return ret; +} + +#define err_printk(dev, entry, format, arg...) do { \ + error_count += 1; \ + if (driver_filter(dev) && \ + (show_all_errors || show_num_errors > 0)) { \ + WARN(1, "%s %s: " format, \ + dev ? dev_driver_string(dev) : "NULL", \ + dev ? dev_name(dev) : "NULL", ## arg); \ + dump_entry_trace(entry); \ + } \ + if (!show_all_errors && show_num_errors > 0) \ + show_num_errors -= 1; \ + } while (0); + +/* + * Hash related functions + * + * Every DMA-API request is saved into a struct dma_debug_entry. To + * have quick access to these structs they are stored into a hash. + */ +static int hash_fn(struct dma_debug_entry *entry) +{ + /* + * Hash function is based on the dma address. + * We use bits 20-27 here as the index into the hash + */ + return (entry->dev_addr >> HASH_FN_SHIFT) & HASH_FN_MASK; +} + +/* + * Request exclusive access to a hash bucket for a given dma_debug_entry. + */ +static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry, + unsigned long *flags) + __acquires(&dma_entry_hash[idx].lock) +{ + int idx = hash_fn(entry); + unsigned long __flags; + + spin_lock_irqsave(&dma_entry_hash[idx].lock, __flags); + *flags = __flags; + return &dma_entry_hash[idx]; +} + +/* + * Give up exclusive access to the hash bucket + */ +static void put_hash_bucket(struct hash_bucket *bucket, + unsigned long *flags) + __releases(&bucket->lock) +{ + unsigned long __flags = *flags; + + spin_unlock_irqrestore(&bucket->lock, __flags); +} + +static bool exact_match(struct dma_debug_entry *a, struct dma_debug_entry *b) +{ + return ((a->dev_addr == b->dev_addr) && + (a->dev == b->dev)) ? true : false; +} + +static bool containing_match(struct dma_debug_entry *a, + struct dma_debug_entry *b) +{ + if (a->dev != b->dev) + return false; + + if ((b->dev_addr <= a->dev_addr) && + ((b->dev_addr + b->size) >= (a->dev_addr + a->size))) + return true; + + return false; +} + +/* + * Search a given entry in the hash bucket list + */ +static struct dma_debug_entry *__hash_bucket_find(struct hash_bucket *bucket, + struct dma_debug_entry *ref, + match_fn match) +{ + struct dma_debug_entry *entry, *ret = NULL; + int matches = 0, match_lvl, last_lvl = -1; + + list_for_each_entry(entry, &bucket->list, list) { + if (!match(ref, entry)) + continue; + + /* + * Some drivers map the same physical address multiple + * times. Without a hardware IOMMU this results in the + * same device addresses being put into the dma-debug + * hash multiple times too. This can result in false + * positives being reported. Therefore we implement a + * best-fit algorithm here which returns the entry from + * the hash which fits best to the reference value + * instead of the first-fit. + */ + matches += 1; + match_lvl = 0; + entry->size == ref->size ? ++match_lvl : 0; + entry->type == ref->type ? ++match_lvl : 0; + entry->direction == ref->direction ? ++match_lvl : 0; + entry->sg_call_ents == ref->sg_call_ents ? ++match_lvl : 0; + + if (match_lvl == 4) { + /* perfect-fit - return the result */ + return entry; + } else if (match_lvl > last_lvl) { + /* + * We found an entry that fits better then the + * previous one or it is the 1st match. + */ + last_lvl = match_lvl; + ret = entry; + } + } + + /* + * If we have multiple matches but no perfect-fit, just return + * NULL. + */ + ret = (matches == 1) ? ret : NULL; + + return ret; +} + +static struct dma_debug_entry *bucket_find_exact(struct hash_bucket *bucket, + struct dma_debug_entry *ref) +{ + return __hash_bucket_find(bucket, ref, exact_match); +} + +static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket, + struct dma_debug_entry *ref, + unsigned long *flags) +{ + + unsigned int max_range = dma_get_max_seg_size(ref->dev); + struct dma_debug_entry *entry, index = *ref; + unsigned int range = 0; + + while (range <= max_range) { + entry = __hash_bucket_find(*bucket, ref, containing_match); + + if (entry) + return entry; + + /* + * Nothing found, go back a hash bucket + */ + put_hash_bucket(*bucket, flags); + range += (1 << HASH_FN_SHIFT); + index.dev_addr -= (1 << HASH_FN_SHIFT); + *bucket = get_hash_bucket(&index, flags); + } + + return NULL; +} + +/* + * Add an entry to a hash bucket + */ +static void hash_bucket_add(struct hash_bucket *bucket, + struct dma_debug_entry *entry) +{ + list_add_tail(&entry->list, &bucket->list); +} + +/* + * Remove entry from a hash bucket list + */ +static void hash_bucket_del(struct dma_debug_entry *entry) +{ + list_del(&entry->list); +} + +static unsigned long long phys_addr(struct dma_debug_entry *entry) +{ + if (entry->type == dma_debug_resource) + return __pfn_to_phys(entry->pfn) + entry->offset; + + return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset; +} + +/* + * Dump mapping entries for debugging purposes + */ +void debug_dma_dump_mappings(struct device *dev) +{ + int idx; + + for (idx = 0; idx < HASH_SIZE; idx++) { + struct hash_bucket *bucket = &dma_entry_hash[idx]; + struct dma_debug_entry *entry; + unsigned long flags; + + spin_lock_irqsave(&bucket->lock, flags); + + list_for_each_entry(entry, &bucket->list, list) { + if (!dev || dev == entry->dev) { + dev_info(entry->dev, + "%s idx %d P=%Lx N=%lx D=%Lx L=%Lx %s %s\n", + type2name[entry->type], idx, + phys_addr(entry), entry->pfn, + entry->dev_addr, entry->size, + dir2name[entry->direction], + maperr2str[entry->map_err_type]); + } + } + + spin_unlock_irqrestore(&bucket->lock, flags); + } +} + +/* + * For each mapping (initial cacheline in the case of + * dma_alloc_coherent/dma_map_page, initial cacheline in each page of a + * scatterlist, or the cacheline specified in dma_map_single) insert + * into this tree using the cacheline as the key. At + * dma_unmap_{single|sg|page} or dma_free_coherent delete the entry. If + * the entry already exists at insertion time add a tag as a reference + * count for the overlapping mappings. For now, the overlap tracking + * just ensures that 'unmaps' balance 'maps' before marking the + * cacheline idle, but we should also be flagging overlaps as an API + * violation. + * + * Memory usage is mostly constrained by the maximum number of available + * dma-debug entries in that we need a free dma_debug_entry before + * inserting into the tree. In the case of dma_map_page and + * dma_alloc_coherent there is only one dma_debug_entry and one + * dma_active_cacheline entry to track per event. dma_map_sg(), on the + * other hand, consumes a single dma_debug_entry, but inserts 'nents' + * entries into the tree. + * + * At any time debug_dma_assert_idle() can be called to trigger a + * warning if any cachelines in the given page are in the active set. + */ +static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT); +static DEFINE_SPINLOCK(radix_lock); +#define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1) +#define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT) +#define CACHELINES_PER_PAGE (1 << CACHELINE_PER_PAGE_SHIFT) + +static phys_addr_t to_cacheline_number(struct dma_debug_entry *entry) +{ + return (entry->pfn << CACHELINE_PER_PAGE_SHIFT) + + (entry->offset >> L1_CACHE_SHIFT); +} + +static int active_cacheline_read_overlap(phys_addr_t cln) +{ + int overlap = 0, i; + + for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--) + if (radix_tree_tag_get(&dma_active_cacheline, cln, i)) + overlap |= 1 << i; + return overlap; +} + +static int active_cacheline_set_overlap(phys_addr_t cln, int overlap) +{ + int i; + + if (overlap > ACTIVE_CACHELINE_MAX_OVERLAP || overlap < 0) + return overlap; + + for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--) + if (overlap & 1 << i) + radix_tree_tag_set(&dma_active_cacheline, cln, i); + else + radix_tree_tag_clear(&dma_active_cacheline, cln, i); + + return overlap; +} + +static void active_cacheline_inc_overlap(phys_addr_t cln) +{ + int overlap = active_cacheline_read_overlap(cln); + + overlap = active_cacheline_set_overlap(cln, ++overlap); + + /* If we overflowed the overlap counter then we're potentially + * leaking dma-mappings. Otherwise, if maps and unmaps are + * balanced then this overflow may cause false negatives in + * debug_dma_assert_idle() as the cacheline may be marked idle + * prematurely. + */ + WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP, + "DMA-API: exceeded %d overlapping mappings of cacheline %pa\n", + ACTIVE_CACHELINE_MAX_OVERLAP, &cln); +} + +static int active_cacheline_dec_overlap(phys_addr_t cln) +{ + int overlap = active_cacheline_read_overlap(cln); + + return active_cacheline_set_overlap(cln, --overlap); +} + +static int active_cacheline_insert(struct dma_debug_entry *entry) +{ + phys_addr_t cln = to_cacheline_number(entry); + unsigned long flags; + int rc; + + /* If the device is not writing memory then we don't have any + * concerns about the cpu consuming stale data. This mitigates + * legitimate usages of overlapping mappings. + */ + if (entry->direction == DMA_TO_DEVICE) + return 0; + + spin_lock_irqsave(&radix_lock, flags); + rc = radix_tree_insert(&dma_active_cacheline, cln, entry); + if (rc == -EEXIST) + active_cacheline_inc_overlap(cln); + spin_unlock_irqrestore(&radix_lock, flags); + + return rc; +} + +static void active_cacheline_remove(struct dma_debug_entry *entry) +{ + phys_addr_t cln = to_cacheline_number(entry); + unsigned long flags; + + /* ...mirror the insert case */ + if (entry->direction == DMA_TO_DEVICE) + return; + + spin_lock_irqsave(&radix_lock, flags); + /* since we are counting overlaps the final put of the + * cacheline will occur when the overlap count is 0. + * active_cacheline_dec_overlap() returns -1 in that case + */ + if (active_cacheline_dec_overlap(cln) < 0) + radix_tree_delete(&dma_active_cacheline, cln); + spin_unlock_irqrestore(&radix_lock, flags); +} + +/** + * debug_dma_assert_idle() - assert that a page is not undergoing dma + * @page: page to lookup in the dma_active_cacheline tree + * + * Place a call to this routine in cases where the cpu touching the page + * before the dma completes (page is dma_unmapped) will lead to data + * corruption. + */ +void debug_dma_assert_idle(struct page *page) +{ + static struct dma_debug_entry *ents[CACHELINES_PER_PAGE]; + struct dma_debug_entry *entry = NULL; + void **results = (void **) &ents; + unsigned int nents, i; + unsigned long flags; + phys_addr_t cln; + + if (dma_debug_disabled()) + return; + + if (!page) + return; + + cln = (phys_addr_t) page_to_pfn(page) << CACHELINE_PER_PAGE_SHIFT; + spin_lock_irqsave(&radix_lock, flags); + nents = radix_tree_gang_lookup(&dma_active_cacheline, results, cln, + CACHELINES_PER_PAGE); + for (i = 0; i < nents; i++) { + phys_addr_t ent_cln = to_cacheline_number(ents[i]); + + if (ent_cln == cln) { + entry = ents[i]; + break; + } else if (ent_cln >= cln + CACHELINES_PER_PAGE) + break; + } + spin_unlock_irqrestore(&radix_lock, flags); + + if (!entry) + return; + + cln = to_cacheline_number(entry); + err_printk(entry->dev, entry, + "DMA-API: cpu touching an active dma mapped cacheline [cln=%pa]\n", + &cln); +} + +/* + * Wrapper function for adding an entry to the hash. + * This function takes care of locking itself. + */ +static void add_dma_entry(struct dma_debug_entry *entry) +{ + struct hash_bucket *bucket; + unsigned long flags; + int rc; + + bucket = get_hash_bucket(entry, &flags); + hash_bucket_add(bucket, entry); + put_hash_bucket(bucket, &flags); + + rc = active_cacheline_insert(entry); + if (rc == -ENOMEM) { + pr_err("DMA-API: cacheline tracking ENOMEM, dma-debug disabled\n"); + global_disable = true; + } + + /* TODO: report -EEXIST errors here as overlapping mappings are + * not supported by the DMA API + */ +} + +static struct dma_debug_entry *__dma_entry_alloc(void) +{ + struct dma_debug_entry *entry; + + entry = list_entry(free_entries.next, struct dma_debug_entry, list); + list_del(&entry->list); + memset(entry, 0, sizeof(*entry)); + + num_free_entries -= 1; + if (num_free_entries < min_free_entries) + min_free_entries = num_free_entries; + + return entry; +} + +/* struct dma_entry allocator + * + * The next two functions implement the allocator for + * struct dma_debug_entries. + */ +static struct dma_debug_entry *dma_entry_alloc(void) +{ + struct dma_debug_entry *entry; + unsigned long flags; + + spin_lock_irqsave(&free_entries_lock, flags); + + if (list_empty(&free_entries)) { + global_disable = true; + spin_unlock_irqrestore(&free_entries_lock, flags); + pr_err("DMA-API: debugging out of memory - disabling\n"); + return NULL; + } + + entry = __dma_entry_alloc(); + + spin_unlock_irqrestore(&free_entries_lock, flags); + +#ifdef CONFIG_STACKTRACE + entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES; + entry->stacktrace.entries = entry->st_entries; + entry->stacktrace.skip = 2; + save_stack_trace(&entry->stacktrace); +#endif + + return entry; +} + +static void dma_entry_free(struct dma_debug_entry *entry) +{ + unsigned long flags; + + active_cacheline_remove(entry); + + /* + * add to beginning of the list - this way the entries are + * more likely cache hot when they are reallocated. + */ + spin_lock_irqsave(&free_entries_lock, flags); + list_add(&entry->list, &free_entries); + num_free_entries += 1; + spin_unlock_irqrestore(&free_entries_lock, flags); +} + +int dma_debug_resize_entries(u32 num_entries) +{ + int i, delta, ret = 0; + unsigned long flags; + struct dma_debug_entry *entry; + LIST_HEAD(tmp); + + spin_lock_irqsave(&free_entries_lock, flags); + + if (nr_total_entries < num_entries) { + delta = num_entries - nr_total_entries; + + spin_unlock_irqrestore(&free_entries_lock, flags); + + for (i = 0; i < delta; i++) { + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + break; + + list_add_tail(&entry->list, &tmp); + } + + spin_lock_irqsave(&free_entries_lock, flags); + + list_splice(&tmp, &free_entries); + nr_total_entries += i; + num_free_entries += i; + } else { + delta = nr_total_entries - num_entries; + + for (i = 0; i < delta && !list_empty(&free_entries); i++) { + entry = __dma_entry_alloc(); + kfree(entry); + } + + nr_total_entries -= i; + } + + if (nr_total_entries != num_entries) + ret = 1; + + spin_unlock_irqrestore(&free_entries_lock, flags); + + return ret; +} + +/* + * DMA-API debugging init code + * + * The init code does two things: + * 1. Initialize core data structures + * 2. Preallocate a given number of dma_debug_entry structs + */ + +static int prealloc_memory(u32 num_entries) +{ + struct dma_debug_entry *entry, *next_entry; + int i; + + for (i = 0; i < num_entries; ++i) { + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto out_err; + + list_add_tail(&entry->list, &free_entries); + } + + num_free_entries = num_entries; + min_free_entries = num_entries; + + pr_info("DMA-API: preallocated %d debug entries\n", num_entries); + + return 0; + +out_err: + + list_for_each_entry_safe(entry, next_entry, &free_entries, list) { + list_del(&entry->list); + kfree(entry); + } + + return -ENOMEM; +} + +static ssize_t filter_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[NAME_MAX_LEN + 1]; + unsigned long flags; + int len; + + if (!current_driver_name[0]) + return 0; + + /* + * We can't copy to userspace directly because current_driver_name can + * only be read under the driver_name_lock with irqs disabled. So + * create a temporary copy first. + */ + read_lock_irqsave(&driver_name_lock, flags); + len = scnprintf(buf, NAME_MAX_LEN + 1, "%s\n", current_driver_name); + read_unlock_irqrestore(&driver_name_lock, flags); + + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +static ssize_t filter_write(struct file *file, const char __user *userbuf, + size_t count, loff_t *ppos) +{ + char buf[NAME_MAX_LEN]; + unsigned long flags; + size_t len; + int i; + + /* + * We can't copy from userspace directly. Access to + * current_driver_name is protected with a write_lock with irqs + * disabled. Since copy_from_user can fault and may sleep we + * need to copy to temporary buffer first + */ + len = min(count, (size_t)(NAME_MAX_LEN - 1)); + if (copy_from_user(buf, userbuf, len)) + return -EFAULT; + + buf[len] = 0; + + write_lock_irqsave(&driver_name_lock, flags); + + /* + * Now handle the string we got from userspace very carefully. + * The rules are: + * - only use the first token we got + * - token delimiter is everything looking like a space + * character (' ', '\n', '\t' ...) + * + */ + if (!isalnum(buf[0])) { + /* + * If the first character userspace gave us is not + * alphanumerical then assume the filter should be + * switched off. + */ + if (current_driver_name[0]) + pr_info("DMA-API: switching off dma-debug driver filter\n"); + current_driver_name[0] = 0; + current_driver = NULL; + goto out_unlock; + } + + /* + * Now parse out the first token and use it as the name for the + * driver to filter for. + */ + for (i = 0; i < NAME_MAX_LEN - 1; ++i) { + current_driver_name[i] = buf[i]; + if (isspace(buf[i]) || buf[i] == ' ' || buf[i] == 0) + break; + } + current_driver_name[i] = 0; + current_driver = NULL; + + pr_info("DMA-API: enable driver filter for driver [%s]\n", + current_driver_name); + +out_unlock: + write_unlock_irqrestore(&driver_name_lock, flags); + + return count; +} + +static const struct file_operations filter_fops = { + .read = filter_read, + .write = filter_write, + .llseek = default_llseek, +}; + +static int dma_debug_fs_init(void) +{ + dma_debug_dent = debugfs_create_dir("dma-api", NULL); + if (!dma_debug_dent) { + pr_err("DMA-API: can not create debugfs directory\n"); + return -ENOMEM; + } + + global_disable_dent = debugfs_create_bool("disabled", 0444, + dma_debug_dent, + &global_disable); + if (!global_disable_dent) + goto out_err; + + error_count_dent = debugfs_create_u32("error_count", 0444, + dma_debug_dent, &error_count); + if (!error_count_dent) + goto out_err; + + show_all_errors_dent = debugfs_create_u32("all_errors", 0644, + dma_debug_dent, + &show_all_errors); + if (!show_all_errors_dent) + goto out_err; + + show_num_errors_dent = debugfs_create_u32("num_errors", 0644, + dma_debug_dent, + &show_num_errors); + if (!show_num_errors_dent) + goto out_err; + + num_free_entries_dent = debugfs_create_u32("num_free_entries", 0444, + dma_debug_dent, + &num_free_entries); + if (!num_free_entries_dent) + goto out_err; + + min_free_entries_dent = debugfs_create_u32("min_free_entries", 0444, + dma_debug_dent, + &min_free_entries); + if (!min_free_entries_dent) + goto out_err; + + filter_dent = debugfs_create_file("driver_filter", 0644, + dma_debug_dent, NULL, &filter_fops); + if (!filter_dent) + goto out_err; + + return 0; + +out_err: + debugfs_remove_recursive(dma_debug_dent); + + return -ENOMEM; +} + +static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry) +{ + struct dma_debug_entry *entry; + unsigned long flags; + int count = 0, i; + + for (i = 0; i < HASH_SIZE; ++i) { + spin_lock_irqsave(&dma_entry_hash[i].lock, flags); + list_for_each_entry(entry, &dma_entry_hash[i].list, list) { + if (entry->dev == dev) { + count += 1; + *out_entry = entry; + } + } + spin_unlock_irqrestore(&dma_entry_hash[i].lock, flags); + } + + return count; +} + +static int dma_debug_device_change(struct notifier_block *nb, unsigned long action, void *data) +{ + struct device *dev = data; + struct dma_debug_entry *uninitialized_var(entry); + int count; + + if (dma_debug_disabled()) + return 0; + + switch (action) { + case BUS_NOTIFY_UNBOUND_DRIVER: + count = device_dma_allocations(dev, &entry); + if (count == 0) + break; + err_printk(dev, entry, "DMA-API: device driver has pending " + "DMA allocations while released from device " + "[count=%d]\n" + "One of leaked entries details: " + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped with %s] [mapped as %s]\n", + count, entry->dev_addr, entry->size, + dir2name[entry->direction], type2name[entry->type]); + break; + default: + break; + } + + return 0; +} + +void dma_debug_add_bus(struct bus_type *bus) +{ + struct notifier_block *nb; + + if (dma_debug_disabled()) + return; + + nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL); + if (nb == NULL) { + pr_err("dma_debug_add_bus: out of memory\n"); + return; + } + + nb->notifier_call = dma_debug_device_change; + + bus_register_notifier(bus, nb); +} + +static int dma_debug_init(void) +{ + int i; + + /* Do not use dma_debug_initialized here, since we really want to be + * called to set dma_debug_initialized + */ + if (global_disable) + return 0; + + for (i = 0; i < HASH_SIZE; ++i) { + INIT_LIST_HEAD(&dma_entry_hash[i].list); + spin_lock_init(&dma_entry_hash[i].lock); + } + + if (dma_debug_fs_init() != 0) { + pr_err("DMA-API: error creating debugfs entries - disabling\n"); + global_disable = true; + + return 0; + } + + if (prealloc_memory(nr_prealloc_entries) != 0) { + pr_err("DMA-API: debugging out of memory error - disabled\n"); + global_disable = true; + + return 0; + } + + nr_total_entries = num_free_entries; + + dma_debug_initialized = true; + + pr_info("DMA-API: debugging enabled by kernel config\n"); + return 0; +} +core_initcall(dma_debug_init); + +static __init int dma_debug_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (strncmp(str, "off", 3) == 0) { + pr_info("DMA-API: debugging disabled on kernel command line\n"); + global_disable = true; + } + + return 0; +} + +static __init int dma_debug_entries_cmdline(char *str) +{ + if (!str) + return -EINVAL; + if (!get_option(&str, &nr_prealloc_entries)) + nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES; + return 0; +} + +__setup("dma_debug=", dma_debug_cmdline); +__setup("dma_debug_entries=", dma_debug_entries_cmdline); + +static void check_unmap(struct dma_debug_entry *ref) +{ + struct dma_debug_entry *entry; + struct hash_bucket *bucket; + unsigned long flags; + + bucket = get_hash_bucket(ref, &flags); + entry = bucket_find_exact(bucket, ref); + + if (!entry) { + /* must drop lock before calling dma_mapping_error */ + put_hash_bucket(bucket, &flags); + + if (dma_mapping_error(ref->dev, ref->dev_addr)) { + err_printk(ref->dev, NULL, + "DMA-API: device driver tries to free an " + "invalid DMA memory address\n"); + } else { + err_printk(ref->dev, NULL, + "DMA-API: device driver tries to free DMA " + "memory it has not allocated [device " + "address=0x%016llx] [size=%llu bytes]\n", + ref->dev_addr, ref->size); + } + return; + } + + if (ref->size != entry->size) { + err_printk(ref->dev, entry, "DMA-API: device driver frees " + "DMA memory with different size " + "[device address=0x%016llx] [map size=%llu bytes] " + "[unmap size=%llu bytes]\n", + ref->dev_addr, entry->size, ref->size); + } + + if (ref->type != entry->type) { + err_printk(ref->dev, entry, "DMA-API: device driver frees " + "DMA memory with wrong function " + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped as %s] [unmapped as %s]\n", + ref->dev_addr, ref->size, + type2name[entry->type], type2name[ref->type]); + } else if ((entry->type == dma_debug_coherent) && + (phys_addr(ref) != phys_addr(entry))) { + err_printk(ref->dev, entry, "DMA-API: device driver frees " + "DMA memory with different CPU address " + "[device address=0x%016llx] [size=%llu bytes] " + "[cpu alloc address=0x%016llx] " + "[cpu free address=0x%016llx]", + ref->dev_addr, ref->size, + phys_addr(entry), + phys_addr(ref)); + } + + if (ref->sg_call_ents && ref->type == dma_debug_sg && + ref->sg_call_ents != entry->sg_call_ents) { + err_printk(ref->dev, entry, "DMA-API: device driver frees " + "DMA sg list with different entry count " + "[map count=%d] [unmap count=%d]\n", + entry->sg_call_ents, ref->sg_call_ents); + } + + /* + * This may be no bug in reality - but most implementations of the + * DMA API don't handle this properly, so check for it here + */ + if (ref->direction != entry->direction) { + err_printk(ref->dev, entry, "DMA-API: device driver frees " + "DMA memory with different direction " + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped with %s] [unmapped with %s]\n", + ref->dev_addr, ref->size, + dir2name[entry->direction], + dir2name[ref->direction]); + } + + /* + * Drivers should use dma_mapping_error() to check the returned + * addresses of dma_map_single() and dma_map_page(). + * If not, print this warning message. See Documentation/DMA-API.txt. + */ + if (entry->map_err_type == MAP_ERR_NOT_CHECKED) { + err_printk(ref->dev, entry, + "DMA-API: device driver failed to check map error" + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped as %s]", + ref->dev_addr, ref->size, + type2name[entry->type]); + } + + hash_bucket_del(entry); + dma_entry_free(entry); + + put_hash_bucket(bucket, &flags); +} + +static void check_for_stack(struct device *dev, + struct page *page, size_t offset) +{ + void *addr; + struct vm_struct *stack_vm_area = task_stack_vm_area(current); + + if (!stack_vm_area) { + /* Stack is direct-mapped. */ + if (PageHighMem(page)) + return; + addr = page_address(page) + offset; + if (object_is_on_stack(addr)) + err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [addr=%p]\n", addr); + } else { + /* Stack is vmalloced. */ + int i; + + for (i = 0; i < stack_vm_area->nr_pages; i++) { + if (page != stack_vm_area->pages[i]) + continue; + + addr = (u8 *)current->stack + i * PAGE_SIZE + offset; + err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [probable addr=%p]\n", addr); + break; + } + } +} + +static inline bool overlap(void *addr, unsigned long len, void *start, void *end) +{ + unsigned long a1 = (unsigned long)addr; + unsigned long b1 = a1 + len; + unsigned long a2 = (unsigned long)start; + unsigned long b2 = (unsigned long)end; + + return !(b1 <= a2 || a1 >= b2); +} + +static void check_for_illegal_area(struct device *dev, void *addr, unsigned long len) +{ + if (overlap(addr, len, _stext, _etext) || + overlap(addr, len, __start_rodata, __end_rodata)) + err_printk(dev, NULL, "DMA-API: device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len); +} + +static void check_sync(struct device *dev, + struct dma_debug_entry *ref, + bool to_cpu) +{ + struct dma_debug_entry *entry; + struct hash_bucket *bucket; + unsigned long flags; + + bucket = get_hash_bucket(ref, &flags); + + entry = bucket_find_contain(&bucket, ref, &flags); + + if (!entry) { + err_printk(dev, NULL, "DMA-API: device driver tries " + "to sync DMA memory it has not allocated " + "[device address=0x%016llx] [size=%llu bytes]\n", + (unsigned long long)ref->dev_addr, ref->size); + goto out; + } + + if (ref->size > entry->size) { + err_printk(dev, entry, "DMA-API: device driver syncs" + " DMA memory outside allocated range " + "[device address=0x%016llx] " + "[allocation size=%llu bytes] " + "[sync offset+size=%llu]\n", + entry->dev_addr, entry->size, + ref->size); + } + + if (entry->direction == DMA_BIDIRECTIONAL) + goto out; + + if (ref->direction != entry->direction) { + err_printk(dev, entry, "DMA-API: device driver syncs " + "DMA memory with different direction " + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped with %s] [synced with %s]\n", + (unsigned long long)ref->dev_addr, entry->size, + dir2name[entry->direction], + dir2name[ref->direction]); + } + + if (to_cpu && !(entry->direction == DMA_FROM_DEVICE) && + !(ref->direction == DMA_TO_DEVICE)) + err_printk(dev, entry, "DMA-API: device driver syncs " + "device read-only DMA memory for cpu " + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped with %s] [synced with %s]\n", + (unsigned long long)ref->dev_addr, entry->size, + dir2name[entry->direction], + dir2name[ref->direction]); + + if (!to_cpu && !(entry->direction == DMA_TO_DEVICE) && + !(ref->direction == DMA_FROM_DEVICE)) + err_printk(dev, entry, "DMA-API: device driver syncs " + "device write-only DMA memory to device " + "[device address=0x%016llx] [size=%llu bytes] " + "[mapped with %s] [synced with %s]\n", + (unsigned long long)ref->dev_addr, entry->size, + dir2name[entry->direction], + dir2name[ref->direction]); + + if (ref->sg_call_ents && ref->type == dma_debug_sg && + ref->sg_call_ents != entry->sg_call_ents) { + err_printk(ref->dev, entry, "DMA-API: device driver syncs " + "DMA sg list with different entry count " + "[map count=%d] [sync count=%d]\n", + entry->sg_call_ents, ref->sg_call_ents); + } + +out: + put_hash_bucket(bucket, &flags); +} + +static void check_sg_segment(struct device *dev, struct scatterlist *sg) +{ +#ifdef CONFIG_DMA_API_DEBUG_SG + unsigned int max_seg = dma_get_max_seg_size(dev); + u64 start, end, boundary = dma_get_seg_boundary(dev); + + /* + * Either the driver forgot to set dma_parms appropriately, or + * whoever generated the list forgot to check them. + */ + if (sg->length > max_seg) + err_printk(dev, NULL, "DMA-API: mapping sg segment longer than device claims to support [len=%u] [max=%u]\n", + sg->length, max_seg); + /* + * In some cases this could potentially be the DMA API + * implementation's fault, but it would usually imply that + * the scatterlist was built inappropriately to begin with. + */ + start = sg_dma_address(sg); + end = start + sg_dma_len(sg) - 1; + if ((start ^ end) & ~boundary) + err_printk(dev, NULL, "DMA-API: mapping sg segment across boundary [start=0x%016llx] [end=0x%016llx] [boundary=0x%016llx]\n", + start, end, boundary); +#endif +} + +void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, + size_t size, int direction, dma_addr_t dma_addr, + bool map_single) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + if (dma_mapping_error(dev, dma_addr)) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->dev = dev; + entry->type = dma_debug_page; + entry->pfn = page_to_pfn(page); + entry->offset = offset, + entry->dev_addr = dma_addr; + entry->size = size; + entry->direction = direction; + entry->map_err_type = MAP_ERR_NOT_CHECKED; + + if (map_single) + entry->type = dma_debug_single; + + check_for_stack(dev, page, offset); + + if (!PageHighMem(page)) { + void *addr = page_address(page) + offset; + + check_for_illegal_area(dev, addr, size); + } + + add_dma_entry(entry); +} +EXPORT_SYMBOL(debug_dma_map_page); + +void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + struct dma_debug_entry ref; + struct dma_debug_entry *entry; + struct hash_bucket *bucket; + unsigned long flags; + + if (unlikely(dma_debug_disabled())) + return; + + ref.dev = dev; + ref.dev_addr = dma_addr; + bucket = get_hash_bucket(&ref, &flags); + + list_for_each_entry(entry, &bucket->list, list) { + if (!exact_match(&ref, entry)) + continue; + + /* + * The same physical address can be mapped multiple + * times. Without a hardware IOMMU this results in the + * same device addresses being put into the dma-debug + * hash multiple times too. This can result in false + * positives being reported. Therefore we implement a + * best-fit algorithm here which updates the first entry + * from the hash which fits the reference value and is + * not currently listed as being checked. + */ + if (entry->map_err_type == MAP_ERR_NOT_CHECKED) { + entry->map_err_type = MAP_ERR_CHECKED; + break; + } + } + + put_hash_bucket(bucket, &flags); +} +EXPORT_SYMBOL(debug_dma_mapping_error); + +void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, int direction, bool map_single) +{ + struct dma_debug_entry ref = { + .type = dma_debug_page, + .dev = dev, + .dev_addr = addr, + .size = size, + .direction = direction, + }; + + if (unlikely(dma_debug_disabled())) + return; + + if (map_single) + ref.type = dma_debug_single; + + check_unmap(&ref); +} +EXPORT_SYMBOL(debug_dma_unmap_page); + +void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, int mapped_ents, int direction) +{ + struct dma_debug_entry *entry; + struct scatterlist *s; + int i; + + if (unlikely(dma_debug_disabled())) + return; + + for_each_sg(sg, s, mapped_ents, i) { + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_sg; + entry->dev = dev; + entry->pfn = page_to_pfn(sg_page(s)); + entry->offset = s->offset, + entry->size = sg_dma_len(s); + entry->dev_addr = sg_dma_address(s); + entry->direction = direction; + entry->sg_call_ents = nents; + entry->sg_mapped_ents = mapped_ents; + + check_for_stack(dev, sg_page(s), s->offset); + + if (!PageHighMem(sg_page(s))) { + check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s)); + } + + check_sg_segment(dev, s); + + add_dma_entry(entry); + } +} +EXPORT_SYMBOL(debug_dma_map_sg); + +static int get_nr_mapped_entries(struct device *dev, + struct dma_debug_entry *ref) +{ + struct dma_debug_entry *entry; + struct hash_bucket *bucket; + unsigned long flags; + int mapped_ents; + + bucket = get_hash_bucket(ref, &flags); + entry = bucket_find_exact(bucket, ref); + mapped_ents = 0; + + if (entry) + mapped_ents = entry->sg_mapped_ents; + put_hash_bucket(bucket, &flags); + + return mapped_ents; +} + +void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, + int nelems, int dir) +{ + struct scatterlist *s; + int mapped_ents = 0, i; + + if (unlikely(dma_debug_disabled())) + return; + + for_each_sg(sglist, s, nelems, i) { + + struct dma_debug_entry ref = { + .type = dma_debug_sg, + .dev = dev, + .pfn = page_to_pfn(sg_page(s)), + .offset = s->offset, + .dev_addr = sg_dma_address(s), + .size = sg_dma_len(s), + .direction = dir, + .sg_call_ents = nelems, + }; + + if (mapped_ents && i >= mapped_ents) + break; + + if (!i) + mapped_ents = get_nr_mapped_entries(dev, &ref); + + check_unmap(&ref); + } +} +EXPORT_SYMBOL(debug_dma_unmap_sg); + +void debug_dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t dma_addr, void *virt) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + if (unlikely(virt == NULL)) + return; + + /* handle vmalloc and linear addresses */ + if (!is_vmalloc_addr(virt) && !virt_addr_valid(virt)) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_coherent; + entry->dev = dev; + entry->offset = offset_in_page(virt); + entry->size = size; + entry->dev_addr = dma_addr; + entry->direction = DMA_BIDIRECTIONAL; + + if (is_vmalloc_addr(virt)) + entry->pfn = vmalloc_to_pfn(virt); + else + entry->pfn = page_to_pfn(virt_to_page(virt)); + + add_dma_entry(entry); +} +EXPORT_SYMBOL(debug_dma_alloc_coherent); + +void debug_dma_free_coherent(struct device *dev, size_t size, + void *virt, dma_addr_t addr) +{ + struct dma_debug_entry ref = { + .type = dma_debug_coherent, + .dev = dev, + .offset = offset_in_page(virt), + .dev_addr = addr, + .size = size, + .direction = DMA_BIDIRECTIONAL, + }; + + /* handle vmalloc and linear addresses */ + if (!is_vmalloc_addr(virt) && !virt_addr_valid(virt)) + return; + + if (is_vmalloc_addr(virt)) + ref.pfn = vmalloc_to_pfn(virt); + else + ref.pfn = page_to_pfn(virt_to_page(virt)); + + if (unlikely(dma_debug_disabled())) + return; + + check_unmap(&ref); +} +EXPORT_SYMBOL(debug_dma_free_coherent); + +void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, + int direction, dma_addr_t dma_addr) +{ + struct dma_debug_entry *entry; + + if (unlikely(dma_debug_disabled())) + return; + + entry = dma_entry_alloc(); + if (!entry) + return; + + entry->type = dma_debug_resource; + entry->dev = dev; + entry->pfn = PHYS_PFN(addr); + entry->offset = offset_in_page(addr); + entry->size = size; + entry->dev_addr = dma_addr; + entry->direction = direction; + entry->map_err_type = MAP_ERR_NOT_CHECKED; + + add_dma_entry(entry); +} +EXPORT_SYMBOL(debug_dma_map_resource); + +void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, + size_t size, int direction) +{ + struct dma_debug_entry ref = { + .type = dma_debug_resource, + .dev = dev, + .dev_addr = dma_addr, + .size = size, + .direction = direction, + }; + + if (unlikely(dma_debug_disabled())) + return; + + check_unmap(&ref); +} +EXPORT_SYMBOL(debug_dma_unmap_resource); + +void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, + size_t size, int direction) +{ + struct dma_debug_entry ref; + + if (unlikely(dma_debug_disabled())) + return; + + ref.type = dma_debug_single; + ref.dev = dev; + ref.dev_addr = dma_handle; + ref.size = size; + ref.direction = direction; + ref.sg_call_ents = 0; + + check_sync(dev, &ref, true); +} +EXPORT_SYMBOL(debug_dma_sync_single_for_cpu); + +void debug_dma_sync_single_for_device(struct device *dev, + dma_addr_t dma_handle, size_t size, + int direction) +{ + struct dma_debug_entry ref; + + if (unlikely(dma_debug_disabled())) + return; + + ref.type = dma_debug_single; + ref.dev = dev; + ref.dev_addr = dma_handle; + ref.size = size; + ref.direction = direction; + ref.sg_call_ents = 0; + + check_sync(dev, &ref, false); +} +EXPORT_SYMBOL(debug_dma_sync_single_for_device); + +void debug_dma_sync_single_range_for_cpu(struct device *dev, + dma_addr_t dma_handle, + unsigned long offset, size_t size, + int direction) +{ + struct dma_debug_entry ref; + + if (unlikely(dma_debug_disabled())) + return; + + ref.type = dma_debug_single; + ref.dev = dev; + ref.dev_addr = dma_handle; + ref.size = offset + size; + ref.direction = direction; + ref.sg_call_ents = 0; + + check_sync(dev, &ref, true); +} +EXPORT_SYMBOL(debug_dma_sync_single_range_for_cpu); + +void debug_dma_sync_single_range_for_device(struct device *dev, + dma_addr_t dma_handle, + unsigned long offset, + size_t size, int direction) +{ + struct dma_debug_entry ref; + + if (unlikely(dma_debug_disabled())) + return; + + ref.type = dma_debug_single; + ref.dev = dev; + ref.dev_addr = dma_handle; + ref.size = offset + size; + ref.direction = direction; + ref.sg_call_ents = 0; + + check_sync(dev, &ref, false); +} +EXPORT_SYMBOL(debug_dma_sync_single_range_for_device); + +void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, + int nelems, int direction) +{ + struct scatterlist *s; + int mapped_ents = 0, i; + + if (unlikely(dma_debug_disabled())) + return; + + for_each_sg(sg, s, nelems, i) { + + struct dma_debug_entry ref = { + .type = dma_debug_sg, + .dev = dev, + .pfn = page_to_pfn(sg_page(s)), + .offset = s->offset, + .dev_addr = sg_dma_address(s), + .size = sg_dma_len(s), + .direction = direction, + .sg_call_ents = nelems, + }; + + if (!i) + mapped_ents = get_nr_mapped_entries(dev, &ref); + + if (i >= mapped_ents) + break; + + check_sync(dev, &ref, true); + } +} +EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu); + +void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, + int nelems, int direction) +{ + struct scatterlist *s; + int mapped_ents = 0, i; + + if (unlikely(dma_debug_disabled())) + return; + + for_each_sg(sg, s, nelems, i) { + + struct dma_debug_entry ref = { + .type = dma_debug_sg, + .dev = dev, + .pfn = page_to_pfn(sg_page(s)), + .offset = s->offset, + .dev_addr = sg_dma_address(s), + .size = sg_dma_len(s), + .direction = direction, + .sg_call_ents = nelems, + }; + if (!i) + mapped_ents = get_nr_mapped_entries(dev, &ref); + + if (i >= mapped_ents) + break; + + check_sync(dev, &ref, false); + } +} +EXPORT_SYMBOL(debug_dma_sync_sg_for_device); + +static int __init dma_debug_driver_setup(char *str) +{ + int i; + + for (i = 0; i < NAME_MAX_LEN - 1; ++i, ++str) { + current_driver_name[i] = *str; + if (*str == 0) + break; + } + + if (current_driver_name[0]) + pr_info("DMA-API: enable driver filter for driver [%s]\n", + current_driver_name); + + + return 1; +} +__setup("dma_debug_driver=", dma_debug_driver_setup); diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c new file mode 100644 index 000000000000..8be8106270c2 --- /dev/null +++ b/kernel/dma/direct.c @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DMA operations that map physical memory directly without using an IOMMU or + * flushing caches. + */ +#include +#include +#include +#include +#include +#include +#include + +#define DIRECT_MAPPING_ERROR 0 + +/* + * Most architectures use ZONE_DMA for the first 16 Megabytes, but + * some use it for entirely different regions: + */ +#ifndef ARCH_ZONE_DMA_BITS +#define ARCH_ZONE_DMA_BITS 24 +#endif + +/* + * For AMD SEV all DMA must be to unencrypted addresses. + */ +static inline bool force_dma_unencrypted(void) +{ + return sev_active(); +} + +static bool +check_addr(struct device *dev, dma_addr_t dma_addr, size_t size, + const char *caller) +{ + if (unlikely(dev && !dma_capable(dev, dma_addr, size))) { + if (!dev->dma_mask) { + dev_err(dev, + "%s: call on device without dma_mask\n", + caller); + return false; + } + + if (*dev->dma_mask >= DMA_BIT_MASK(32)) { + dev_err(dev, + "%s: overflow %pad+%zu of device mask %llx\n", + caller, &dma_addr, size, *dev->dma_mask); + } + return false; + } + return true; +} + +static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) +{ + dma_addr_t addr = force_dma_unencrypted() ? + __phys_to_dma(dev, phys) : phys_to_dma(dev, phys); + return addr + size - 1 <= dev->coherent_dma_mask; +} + +void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, unsigned long attrs) +{ + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + int page_order = get_order(size); + struct page *page = NULL; + void *ret; + + /* we always manually zero the memory once we are done: */ + gfp &= ~__GFP_ZERO; + + /* GFP_DMA32 and GFP_DMA are no ops without the corresponding zones: */ + if (dev->coherent_dma_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) + gfp |= GFP_DMA; + if (dev->coherent_dma_mask <= DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) + gfp |= GFP_DMA32; + +again: + /* CMA can be used only in the context which permits sleeping */ + if (gfpflags_allow_blocking(gfp)) { + page = dma_alloc_from_contiguous(dev, count, page_order, gfp); + if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + dma_release_from_contiguous(dev, page, count); + page = NULL; + } + } + if (!page) + page = alloc_pages_node(dev_to_node(dev), gfp, page_order); + + if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + __free_pages(page, page_order); + page = NULL; + + if (IS_ENABLED(CONFIG_ZONE_DMA32) && + dev->coherent_dma_mask < DMA_BIT_MASK(64) && + !(gfp & (GFP_DMA32 | GFP_DMA))) { + gfp |= GFP_DMA32; + goto again; + } + + if (IS_ENABLED(CONFIG_ZONE_DMA) && + dev->coherent_dma_mask < DMA_BIT_MASK(32) && + !(gfp & GFP_DMA)) { + gfp = (gfp & ~GFP_DMA32) | GFP_DMA; + goto again; + } + } + + if (!page) + return NULL; + ret = page_address(page); + if (force_dma_unencrypted()) { + set_memory_decrypted((unsigned long)ret, 1 << page_order); + *dma_handle = __phys_to_dma(dev, page_to_phys(page)); + } else { + *dma_handle = phys_to_dma(dev, page_to_phys(page)); + } + memset(ret, 0, size); + return ret; +} + +/* + * NOTE: this function must never look at the dma_addr argument, because we want + * to be able to use it as a helper for iommu implementations as well. + */ +void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, + dma_addr_t dma_addr, unsigned long attrs) +{ + unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned int page_order = get_order(size); + + if (force_dma_unencrypted()) + set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); + if (!dma_release_from_contiguous(dev, virt_to_page(cpu_addr), count)) + free_pages((unsigned long)cpu_addr, page_order); +} + +dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + dma_addr_t dma_addr = phys_to_dma(dev, page_to_phys(page)) + offset; + + if (!check_addr(dev, dma_addr, size, __func__)) + return DIRECT_MAPPING_ERROR; + return dma_addr; +} + +int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, + enum dma_data_direction dir, unsigned long attrs) +{ + int i; + struct scatterlist *sg; + + for_each_sg(sgl, sg, nents, i) { + BUG_ON(!sg_page(sg)); + + sg_dma_address(sg) = phys_to_dma(dev, sg_phys(sg)); + if (!check_addr(dev, sg_dma_address(sg), sg->length, __func__)) + return 0; + sg_dma_len(sg) = sg->length; + } + + return nents; +} + +int dma_direct_supported(struct device *dev, u64 mask) +{ +#ifdef CONFIG_ZONE_DMA + if (mask < DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) + return 0; +#else + /* + * Because 32-bit DMA masks are so common we expect every architecture + * to be able to satisfy them - either by not supporting more physical + * memory, or by providing a ZONE_DMA32. If neither is the case, the + * architecture needs to use an IOMMU instead of the direct mapping. + */ + if (mask < DMA_BIT_MASK(32)) + return 0; +#endif + /* + * Various PCI/PCIe bridges have broken support for > 32bit DMA even + * if the device itself might support it. + */ + if (dev->dma_32bit_limit && mask > DMA_BIT_MASK(32)) + return 0; + return 1; +} + +int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return dma_addr == DIRECT_MAPPING_ERROR; +} + +const struct dma_map_ops dma_direct_ops = { + .alloc = dma_direct_alloc, + .free = dma_direct_free, + .map_page = dma_direct_map_page, + .map_sg = dma_direct_map_sg, + .dma_supported = dma_direct_supported, + .mapping_error = dma_direct_mapping_error, +}; +EXPORT_SYMBOL(dma_direct_ops); diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c new file mode 100644 index 000000000000..d2a92ddaac4d --- /dev/null +++ b/kernel/dma/mapping.c @@ -0,0 +1,345 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * arch-independent dma-mapping routines + * + * Copyright (c) 2006 SUSE Linux Products GmbH + * Copyright (c) 2006 Tejun Heo + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Managed DMA API + */ +struct dma_devres { + size_t size; + void *vaddr; + dma_addr_t dma_handle; + unsigned long attrs; +}; + +static void dmam_release(struct device *dev, void *res) +{ + struct dma_devres *this = res; + + dma_free_attrs(dev, this->size, this->vaddr, this->dma_handle, + this->attrs); +} + +static int dmam_match(struct device *dev, void *res, void *match_data) +{ + struct dma_devres *this = res, *match = match_data; + + if (this->vaddr == match->vaddr) { + WARN_ON(this->size != match->size || + this->dma_handle != match->dma_handle); + return 1; + } + return 0; +} + +/** + * dmam_alloc_coherent - Managed dma_alloc_coherent() + * @dev: Device to allocate coherent memory for + * @size: Size of allocation + * @dma_handle: Out argument for allocated DMA handle + * @gfp: Allocation flags + * + * Managed dma_alloc_coherent(). Memory allocated using this function + * will be automatically released on driver detach. + * + * RETURNS: + * Pointer to allocated memory on success, NULL on failure. + */ +void *dmam_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) +{ + struct dma_devres *dr; + void *vaddr; + + dr = devres_alloc(dmam_release, sizeof(*dr), gfp); + if (!dr) + return NULL; + + vaddr = dma_alloc_coherent(dev, size, dma_handle, gfp); + if (!vaddr) { + devres_free(dr); + return NULL; + } + + dr->vaddr = vaddr; + dr->dma_handle = *dma_handle; + dr->size = size; + + devres_add(dev, dr); + + return vaddr; +} +EXPORT_SYMBOL(dmam_alloc_coherent); + +/** + * dmam_free_coherent - Managed dma_free_coherent() + * @dev: Device to free coherent memory for + * @size: Size of allocation + * @vaddr: Virtual address of the memory to free + * @dma_handle: DMA handle of the memory to free + * + * Managed dma_free_coherent(). + */ +void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle) +{ + struct dma_devres match_data = { size, vaddr, dma_handle }; + + dma_free_coherent(dev, size, vaddr, dma_handle); + WARN_ON(devres_destroy(dev, dmam_release, dmam_match, &match_data)); +} +EXPORT_SYMBOL(dmam_free_coherent); + +/** + * dmam_alloc_attrs - Managed dma_alloc_attrs() + * @dev: Device to allocate non_coherent memory for + * @size: Size of allocation + * @dma_handle: Out argument for allocated DMA handle + * @gfp: Allocation flags + * @attrs: Flags in the DMA_ATTR_* namespace. + * + * Managed dma_alloc_attrs(). Memory allocated using this function will be + * automatically released on driver detach. + * + * RETURNS: + * Pointer to allocated memory on success, NULL on failure. + */ +void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, unsigned long attrs) +{ + struct dma_devres *dr; + void *vaddr; + + dr = devres_alloc(dmam_release, sizeof(*dr), gfp); + if (!dr) + return NULL; + + vaddr = dma_alloc_attrs(dev, size, dma_handle, gfp, attrs); + if (!vaddr) { + devres_free(dr); + return NULL; + } + + dr->vaddr = vaddr; + dr->dma_handle = *dma_handle; + dr->size = size; + dr->attrs = attrs; + + devres_add(dev, dr); + + return vaddr; +} +EXPORT_SYMBOL(dmam_alloc_attrs); + +#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT + +static void dmam_coherent_decl_release(struct device *dev, void *res) +{ + dma_release_declared_memory(dev); +} + +/** + * dmam_declare_coherent_memory - Managed dma_declare_coherent_memory() + * @dev: Device to declare coherent memory for + * @phys_addr: Physical address of coherent memory to be declared + * @device_addr: Device address of coherent memory to be declared + * @size: Size of coherent memory to be declared + * @flags: Flags + * + * Managed dma_declare_coherent_memory(). + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, + dma_addr_t device_addr, size_t size, int flags) +{ + void *res; + int rc; + + res = devres_alloc(dmam_coherent_decl_release, 0, GFP_KERNEL); + if (!res) + return -ENOMEM; + + rc = dma_declare_coherent_memory(dev, phys_addr, device_addr, size, + flags); + if (!rc) + devres_add(dev, res); + else + devres_free(res); + + return rc; +} +EXPORT_SYMBOL(dmam_declare_coherent_memory); + +/** + * dmam_release_declared_memory - Managed dma_release_declared_memory(). + * @dev: Device to release declared coherent memory for + * + * Managed dmam_release_declared_memory(). + */ +void dmam_release_declared_memory(struct device *dev) +{ + WARN_ON(devres_destroy(dev, dmam_coherent_decl_release, NULL, NULL)); +} +EXPORT_SYMBOL(dmam_release_declared_memory); + +#endif + +/* + * Create scatter-list for the already allocated DMA buffer. + */ +int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t handle, size_t size) +{ + struct page *page = virt_to_page(cpu_addr); + int ret; + + ret = sg_alloc_table(sgt, 1, GFP_KERNEL); + if (unlikely(ret)) + return ret; + + sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); + return 0; +} +EXPORT_SYMBOL(dma_common_get_sgtable); + +/* + * Create userspace mapping for the DMA-coherent memory. + */ +int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size) +{ + int ret = -ENXIO; +#ifndef CONFIG_ARCH_NO_COHERENT_DMA_MMAP + unsigned long user_count = vma_pages(vma); + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long off = vma->vm_pgoff; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) + return ret; + + if (off < count && user_count <= (count - off)) + ret = remap_pfn_range(vma, vma->vm_start, + page_to_pfn(virt_to_page(cpu_addr)) + off, + user_count << PAGE_SHIFT, + vma->vm_page_prot); +#endif /* !CONFIG_ARCH_NO_COHERENT_DMA_MMAP */ + + return ret; +} +EXPORT_SYMBOL(dma_common_mmap); + +#ifdef CONFIG_MMU +static struct vm_struct *__dma_common_pages_remap(struct page **pages, + size_t size, unsigned long vm_flags, pgprot_t prot, + const void *caller) +{ + struct vm_struct *area; + + area = get_vm_area_caller(size, vm_flags, caller); + if (!area) + return NULL; + + if (map_vm_area(area, prot, pages)) { + vunmap(area->addr); + return NULL; + } + + return area; +} + +/* + * remaps an array of PAGE_SIZE pages into another vm_area + * Cannot be used in non-sleeping contexts + */ +void *dma_common_pages_remap(struct page **pages, size_t size, + unsigned long vm_flags, pgprot_t prot, + const void *caller) +{ + struct vm_struct *area; + + area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller); + if (!area) + return NULL; + + area->pages = pages; + + return area->addr; +} + +/* + * remaps an allocated contiguous region into another vm_area. + * Cannot be used in non-sleeping contexts + */ + +void *dma_common_contiguous_remap(struct page *page, size_t size, + unsigned long vm_flags, + pgprot_t prot, const void *caller) +{ + int i; + struct page **pages; + struct vm_struct *area; + + pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL); + if (!pages) + return NULL; + + for (i = 0; i < (size >> PAGE_SHIFT); i++) + pages[i] = nth_page(page, i); + + area = __dma_common_pages_remap(pages, size, vm_flags, prot, caller); + + kfree(pages); + + if (!area) + return NULL; + return area->addr; +} + +/* + * unmaps a range previously mapped by dma_common_*_remap + */ +void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags) +{ + struct vm_struct *area = find_vm_area(cpu_addr); + + if (!area || (area->flags & vm_flags) != vm_flags) { + WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr); + return; + } + + unmap_kernel_range((unsigned long)cpu_addr, PAGE_ALIGN(size)); + vunmap(cpu_addr); +} +#endif + +/* + * enables DMA API use for a device + */ +int dma_configure(struct device *dev) +{ + if (dev->bus->dma_configure) + return dev->bus->dma_configure(dev); + return 0; +} + +void dma_deconfigure(struct device *dev) +{ + of_dma_deconfigure(dev); + acpi_dma_deconfigure(dev); +} diff --git a/kernel/dma/noncoherent.c b/kernel/dma/noncoherent.c new file mode 100644 index 000000000000..79e9a757387f --- /dev/null +++ b/kernel/dma/noncoherent.c @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2018 Christoph Hellwig. + * + * DMA operations that map physical memory directly without providing cache + * coherence. + */ +#include +#include +#include +#include +#include + +static void dma_noncoherent_sync_single_for_device(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir); +} + +static void dma_noncoherent_sync_sg_for_device(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nents, i) + arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir); +} + +static dma_addr_t dma_noncoherent_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + dma_addr_t addr; + + addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); + if (!dma_mapping_error(dev, addr) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(dev, page_to_phys(page) + offset, + size, dir); + return addr; +} + +static int dma_noncoherent_map_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + nents = dma_direct_map_sg(dev, sgl, nents, dir, attrs); + if (nents > 0 && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_noncoherent_sync_sg_for_device(dev, sgl, nents, dir); + return nents; +} + +#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU +static void dma_noncoherent_sync_single_for_cpu(struct device *dev, + dma_addr_t addr, size_t size, enum dma_data_direction dir) +{ + arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir); +} + +static void dma_noncoherent_sync_sg_for_cpu(struct device *dev, + struct scatterlist *sgl, int nents, enum dma_data_direction dir) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nents, i) + arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); +} + +static void dma_noncoherent_unmap_page(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, unsigned long attrs) +{ + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_noncoherent_sync_single_for_cpu(dev, addr, size, dir); +} + +static void dma_noncoherent_unmap_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, unsigned long attrs) +{ + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + dma_noncoherent_sync_sg_for_cpu(dev, sgl, nents, dir); +} +#endif + +const struct dma_map_ops dma_noncoherent_ops = { + .alloc = arch_dma_alloc, + .free = arch_dma_free, + .mmap = arch_dma_mmap, + .sync_single_for_device = dma_noncoherent_sync_single_for_device, + .sync_sg_for_device = dma_noncoherent_sync_sg_for_device, + .map_page = dma_noncoherent_map_page, + .map_sg = dma_noncoherent_map_sg, +#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU + .sync_single_for_cpu = dma_noncoherent_sync_single_for_cpu, + .sync_sg_for_cpu = dma_noncoherent_sync_sg_for_cpu, + .unmap_page = dma_noncoherent_unmap_page, + .unmap_sg = dma_noncoherent_unmap_sg, +#endif + .dma_supported = dma_direct_supported, + .mapping_error = dma_direct_mapping_error, + .cache_sync = arch_dma_cache_sync, +}; +EXPORT_SYMBOL(dma_noncoherent_ops); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c new file mode 100644 index 000000000000..04b68d9dffac --- /dev/null +++ b/kernel/dma/swiotlb.c @@ -0,0 +1,1087 @@ +/* + * Dynamic DMA mapping support. + * + * This implementation is a fallback for platforms that do not support + * I/O TLBs (aka DMA address translation hardware). + * Copyright (C) 2000 Asit Mallick + * Copyright (C) 2000 Goutham Rao + * Copyright (C) 2000, 2003 Hewlett-Packard Co + * David Mosberger-Tang + * + * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API. + * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid + * unnecessary i-cache flushing. + * 04/07/.. ak Better overflow handling. Assorted fixes. + * 05/09/10 linville Add support for syncing ranges, support syncing for + * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup. + * 08/12/11 beckyb Add highmem support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#define OFFSET(val,align) ((unsigned long) \ + ( (val) & ( (align) - 1))) + +#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) + +/* + * Minimum IO TLB size to bother booting with. Systems with mainly + * 64bit capable cards will only lightly use the swiotlb. If we can't + * allocate a contiguous 1MB, we're probably in trouble anyway. + */ +#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) + +enum swiotlb_force swiotlb_force; + +/* + * Used to do a quick range check in swiotlb_tbl_unmap_single and + * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this + * API. + */ +static phys_addr_t io_tlb_start, io_tlb_end; + +/* + * The number of IO TLB blocks (in groups of 64) between io_tlb_start and + * io_tlb_end. This is command line adjustable via setup_io_tlb_npages. + */ +static unsigned long io_tlb_nslabs; + +/* + * When the IOMMU overflows we return a fallback buffer. This sets the size. + */ +static unsigned long io_tlb_overflow = 32*1024; + +static phys_addr_t io_tlb_overflow_buffer; + +/* + * This is a free list describing the number of free entries available from + * each index + */ +static unsigned int *io_tlb_list; +static unsigned int io_tlb_index; + +/* + * Max segment that we can provide which (if pages are contingous) will + * not be bounced (unless SWIOTLB_FORCE is set). + */ +unsigned int max_segment; + +/* + * We need to save away the original address corresponding to a mapped entry + * for the sync operations. + */ +#define INVALID_PHYS_ADDR (~(phys_addr_t)0) +static phys_addr_t *io_tlb_orig_addr; + +/* + * Protect the above data structures in the map and unmap calls + */ +static DEFINE_SPINLOCK(io_tlb_lock); + +static int late_alloc; + +static int __init +setup_io_tlb_npages(char *str) +{ + if (isdigit(*str)) { + io_tlb_nslabs = simple_strtoul(str, &str, 0); + /* avoid tail segment of size < IO_TLB_SEGSIZE */ + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + } + if (*str == ',') + ++str; + if (!strcmp(str, "force")) { + swiotlb_force = SWIOTLB_FORCE; + } else if (!strcmp(str, "noforce")) { + swiotlb_force = SWIOTLB_NO_FORCE; + io_tlb_nslabs = 1; + } + + return 0; +} +early_param("swiotlb", setup_io_tlb_npages); +/* make io_tlb_overflow tunable too? */ + +unsigned long swiotlb_nr_tbl(void) +{ + return io_tlb_nslabs; +} +EXPORT_SYMBOL_GPL(swiotlb_nr_tbl); + +unsigned int swiotlb_max_segment(void) +{ + return max_segment; +} +EXPORT_SYMBOL_GPL(swiotlb_max_segment); + +void swiotlb_set_max_segment(unsigned int val) +{ + if (swiotlb_force == SWIOTLB_FORCE) + max_segment = 1; + else + max_segment = rounddown(val, PAGE_SIZE); +} + +/* default to 64MB */ +#define IO_TLB_DEFAULT_SIZE (64UL<<20) +unsigned long swiotlb_size_or_default(void) +{ + unsigned long size; + + size = io_tlb_nslabs << IO_TLB_SHIFT; + + return size ? size : (IO_TLB_DEFAULT_SIZE); +} + +static bool no_iotlb_memory; + +void swiotlb_print_info(void) +{ + unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; + unsigned char *vstart, *vend; + + if (no_iotlb_memory) { + pr_warn("software IO TLB: No low mem\n"); + return; + } + + vstart = phys_to_virt(io_tlb_start); + vend = phys_to_virt(io_tlb_end); + + printk(KERN_INFO "software IO TLB [mem %#010llx-%#010llx] (%luMB) mapped at [%p-%p]\n", + (unsigned long long)io_tlb_start, + (unsigned long long)io_tlb_end, + bytes >> 20, vstart, vend - 1); +} + +/* + * Early SWIOTLB allocation may be too early to allow an architecture to + * perform the desired operations. This function allows the architecture to + * call SWIOTLB when the operations are possible. It needs to be called + * before the SWIOTLB memory is used. + */ +void __init swiotlb_update_mem_attributes(void) +{ + void *vaddr; + unsigned long bytes; + + if (no_iotlb_memory || late_alloc) + return; + + vaddr = phys_to_virt(io_tlb_start); + bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT); + set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); + memset(vaddr, 0, bytes); + + vaddr = phys_to_virt(io_tlb_overflow_buffer); + bytes = PAGE_ALIGN(io_tlb_overflow); + set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); + memset(vaddr, 0, bytes); +} + +int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) +{ + void *v_overflow_buffer; + unsigned long i, bytes; + + bytes = nslabs << IO_TLB_SHIFT; + + io_tlb_nslabs = nslabs; + io_tlb_start = __pa(tlb); + io_tlb_end = io_tlb_start + bytes; + + /* + * Get the overflow emergency buffer + */ + v_overflow_buffer = memblock_virt_alloc_low_nopanic( + PAGE_ALIGN(io_tlb_overflow), + PAGE_SIZE); + if (!v_overflow_buffer) + return -ENOMEM; + + io_tlb_overflow_buffer = __pa(v_overflow_buffer); + + /* + * Allocate and initialize the free list array. This array is used + * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE + * between io_tlb_start and io_tlb_end. + */ + io_tlb_list = memblock_virt_alloc( + PAGE_ALIGN(io_tlb_nslabs * sizeof(int)), + PAGE_SIZE); + io_tlb_orig_addr = memblock_virt_alloc( + PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)), + PAGE_SIZE); + for (i = 0; i < io_tlb_nslabs; i++) { + io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + } + io_tlb_index = 0; + + if (verbose) + swiotlb_print_info(); + + swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT); + return 0; +} + +/* + * Statically reserve bounce buffer space and initialize bounce buffer data + * structures for the software IO TLB used to implement the DMA API. + */ +void __init +swiotlb_init(int verbose) +{ + size_t default_size = IO_TLB_DEFAULT_SIZE; + unsigned char *vstart; + unsigned long bytes; + + if (!io_tlb_nslabs) { + io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + } + + bytes = io_tlb_nslabs << IO_TLB_SHIFT; + + /* Get IO TLB memory from the low pages */ + vstart = memblock_virt_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE); + if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) + return; + + if (io_tlb_start) + memblock_free_early(io_tlb_start, + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + pr_warn("Cannot allocate SWIOTLB buffer"); + no_iotlb_memory = true; +} + +/* + * Systems with larger DMA zones (those that don't support ISA) can + * initialize the swiotlb later using the slab allocator if needed. + * This should be just like above, but with some error catching. + */ +int +swiotlb_late_init_with_default_size(size_t default_size) +{ + unsigned long bytes, req_nslabs = io_tlb_nslabs; + unsigned char *vstart = NULL; + unsigned int order; + int rc = 0; + + if (!io_tlb_nslabs) { + io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + } + + /* + * Get IO TLB memory from the low pages + */ + order = get_order(io_tlb_nslabs << IO_TLB_SHIFT); + io_tlb_nslabs = SLABS_PER_PAGE << order; + bytes = io_tlb_nslabs << IO_TLB_SHIFT; + + while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { + vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, + order); + if (vstart) + break; + order--; + } + + if (!vstart) { + io_tlb_nslabs = req_nslabs; + return -ENOMEM; + } + if (order != get_order(bytes)) { + printk(KERN_WARNING "Warning: only able to allocate %ld MB " + "for software IO TLB\n", (PAGE_SIZE << order) >> 20); + io_tlb_nslabs = SLABS_PER_PAGE << order; + } + rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs); + if (rc) + free_pages((unsigned long)vstart, order); + + return rc; +} + +int +swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) +{ + unsigned long i, bytes; + unsigned char *v_overflow_buffer; + + bytes = nslabs << IO_TLB_SHIFT; + + io_tlb_nslabs = nslabs; + io_tlb_start = virt_to_phys(tlb); + io_tlb_end = io_tlb_start + bytes; + + set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); + memset(tlb, 0, bytes); + + /* + * Get the overflow emergency buffer + */ + v_overflow_buffer = (void *)__get_free_pages(GFP_DMA, + get_order(io_tlb_overflow)); + if (!v_overflow_buffer) + goto cleanup2; + + set_memory_decrypted((unsigned long)v_overflow_buffer, + io_tlb_overflow >> PAGE_SHIFT); + memset(v_overflow_buffer, 0, io_tlb_overflow); + io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer); + + /* + * Allocate and initialize the free list array. This array is used + * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE + * between io_tlb_start and io_tlb_end. + */ + io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, + get_order(io_tlb_nslabs * sizeof(int))); + if (!io_tlb_list) + goto cleanup3; + + io_tlb_orig_addr = (phys_addr_t *) + __get_free_pages(GFP_KERNEL, + get_order(io_tlb_nslabs * + sizeof(phys_addr_t))); + if (!io_tlb_orig_addr) + goto cleanup4; + + for (i = 0; i < io_tlb_nslabs; i++) { + io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + } + io_tlb_index = 0; + + swiotlb_print_info(); + + late_alloc = 1; + + swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT); + + return 0; + +cleanup4: + free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * + sizeof(int))); + io_tlb_list = NULL; +cleanup3: + free_pages((unsigned long)v_overflow_buffer, + get_order(io_tlb_overflow)); + io_tlb_overflow_buffer = 0; +cleanup2: + io_tlb_end = 0; + io_tlb_start = 0; + io_tlb_nslabs = 0; + max_segment = 0; + return -ENOMEM; +} + +void __init swiotlb_exit(void) +{ + if (!io_tlb_orig_addr) + return; + + if (late_alloc) { + free_pages((unsigned long)phys_to_virt(io_tlb_overflow_buffer), + get_order(io_tlb_overflow)); + free_pages((unsigned long)io_tlb_orig_addr, + get_order(io_tlb_nslabs * sizeof(phys_addr_t))); + free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * + sizeof(int))); + free_pages((unsigned long)phys_to_virt(io_tlb_start), + get_order(io_tlb_nslabs << IO_TLB_SHIFT)); + } else { + memblock_free_late(io_tlb_overflow_buffer, + PAGE_ALIGN(io_tlb_overflow)); + memblock_free_late(__pa(io_tlb_orig_addr), + PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); + memblock_free_late(__pa(io_tlb_list), + PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); + memblock_free_late(io_tlb_start, + PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + } + io_tlb_nslabs = 0; + max_segment = 0; +} + +int is_swiotlb_buffer(phys_addr_t paddr) +{ + return paddr >= io_tlb_start && paddr < io_tlb_end; +} + +/* + * Bounce: copy the swiotlb buffer back to the original dma location + */ +static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir) +{ + unsigned long pfn = PFN_DOWN(orig_addr); + unsigned char *vaddr = phys_to_virt(tlb_addr); + + if (PageHighMem(pfn_to_page(pfn))) { + /* The buffer does not have a mapping. Map it in and copy */ + unsigned int offset = orig_addr & ~PAGE_MASK; + char *buffer; + unsigned int sz = 0; + unsigned long flags; + + while (size) { + sz = min_t(size_t, PAGE_SIZE - offset, size); + + local_irq_save(flags); + buffer = kmap_atomic(pfn_to_page(pfn)); + if (dir == DMA_TO_DEVICE) + memcpy(vaddr, buffer + offset, sz); + else + memcpy(buffer + offset, vaddr, sz); + kunmap_atomic(buffer); + local_irq_restore(flags); + + size -= sz; + pfn++; + vaddr += sz; + offset = 0; + } + } else if (dir == DMA_TO_DEVICE) { + memcpy(vaddr, phys_to_virt(orig_addr), size); + } else { + memcpy(phys_to_virt(orig_addr), vaddr, size); + } +} + +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, + dma_addr_t tbl_dma_addr, + phys_addr_t orig_addr, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + unsigned long flags; + phys_addr_t tlb_addr; + unsigned int nslots, stride, index, wrap; + int i; + unsigned long mask; + unsigned long offset_slots; + unsigned long max_slots; + + if (no_iotlb_memory) + panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); + + if (mem_encrypt_active()) + pr_warn_once("%s is active and system is using DMA bounce buffers\n", + sme_active() ? "SME" : "SEV"); + + mask = dma_get_seg_boundary(hwdev); + + tbl_dma_addr &= mask; + + offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + + /* + * Carefully handle integer overflow which can occur when mask == ~0UL. + */ + max_slots = mask + 1 + ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT + : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); + + /* + * For mappings greater than or equal to a page, we limit the stride + * (and hence alignment) to a page size. + */ + nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + if (size >= PAGE_SIZE) + stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); + else + stride = 1; + + BUG_ON(!nslots); + + /* + * Find suitable number of IO TLB entries size that will fit this + * request and allocate a buffer from that IO TLB pool. + */ + spin_lock_irqsave(&io_tlb_lock, flags); + index = ALIGN(io_tlb_index, stride); + if (index >= io_tlb_nslabs) + index = 0; + wrap = index; + + do { + while (iommu_is_span_boundary(index, nslots, offset_slots, + max_slots)) { + index += stride; + if (index >= io_tlb_nslabs) + index = 0; + if (index == wrap) + goto not_found; + } + + /* + * If we find a slot that indicates we have 'nslots' number of + * contiguous buffers, we allocate the buffers from that slot + * and mark the entries as '0' indicating unavailable. + */ + if (io_tlb_list[index] >= nslots) { + int count = 0; + + for (i = index; i < (int) (index + nslots); i++) + io_tlb_list[i] = 0; + for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) + io_tlb_list[i] = ++count; + tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT); + + /* + * Update the indices to avoid searching in the next + * round. + */ + io_tlb_index = ((index + nslots) < io_tlb_nslabs + ? (index + nslots) : 0); + + goto found; + } + index += stride; + if (index >= io_tlb_nslabs) + index = 0; + } while (index != wrap); + +not_found: + spin_unlock_irqrestore(&io_tlb_lock, flags); + if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) + dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size); + return SWIOTLB_MAP_ERROR; +found: + spin_unlock_irqrestore(&io_tlb_lock, flags); + + /* + * Save away the mapping from the original address to the DMA address. + * This is needed when we sync the memory. Then we sync the buffer if + * needed. + */ + for (i = 0; i < nslots; i++) + io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) + swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); + + return tlb_addr; +} + +/* + * Allocates bounce buffer and returns its physical address. + */ +static phys_addr_t +map_single(struct device *hwdev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + dma_addr_t start_dma_addr; + + if (swiotlb_force == SWIOTLB_NO_FORCE) { + dev_warn_ratelimited(hwdev, "Cannot do DMA to address %pa\n", + &phys); + return SWIOTLB_MAP_ERROR; + } + + start_dma_addr = __phys_to_dma(hwdev, io_tlb_start); + return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, + dir, attrs); +} + +/* + * tlb_addr is the physical address of the bounce buffer to unmap. + */ +void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + unsigned long flags; + int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; + int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; + phys_addr_t orig_addr = io_tlb_orig_addr[index]; + + /* + * First, sync the memory before unmapping the entry + */ + if (orig_addr != INVALID_PHYS_ADDR && + !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) + swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE); + + /* + * Return the buffer to the free list by setting the corresponding + * entries to indicate the number of contiguous entries available. + * While returning the entries to the free list, we merge the entries + * with slots below and above the pool being returned. + */ + spin_lock_irqsave(&io_tlb_lock, flags); + { + count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? + io_tlb_list[index + nslots] : 0); + /* + * Step 1: return the slots to the free list, merging the + * slots with superceeding slots + */ + for (i = index + nslots - 1; i >= index; i--) { + io_tlb_list[i] = ++count; + io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + } + /* + * Step 2: merge the returned slots with the preceding slots, + * if available (non zero) + */ + for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--) + io_tlb_list[i] = ++count; + } + spin_unlock_irqrestore(&io_tlb_lock, flags); +} + +void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, + size_t size, enum dma_data_direction dir, + enum dma_sync_target target) +{ + int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; + phys_addr_t orig_addr = io_tlb_orig_addr[index]; + + if (orig_addr == INVALID_PHYS_ADDR) + return; + orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1); + + switch (target) { + case SYNC_FOR_CPU: + if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) + swiotlb_bounce(orig_addr, tlb_addr, + size, DMA_FROM_DEVICE); + else + BUG_ON(dir != DMA_TO_DEVICE); + break; + case SYNC_FOR_DEVICE: + if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) + swiotlb_bounce(orig_addr, tlb_addr, + size, DMA_TO_DEVICE); + else + BUG_ON(dir != DMA_FROM_DEVICE); + break; + default: + BUG(); + } +} + +static inline bool dma_coherent_ok(struct device *dev, dma_addr_t addr, + size_t size) +{ + u64 mask = DMA_BIT_MASK(32); + + if (dev && dev->coherent_dma_mask) + mask = dev->coherent_dma_mask; + return addr + size - 1 <= mask; +} + +static void * +swiotlb_alloc_buffer(struct device *dev, size_t size, dma_addr_t *dma_handle, + unsigned long attrs) +{ + phys_addr_t phys_addr; + + if (swiotlb_force == SWIOTLB_NO_FORCE) + goto out_warn; + + phys_addr = swiotlb_tbl_map_single(dev, + __phys_to_dma(dev, io_tlb_start), + 0, size, DMA_FROM_DEVICE, attrs); + if (phys_addr == SWIOTLB_MAP_ERROR) + goto out_warn; + + *dma_handle = __phys_to_dma(dev, phys_addr); + if (!dma_coherent_ok(dev, *dma_handle, size)) + goto out_unmap; + + memset(phys_to_virt(phys_addr), 0, size); + return phys_to_virt(phys_addr); + +out_unmap: + dev_warn(dev, "hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n", + (unsigned long long)dev->coherent_dma_mask, + (unsigned long long)*dma_handle); + + /* + * DMA_TO_DEVICE to avoid memcpy in unmap_single. + * DMA_ATTR_SKIP_CPU_SYNC is optional. + */ + swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC); +out_warn: + if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) { + dev_warn(dev, + "swiotlb: coherent allocation failed, size=%zu\n", + size); + dump_stack(); + } + return NULL; +} + +static bool swiotlb_free_buffer(struct device *dev, size_t size, + dma_addr_t dma_addr) +{ + phys_addr_t phys_addr = dma_to_phys(dev, dma_addr); + + WARN_ON_ONCE(irqs_disabled()); + + if (!is_swiotlb_buffer(phys_addr)) + return false; + + /* + * DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single. + * DMA_ATTR_SKIP_CPU_SYNC is optional. + */ + swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC); + return true; +} + +static void +swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir, + int do_panic) +{ + if (swiotlb_force == SWIOTLB_NO_FORCE) + return; + + /* + * Ran out of IOMMU space for this operation. This is very bad. + * Unfortunately the drivers cannot handle this operation properly. + * unless they check for dma_mapping_error (most don't) + * When the mapping is small enough return a static buffer to limit + * the damage, or panic when the transfer is too big. + */ + dev_err_ratelimited(dev, "DMA: Out of SW-IOMMU space for %zu bytes\n", + size); + + if (size <= io_tlb_overflow || !do_panic) + return; + + if (dir == DMA_BIDIRECTIONAL) + panic("DMA: Random memory could be DMA accessed\n"); + if (dir == DMA_FROM_DEVICE) + panic("DMA: Random memory could be DMA written\n"); + if (dir == DMA_TO_DEVICE) + panic("DMA: Random memory could be DMA read\n"); +} + +/* + * Map a single buffer of the indicated size for DMA in streaming mode. The + * physical address to use is returned. + * + * Once the device is given the dma address, the device owns this memory until + * either swiotlb_unmap_page or swiotlb_dma_sync_single is performed. + */ +dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + phys_addr_t map, phys = page_to_phys(page) + offset; + dma_addr_t dev_addr = phys_to_dma(dev, phys); + + BUG_ON(dir == DMA_NONE); + /* + * If the address happens to be in the device's DMA window, + * we can safely return the device addr and not worry about bounce + * buffering it. + */ + if (dma_capable(dev, dev_addr, size) && swiotlb_force != SWIOTLB_FORCE) + return dev_addr; + + trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); + + /* Oh well, have to allocate and map a bounce buffer. */ + map = map_single(dev, phys, size, dir, attrs); + if (map == SWIOTLB_MAP_ERROR) { + swiotlb_full(dev, size, dir, 1); + return __phys_to_dma(dev, io_tlb_overflow_buffer); + } + + dev_addr = __phys_to_dma(dev, map); + + /* Ensure that the address returned is DMA'ble */ + if (dma_capable(dev, dev_addr, size)) + return dev_addr; + + attrs |= DMA_ATTR_SKIP_CPU_SYNC; + swiotlb_tbl_unmap_single(dev, map, size, dir, attrs); + + return __phys_to_dma(dev, io_tlb_overflow_buffer); +} + +/* + * Unmap a single streaming mode DMA translation. The dma_addr and size must + * match what was provided for in a previous swiotlb_map_page call. All + * other usages are undefined. + * + * After this call, reads by the cpu to the buffer are guaranteed to see + * whatever the device wrote there. + */ +static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); + + BUG_ON(dir == DMA_NONE); + + if (is_swiotlb_buffer(paddr)) { + swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs); + return; + } + + if (dir != DMA_FROM_DEVICE) + return; + + /* + * phys_to_virt doesn't work with hihgmem page but we could + * call dma_mark_clean() with hihgmem page here. However, we + * are fine since dma_mark_clean() is null on POWERPC. We can + * make dma_mark_clean() take a physical address if necessary. + */ + dma_mark_clean(phys_to_virt(paddr), size); +} + +void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + unmap_single(hwdev, dev_addr, size, dir, attrs); +} + +/* + * Make physical memory consistent for a single streaming mode DMA translation + * after a transfer. + * + * If you perform a swiotlb_map_page() but wish to interrogate the buffer + * using the cpu, yet do not wish to teardown the dma mapping, you must + * call this function before doing so. At the next point you give the dma + * address back to the card, you must first perform a + * swiotlb_dma_sync_for_device, and then the device again owns the buffer + */ +static void +swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + enum dma_sync_target target) +{ + phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); + + BUG_ON(dir == DMA_NONE); + + if (is_swiotlb_buffer(paddr)) { + swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); + return; + } + + if (dir != DMA_FROM_DEVICE) + return; + + dma_mark_clean(phys_to_virt(paddr), size); +} + +void +swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir) +{ + swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU); +} + +void +swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir) +{ + swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE); +} + +/* + * Map a set of buffers described by scatterlist in streaming mode for DMA. + * This is the scatter-gather version of the above swiotlb_map_page + * interface. Here the scatter gather list elements are each tagged with the + * appropriate dma address and length. They are obtained via + * sg_dma_{address,length}(SG). + * + * NOTE: An implementation may be able to use a smaller number of + * DMA address/length pairs than there are SG table elements. + * (for example via virtual mapping capabilities) + * The routine returns the number of addr/length pairs actually + * used, at most nents. + * + * Device ownership issues as mentioned above for swiotlb_map_page are the + * same here. + */ +int +swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, + enum dma_data_direction dir, unsigned long attrs) +{ + struct scatterlist *sg; + int i; + + BUG_ON(dir == DMA_NONE); + + for_each_sg(sgl, sg, nelems, i) { + phys_addr_t paddr = sg_phys(sg); + dma_addr_t dev_addr = phys_to_dma(hwdev, paddr); + + if (swiotlb_force == SWIOTLB_FORCE || + !dma_capable(hwdev, dev_addr, sg->length)) { + phys_addr_t map = map_single(hwdev, sg_phys(sg), + sg->length, dir, attrs); + if (map == SWIOTLB_MAP_ERROR) { + /* Don't panic here, we expect map_sg users + to do proper error handling. */ + swiotlb_full(hwdev, sg->length, dir, 0); + attrs |= DMA_ATTR_SKIP_CPU_SYNC; + swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, + attrs); + sg_dma_len(sgl) = 0; + return 0; + } + sg->dma_address = __phys_to_dma(hwdev, map); + } else + sg->dma_address = dev_addr; + sg_dma_len(sg) = sg->length; + } + return nelems; +} + +/* + * Unmap a set of streaming mode DMA translations. Again, cpu read rules + * concerning calls here are the same as for swiotlb_unmap_page() above. + */ +void +swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir, + unsigned long attrs) +{ + struct scatterlist *sg; + int i; + + BUG_ON(dir == DMA_NONE); + + for_each_sg(sgl, sg, nelems, i) + unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, + attrs); +} + +/* + * Make physical memory consistent for a set of streaming mode DMA translations + * after a transfer. + * + * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules + * and usage. + */ +static void +swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir, + enum dma_sync_target target) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nelems, i) + swiotlb_sync_single(hwdev, sg->dma_address, + sg_dma_len(sg), dir, target); +} + +void +swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU); +} + +void +swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); +} + +int +swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) +{ + return (dma_addr == __phys_to_dma(hwdev, io_tlb_overflow_buffer)); +} + +/* + * Return whether the given device DMA address mask can be supported + * properly. For example, if your device can only drive the low 24-bits + * during bus mastering, then you would pass 0x00ffffff as the mask to + * this function. + */ +int +swiotlb_dma_supported(struct device *hwdev, u64 mask) +{ + return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask; +} + +void *swiotlb_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, unsigned long attrs) +{ + void *vaddr; + + /* temporary workaround: */ + if (gfp & __GFP_NOWARN) + attrs |= DMA_ATTR_NO_WARN; + + /* + * Don't print a warning when the first allocation attempt fails. + * swiotlb_alloc_coherent() will print a warning when the DMA memory + * allocation ultimately failed. + */ + gfp |= __GFP_NOWARN; + + vaddr = dma_direct_alloc(dev, size, dma_handle, gfp, attrs); + if (!vaddr) + vaddr = swiotlb_alloc_buffer(dev, size, dma_handle, attrs); + return vaddr; +} + +void swiotlb_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr, unsigned long attrs) +{ + if (!swiotlb_free_buffer(dev, size, dma_addr)) + dma_direct_free(dev, size, vaddr, dma_addr, attrs); +} + +const struct dma_map_ops swiotlb_dma_ops = { + .mapping_error = swiotlb_dma_mapping_error, + .alloc = swiotlb_alloc, + .free = swiotlb_free, + .sync_single_for_cpu = swiotlb_sync_single_for_cpu, + .sync_single_for_device = swiotlb_sync_single_for_device, + .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, + .sync_sg_for_device = swiotlb_sync_sg_for_device, + .map_sg = swiotlb_map_sg_attrs, + .unmap_sg = swiotlb_unmap_sg_attrs, + .map_page = swiotlb_map_page, + .unmap_page = swiotlb_unmap_page, + .dma_supported = dma_direct_supported, +}; diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c new file mode 100644 index 000000000000..631ddec4b60a --- /dev/null +++ b/kernel/dma/virt.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DMA operations that map to virtual addresses without flushing memory. + */ +#include +#include +#include +#include + +static void *dma_virt_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, + unsigned long attrs) +{ + void *ret; + + ret = (void *)__get_free_pages(gfp, get_order(size)); + if (ret) + *dma_handle = (uintptr_t)ret; + return ret; +} + +static void dma_virt_free(struct device *dev, size_t size, + void *cpu_addr, dma_addr_t dma_addr, + unsigned long attrs) +{ + free_pages((unsigned long)cpu_addr, get_order(size)); +} + +static dma_addr_t dma_virt_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + return (uintptr_t)(page_address(page) + offset); +} + +static int dma_virt_map_sg(struct device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction dir, + unsigned long attrs) +{ + int i; + struct scatterlist *sg; + + for_each_sg(sgl, sg, nents, i) { + BUG_ON(!sg_page(sg)); + sg_dma_address(sg) = (uintptr_t)sg_virt(sg); + sg_dma_len(sg) = sg->length; + } + + return nents; +} + +const struct dma_map_ops dma_virt_ops = { + .alloc = dma_virt_alloc, + .free = dma_virt_free, + .map_page = dma_virt_map_page, + .map_sg = dma_virt_map_sg, +}; +EXPORT_SYMBOL(dma_virt_ops); diff --git a/lib/Kconfig b/lib/Kconfig index 809fdd155739..803fcbced729 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -420,60 +420,15 @@ config HAS_IOPORT_MAP depends on HAS_IOMEM && !NO_IOPORT_MAP default y -config HAS_DMA - bool - depends on !NO_DMA - default y +source "kernel/dma/Kconfig" config SGL_ALLOC bool default n -config NEED_SG_DMA_LENGTH - bool - -config NEED_DMA_MAP_STATE - bool - -config ARCH_DMA_ADDR_T_64BIT - def_bool 64BIT || PHYS_ADDR_T_64BIT - config IOMMU_HELPER bool -config ARCH_HAS_SYNC_DMA_FOR_DEVICE - bool - -config ARCH_HAS_SYNC_DMA_FOR_CPU - bool - select NEED_DMA_MAP_STATE - -config DMA_DIRECT_OPS - bool - depends on HAS_DMA - -config DMA_NONCOHERENT_OPS - bool - depends on HAS_DMA - select DMA_DIRECT_OPS - -config DMA_NONCOHERENT_MMAP - bool - depends on DMA_NONCOHERENT_OPS - -config DMA_NONCOHERENT_CACHE_SYNC - bool - depends on DMA_NONCOHERENT_OPS - -config DMA_VIRT_OPS - bool - depends on HAS_DMA - -config SWIOTLB - bool - select DMA_DIRECT_OPS - select NEED_DMA_MAP_STATE - config CHECK_SIGNATURE bool diff --git a/lib/Makefile b/lib/Makefile index 5e0e160c9242..8153fdab287f 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -29,9 +29,6 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ lib-$(CONFIG_PRINTK) += dump_stack.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o -obj-$(CONFIG_DMA_DIRECT_OPS) += dma-direct.o -obj-$(CONFIG_DMA_NONCOHERENT_OPS) += dma-noncoherent.o -obj-$(CONFIG_DMA_VIRT_OPS) += dma-virt.o lib-y += kobject.o klist.o obj-y += lockref.o @@ -148,7 +145,6 @@ obj-$(CONFIG_SMP) += percpu_counter.o obj-$(CONFIG_AUDIT_GENERIC) += audit.o obj-$(CONFIG_AUDIT_COMPAT_GENERIC) += compat_audit.o -obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o obj-$(CONFIG_NOTIFIER_ERROR_INJECTION) += notifier-error-inject.o @@ -169,8 +165,6 @@ obj-$(CONFIG_NLATTR) += nlattr.o obj-$(CONFIG_LRU_CACHE) += lru_cache.o -obj-$(CONFIG_DMA_API_DEBUG) += dma-debug.o - obj-$(CONFIG_GENERIC_CSUM) += checksum.o obj-$(CONFIG_GENERIC_ATOMIC64) += atomic64.o diff --git a/lib/dma-debug.c b/lib/dma-debug.c deleted file mode 100644 index c007d25bee09..000000000000 --- a/lib/dma-debug.c +++ /dev/null @@ -1,1773 +0,0 @@ -/* - * Copyright (C) 2008 Advanced Micro Devices, Inc. - * - * Author: Joerg Roedel - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published - * by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#define HASH_SIZE 1024ULL -#define HASH_FN_SHIFT 13 -#define HASH_FN_MASK (HASH_SIZE - 1) - -/* allow architectures to override this if absolutely required */ -#ifndef PREALLOC_DMA_DEBUG_ENTRIES -#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) -#endif - -enum { - dma_debug_single, - dma_debug_page, - dma_debug_sg, - dma_debug_coherent, - dma_debug_resource, -}; - -enum map_err_types { - MAP_ERR_CHECK_NOT_APPLICABLE, - MAP_ERR_NOT_CHECKED, - MAP_ERR_CHECKED, -}; - -#define DMA_DEBUG_STACKTRACE_ENTRIES 5 - -/** - * struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping - * @list: node on pre-allocated free_entries list - * @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent - * @type: single, page, sg, coherent - * @pfn: page frame of the start address - * @offset: offset of mapping relative to pfn - * @size: length of the mapping - * @direction: enum dma_data_direction - * @sg_call_ents: 'nents' from dma_map_sg - * @sg_mapped_ents: 'mapped_ents' from dma_map_sg - * @map_err_type: track whether dma_mapping_error() was checked - * @stacktrace: support backtraces when a violation is detected - */ -struct dma_debug_entry { - struct list_head list; - struct device *dev; - int type; - unsigned long pfn; - size_t offset; - u64 dev_addr; - u64 size; - int direction; - int sg_call_ents; - int sg_mapped_ents; - enum map_err_types map_err_type; -#ifdef CONFIG_STACKTRACE - struct stack_trace stacktrace; - unsigned long st_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; -#endif -}; - -typedef bool (*match_fn)(struct dma_debug_entry *, struct dma_debug_entry *); - -struct hash_bucket { - struct list_head list; - spinlock_t lock; -} ____cacheline_aligned_in_smp; - -/* Hash list to save the allocated dma addresses */ -static struct hash_bucket dma_entry_hash[HASH_SIZE]; -/* List of pre-allocated dma_debug_entry's */ -static LIST_HEAD(free_entries); -/* Lock for the list above */ -static DEFINE_SPINLOCK(free_entries_lock); - -/* Global disable flag - will be set in case of an error */ -static bool global_disable __read_mostly; - -/* Early initialization disable flag, set at the end of dma_debug_init */ -static bool dma_debug_initialized __read_mostly; - -static inline bool dma_debug_disabled(void) -{ - return global_disable || !dma_debug_initialized; -} - -/* Global error count */ -static u32 error_count; - -/* Global error show enable*/ -static u32 show_all_errors __read_mostly; -/* Number of errors to show */ -static u32 show_num_errors = 1; - -static u32 num_free_entries; -static u32 min_free_entries; -static u32 nr_total_entries; - -/* number of preallocated entries requested by kernel cmdline */ -static u32 nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES; - -/* debugfs dentry's for the stuff above */ -static struct dentry *dma_debug_dent __read_mostly; -static struct dentry *global_disable_dent __read_mostly; -static struct dentry *error_count_dent __read_mostly; -static struct dentry *show_all_errors_dent __read_mostly; -static struct dentry *show_num_errors_dent __read_mostly; -static struct dentry *num_free_entries_dent __read_mostly; -static struct dentry *min_free_entries_dent __read_mostly; -static struct dentry *filter_dent __read_mostly; - -/* per-driver filter related state */ - -#define NAME_MAX_LEN 64 - -static char current_driver_name[NAME_MAX_LEN] __read_mostly; -static struct device_driver *current_driver __read_mostly; - -static DEFINE_RWLOCK(driver_name_lock); - -static const char *const maperr2str[] = { - [MAP_ERR_CHECK_NOT_APPLICABLE] = "dma map error check not applicable", - [MAP_ERR_NOT_CHECKED] = "dma map error not checked", - [MAP_ERR_CHECKED] = "dma map error checked", -}; - -static const char *type2name[5] = { "single", "page", - "scather-gather", "coherent", - "resource" }; - -static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE", - "DMA_FROM_DEVICE", "DMA_NONE" }; - -/* - * The access to some variables in this macro is racy. We can't use atomic_t - * here because all these variables are exported to debugfs. Some of them even - * writeable. This is also the reason why a lock won't help much. But anyway, - * the races are no big deal. Here is why: - * - * error_count: the addition is racy, but the worst thing that can happen is - * that we don't count some errors - * show_num_errors: the subtraction is racy. Also no big deal because in - * worst case this will result in one warning more in the - * system log than the user configured. This variable is - * writeable via debugfs. - */ -static inline void dump_entry_trace(struct dma_debug_entry *entry) -{ -#ifdef CONFIG_STACKTRACE - if (entry) { - pr_warning("Mapped at:\n"); - print_stack_trace(&entry->stacktrace, 0); - } -#endif -} - -static bool driver_filter(struct device *dev) -{ - struct device_driver *drv; - unsigned long flags; - bool ret; - - /* driver filter off */ - if (likely(!current_driver_name[0])) - return true; - - /* driver filter on and initialized */ - if (current_driver && dev && dev->driver == current_driver) - return true; - - /* driver filter on, but we can't filter on a NULL device... */ - if (!dev) - return false; - - if (current_driver || !current_driver_name[0]) - return false; - - /* driver filter on but not yet initialized */ - drv = dev->driver; - if (!drv) - return false; - - /* lock to protect against change of current_driver_name */ - read_lock_irqsave(&driver_name_lock, flags); - - ret = false; - if (drv->name && - strncmp(current_driver_name, drv->name, NAME_MAX_LEN - 1) == 0) { - current_driver = drv; - ret = true; - } - - read_unlock_irqrestore(&driver_name_lock, flags); - - return ret; -} - -#define err_printk(dev, entry, format, arg...) do { \ - error_count += 1; \ - if (driver_filter(dev) && \ - (show_all_errors || show_num_errors > 0)) { \ - WARN(1, "%s %s: " format, \ - dev ? dev_driver_string(dev) : "NULL", \ - dev ? dev_name(dev) : "NULL", ## arg); \ - dump_entry_trace(entry); \ - } \ - if (!show_all_errors && show_num_errors > 0) \ - show_num_errors -= 1; \ - } while (0); - -/* - * Hash related functions - * - * Every DMA-API request is saved into a struct dma_debug_entry. To - * have quick access to these structs they are stored into a hash. - */ -static int hash_fn(struct dma_debug_entry *entry) -{ - /* - * Hash function is based on the dma address. - * We use bits 20-27 here as the index into the hash - */ - return (entry->dev_addr >> HASH_FN_SHIFT) & HASH_FN_MASK; -} - -/* - * Request exclusive access to a hash bucket for a given dma_debug_entry. - */ -static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry, - unsigned long *flags) - __acquires(&dma_entry_hash[idx].lock) -{ - int idx = hash_fn(entry); - unsigned long __flags; - - spin_lock_irqsave(&dma_entry_hash[idx].lock, __flags); - *flags = __flags; - return &dma_entry_hash[idx]; -} - -/* - * Give up exclusive access to the hash bucket - */ -static void put_hash_bucket(struct hash_bucket *bucket, - unsigned long *flags) - __releases(&bucket->lock) -{ - unsigned long __flags = *flags; - - spin_unlock_irqrestore(&bucket->lock, __flags); -} - -static bool exact_match(struct dma_debug_entry *a, struct dma_debug_entry *b) -{ - return ((a->dev_addr == b->dev_addr) && - (a->dev == b->dev)) ? true : false; -} - -static bool containing_match(struct dma_debug_entry *a, - struct dma_debug_entry *b) -{ - if (a->dev != b->dev) - return false; - - if ((b->dev_addr <= a->dev_addr) && - ((b->dev_addr + b->size) >= (a->dev_addr + a->size))) - return true; - - return false; -} - -/* - * Search a given entry in the hash bucket list - */ -static struct dma_debug_entry *__hash_bucket_find(struct hash_bucket *bucket, - struct dma_debug_entry *ref, - match_fn match) -{ - struct dma_debug_entry *entry, *ret = NULL; - int matches = 0, match_lvl, last_lvl = -1; - - list_for_each_entry(entry, &bucket->list, list) { - if (!match(ref, entry)) - continue; - - /* - * Some drivers map the same physical address multiple - * times. Without a hardware IOMMU this results in the - * same device addresses being put into the dma-debug - * hash multiple times too. This can result in false - * positives being reported. Therefore we implement a - * best-fit algorithm here which returns the entry from - * the hash which fits best to the reference value - * instead of the first-fit. - */ - matches += 1; - match_lvl = 0; - entry->size == ref->size ? ++match_lvl : 0; - entry->type == ref->type ? ++match_lvl : 0; - entry->direction == ref->direction ? ++match_lvl : 0; - entry->sg_call_ents == ref->sg_call_ents ? ++match_lvl : 0; - - if (match_lvl == 4) { - /* perfect-fit - return the result */ - return entry; - } else if (match_lvl > last_lvl) { - /* - * We found an entry that fits better then the - * previous one or it is the 1st match. - */ - last_lvl = match_lvl; - ret = entry; - } - } - - /* - * If we have multiple matches but no perfect-fit, just return - * NULL. - */ - ret = (matches == 1) ? ret : NULL; - - return ret; -} - -static struct dma_debug_entry *bucket_find_exact(struct hash_bucket *bucket, - struct dma_debug_entry *ref) -{ - return __hash_bucket_find(bucket, ref, exact_match); -} - -static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket, - struct dma_debug_entry *ref, - unsigned long *flags) -{ - - unsigned int max_range = dma_get_max_seg_size(ref->dev); - struct dma_debug_entry *entry, index = *ref; - unsigned int range = 0; - - while (range <= max_range) { - entry = __hash_bucket_find(*bucket, ref, containing_match); - - if (entry) - return entry; - - /* - * Nothing found, go back a hash bucket - */ - put_hash_bucket(*bucket, flags); - range += (1 << HASH_FN_SHIFT); - index.dev_addr -= (1 << HASH_FN_SHIFT); - *bucket = get_hash_bucket(&index, flags); - } - - return NULL; -} - -/* - * Add an entry to a hash bucket - */ -static void hash_bucket_add(struct hash_bucket *bucket, - struct dma_debug_entry *entry) -{ - list_add_tail(&entry->list, &bucket->list); -} - -/* - * Remove entry from a hash bucket list - */ -static void hash_bucket_del(struct dma_debug_entry *entry) -{ - list_del(&entry->list); -} - -static unsigned long long phys_addr(struct dma_debug_entry *entry) -{ - if (entry->type == dma_debug_resource) - return __pfn_to_phys(entry->pfn) + entry->offset; - - return page_to_phys(pfn_to_page(entry->pfn)) + entry->offset; -} - -/* - * Dump mapping entries for debugging purposes - */ -void debug_dma_dump_mappings(struct device *dev) -{ - int idx; - - for (idx = 0; idx < HASH_SIZE; idx++) { - struct hash_bucket *bucket = &dma_entry_hash[idx]; - struct dma_debug_entry *entry; - unsigned long flags; - - spin_lock_irqsave(&bucket->lock, flags); - - list_for_each_entry(entry, &bucket->list, list) { - if (!dev || dev == entry->dev) { - dev_info(entry->dev, - "%s idx %d P=%Lx N=%lx D=%Lx L=%Lx %s %s\n", - type2name[entry->type], idx, - phys_addr(entry), entry->pfn, - entry->dev_addr, entry->size, - dir2name[entry->direction], - maperr2str[entry->map_err_type]); - } - } - - spin_unlock_irqrestore(&bucket->lock, flags); - } -} - -/* - * For each mapping (initial cacheline in the case of - * dma_alloc_coherent/dma_map_page, initial cacheline in each page of a - * scatterlist, or the cacheline specified in dma_map_single) insert - * into this tree using the cacheline as the key. At - * dma_unmap_{single|sg|page} or dma_free_coherent delete the entry. If - * the entry already exists at insertion time add a tag as a reference - * count for the overlapping mappings. For now, the overlap tracking - * just ensures that 'unmaps' balance 'maps' before marking the - * cacheline idle, but we should also be flagging overlaps as an API - * violation. - * - * Memory usage is mostly constrained by the maximum number of available - * dma-debug entries in that we need a free dma_debug_entry before - * inserting into the tree. In the case of dma_map_page and - * dma_alloc_coherent there is only one dma_debug_entry and one - * dma_active_cacheline entry to track per event. dma_map_sg(), on the - * other hand, consumes a single dma_debug_entry, but inserts 'nents' - * entries into the tree. - * - * At any time debug_dma_assert_idle() can be called to trigger a - * warning if any cachelines in the given page are in the active set. - */ -static RADIX_TREE(dma_active_cacheline, GFP_NOWAIT); -static DEFINE_SPINLOCK(radix_lock); -#define ACTIVE_CACHELINE_MAX_OVERLAP ((1 << RADIX_TREE_MAX_TAGS) - 1) -#define CACHELINE_PER_PAGE_SHIFT (PAGE_SHIFT - L1_CACHE_SHIFT) -#define CACHELINES_PER_PAGE (1 << CACHELINE_PER_PAGE_SHIFT) - -static phys_addr_t to_cacheline_number(struct dma_debug_entry *entry) -{ - return (entry->pfn << CACHELINE_PER_PAGE_SHIFT) + - (entry->offset >> L1_CACHE_SHIFT); -} - -static int active_cacheline_read_overlap(phys_addr_t cln) -{ - int overlap = 0, i; - - for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--) - if (radix_tree_tag_get(&dma_active_cacheline, cln, i)) - overlap |= 1 << i; - return overlap; -} - -static int active_cacheline_set_overlap(phys_addr_t cln, int overlap) -{ - int i; - - if (overlap > ACTIVE_CACHELINE_MAX_OVERLAP || overlap < 0) - return overlap; - - for (i = RADIX_TREE_MAX_TAGS - 1; i >= 0; i--) - if (overlap & 1 << i) - radix_tree_tag_set(&dma_active_cacheline, cln, i); - else - radix_tree_tag_clear(&dma_active_cacheline, cln, i); - - return overlap; -} - -static void active_cacheline_inc_overlap(phys_addr_t cln) -{ - int overlap = active_cacheline_read_overlap(cln); - - overlap = active_cacheline_set_overlap(cln, ++overlap); - - /* If we overflowed the overlap counter then we're potentially - * leaking dma-mappings. Otherwise, if maps and unmaps are - * balanced then this overflow may cause false negatives in - * debug_dma_assert_idle() as the cacheline may be marked idle - * prematurely. - */ - WARN_ONCE(overlap > ACTIVE_CACHELINE_MAX_OVERLAP, - "DMA-API: exceeded %d overlapping mappings of cacheline %pa\n", - ACTIVE_CACHELINE_MAX_OVERLAP, &cln); -} - -static int active_cacheline_dec_overlap(phys_addr_t cln) -{ - int overlap = active_cacheline_read_overlap(cln); - - return active_cacheline_set_overlap(cln, --overlap); -} - -static int active_cacheline_insert(struct dma_debug_entry *entry) -{ - phys_addr_t cln = to_cacheline_number(entry); - unsigned long flags; - int rc; - - /* If the device is not writing memory then we don't have any - * concerns about the cpu consuming stale data. This mitigates - * legitimate usages of overlapping mappings. - */ - if (entry->direction == DMA_TO_DEVICE) - return 0; - - spin_lock_irqsave(&radix_lock, flags); - rc = radix_tree_insert(&dma_active_cacheline, cln, entry); - if (rc == -EEXIST) - active_cacheline_inc_overlap(cln); - spin_unlock_irqrestore(&radix_lock, flags); - - return rc; -} - -static void active_cacheline_remove(struct dma_debug_entry *entry) -{ - phys_addr_t cln = to_cacheline_number(entry); - unsigned long flags; - - /* ...mirror the insert case */ - if (entry->direction == DMA_TO_DEVICE) - return; - - spin_lock_irqsave(&radix_lock, flags); - /* since we are counting overlaps the final put of the - * cacheline will occur when the overlap count is 0. - * active_cacheline_dec_overlap() returns -1 in that case - */ - if (active_cacheline_dec_overlap(cln) < 0) - radix_tree_delete(&dma_active_cacheline, cln); - spin_unlock_irqrestore(&radix_lock, flags); -} - -/** - * debug_dma_assert_idle() - assert that a page is not undergoing dma - * @page: page to lookup in the dma_active_cacheline tree - * - * Place a call to this routine in cases where the cpu touching the page - * before the dma completes (page is dma_unmapped) will lead to data - * corruption. - */ -void debug_dma_assert_idle(struct page *page) -{ - static struct dma_debug_entry *ents[CACHELINES_PER_PAGE]; - struct dma_debug_entry *entry = NULL; - void **results = (void **) &ents; - unsigned int nents, i; - unsigned long flags; - phys_addr_t cln; - - if (dma_debug_disabled()) - return; - - if (!page) - return; - - cln = (phys_addr_t) page_to_pfn(page) << CACHELINE_PER_PAGE_SHIFT; - spin_lock_irqsave(&radix_lock, flags); - nents = radix_tree_gang_lookup(&dma_active_cacheline, results, cln, - CACHELINES_PER_PAGE); - for (i = 0; i < nents; i++) { - phys_addr_t ent_cln = to_cacheline_number(ents[i]); - - if (ent_cln == cln) { - entry = ents[i]; - break; - } else if (ent_cln >= cln + CACHELINES_PER_PAGE) - break; - } - spin_unlock_irqrestore(&radix_lock, flags); - - if (!entry) - return; - - cln = to_cacheline_number(entry); - err_printk(entry->dev, entry, - "DMA-API: cpu touching an active dma mapped cacheline [cln=%pa]\n", - &cln); -} - -/* - * Wrapper function for adding an entry to the hash. - * This function takes care of locking itself. - */ -static void add_dma_entry(struct dma_debug_entry *entry) -{ - struct hash_bucket *bucket; - unsigned long flags; - int rc; - - bucket = get_hash_bucket(entry, &flags); - hash_bucket_add(bucket, entry); - put_hash_bucket(bucket, &flags); - - rc = active_cacheline_insert(entry); - if (rc == -ENOMEM) { - pr_err("DMA-API: cacheline tracking ENOMEM, dma-debug disabled\n"); - global_disable = true; - } - - /* TODO: report -EEXIST errors here as overlapping mappings are - * not supported by the DMA API - */ -} - -static struct dma_debug_entry *__dma_entry_alloc(void) -{ - struct dma_debug_entry *entry; - - entry = list_entry(free_entries.next, struct dma_debug_entry, list); - list_del(&entry->list); - memset(entry, 0, sizeof(*entry)); - - num_free_entries -= 1; - if (num_free_entries < min_free_entries) - min_free_entries = num_free_entries; - - return entry; -} - -/* struct dma_entry allocator - * - * The next two functions implement the allocator for - * struct dma_debug_entries. - */ -static struct dma_debug_entry *dma_entry_alloc(void) -{ - struct dma_debug_entry *entry; - unsigned long flags; - - spin_lock_irqsave(&free_entries_lock, flags); - - if (list_empty(&free_entries)) { - global_disable = true; - spin_unlock_irqrestore(&free_entries_lock, flags); - pr_err("DMA-API: debugging out of memory - disabling\n"); - return NULL; - } - - entry = __dma_entry_alloc(); - - spin_unlock_irqrestore(&free_entries_lock, flags); - -#ifdef CONFIG_STACKTRACE - entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES; - entry->stacktrace.entries = entry->st_entries; - entry->stacktrace.skip = 2; - save_stack_trace(&entry->stacktrace); -#endif - - return entry; -} - -static void dma_entry_free(struct dma_debug_entry *entry) -{ - unsigned long flags; - - active_cacheline_remove(entry); - - /* - * add to beginning of the list - this way the entries are - * more likely cache hot when they are reallocated. - */ - spin_lock_irqsave(&free_entries_lock, flags); - list_add(&entry->list, &free_entries); - num_free_entries += 1; - spin_unlock_irqrestore(&free_entries_lock, flags); -} - -int dma_debug_resize_entries(u32 num_entries) -{ - int i, delta, ret = 0; - unsigned long flags; - struct dma_debug_entry *entry; - LIST_HEAD(tmp); - - spin_lock_irqsave(&free_entries_lock, flags); - - if (nr_total_entries < num_entries) { - delta = num_entries - nr_total_entries; - - spin_unlock_irqrestore(&free_entries_lock, flags); - - for (i = 0; i < delta; i++) { - entry = kzalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - break; - - list_add_tail(&entry->list, &tmp); - } - - spin_lock_irqsave(&free_entries_lock, flags); - - list_splice(&tmp, &free_entries); - nr_total_entries += i; - num_free_entries += i; - } else { - delta = nr_total_entries - num_entries; - - for (i = 0; i < delta && !list_empty(&free_entries); i++) { - entry = __dma_entry_alloc(); - kfree(entry); - } - - nr_total_entries -= i; - } - - if (nr_total_entries != num_entries) - ret = 1; - - spin_unlock_irqrestore(&free_entries_lock, flags); - - return ret; -} - -/* - * DMA-API debugging init code - * - * The init code does two things: - * 1. Initialize core data structures - * 2. Preallocate a given number of dma_debug_entry structs - */ - -static int prealloc_memory(u32 num_entries) -{ - struct dma_debug_entry *entry, *next_entry; - int i; - - for (i = 0; i < num_entries; ++i) { - entry = kzalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - goto out_err; - - list_add_tail(&entry->list, &free_entries); - } - - num_free_entries = num_entries; - min_free_entries = num_entries; - - pr_info("DMA-API: preallocated %d debug entries\n", num_entries); - - return 0; - -out_err: - - list_for_each_entry_safe(entry, next_entry, &free_entries, list) { - list_del(&entry->list); - kfree(entry); - } - - return -ENOMEM; -} - -static ssize_t filter_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) -{ - char buf[NAME_MAX_LEN + 1]; - unsigned long flags; - int len; - - if (!current_driver_name[0]) - return 0; - - /* - * We can't copy to userspace directly because current_driver_name can - * only be read under the driver_name_lock with irqs disabled. So - * create a temporary copy first. - */ - read_lock_irqsave(&driver_name_lock, flags); - len = scnprintf(buf, NAME_MAX_LEN + 1, "%s\n", current_driver_name); - read_unlock_irqrestore(&driver_name_lock, flags); - - return simple_read_from_buffer(user_buf, count, ppos, buf, len); -} - -static ssize_t filter_write(struct file *file, const char __user *userbuf, - size_t count, loff_t *ppos) -{ - char buf[NAME_MAX_LEN]; - unsigned long flags; - size_t len; - int i; - - /* - * We can't copy from userspace directly. Access to - * current_driver_name is protected with a write_lock with irqs - * disabled. Since copy_from_user can fault and may sleep we - * need to copy to temporary buffer first - */ - len = min(count, (size_t)(NAME_MAX_LEN - 1)); - if (copy_from_user(buf, userbuf, len)) - return -EFAULT; - - buf[len] = 0; - - write_lock_irqsave(&driver_name_lock, flags); - - /* - * Now handle the string we got from userspace very carefully. - * The rules are: - * - only use the first token we got - * - token delimiter is everything looking like a space - * character (' ', '\n', '\t' ...) - * - */ - if (!isalnum(buf[0])) { - /* - * If the first character userspace gave us is not - * alphanumerical then assume the filter should be - * switched off. - */ - if (current_driver_name[0]) - pr_info("DMA-API: switching off dma-debug driver filter\n"); - current_driver_name[0] = 0; - current_driver = NULL; - goto out_unlock; - } - - /* - * Now parse out the first token and use it as the name for the - * driver to filter for. - */ - for (i = 0; i < NAME_MAX_LEN - 1; ++i) { - current_driver_name[i] = buf[i]; - if (isspace(buf[i]) || buf[i] == ' ' || buf[i] == 0) - break; - } - current_driver_name[i] = 0; - current_driver = NULL; - - pr_info("DMA-API: enable driver filter for driver [%s]\n", - current_driver_name); - -out_unlock: - write_unlock_irqrestore(&driver_name_lock, flags); - - return count; -} - -static const struct file_operations filter_fops = { - .read = filter_read, - .write = filter_write, - .llseek = default_llseek, -}; - -static int dma_debug_fs_init(void) -{ - dma_debug_dent = debugfs_create_dir("dma-api", NULL); - if (!dma_debug_dent) { - pr_err("DMA-API: can not create debugfs directory\n"); - return -ENOMEM; - } - - global_disable_dent = debugfs_create_bool("disabled", 0444, - dma_debug_dent, - &global_disable); - if (!global_disable_dent) - goto out_err; - - error_count_dent = debugfs_create_u32("error_count", 0444, - dma_debug_dent, &error_count); - if (!error_count_dent) - goto out_err; - - show_all_errors_dent = debugfs_create_u32("all_errors", 0644, - dma_debug_dent, - &show_all_errors); - if (!show_all_errors_dent) - goto out_err; - - show_num_errors_dent = debugfs_create_u32("num_errors", 0644, - dma_debug_dent, - &show_num_errors); - if (!show_num_errors_dent) - goto out_err; - - num_free_entries_dent = debugfs_create_u32("num_free_entries", 0444, - dma_debug_dent, - &num_free_entries); - if (!num_free_entries_dent) - goto out_err; - - min_free_entries_dent = debugfs_create_u32("min_free_entries", 0444, - dma_debug_dent, - &min_free_entries); - if (!min_free_entries_dent) - goto out_err; - - filter_dent = debugfs_create_file("driver_filter", 0644, - dma_debug_dent, NULL, &filter_fops); - if (!filter_dent) - goto out_err; - - return 0; - -out_err: - debugfs_remove_recursive(dma_debug_dent); - - return -ENOMEM; -} - -static int device_dma_allocations(struct device *dev, struct dma_debug_entry **out_entry) -{ - struct dma_debug_entry *entry; - unsigned long flags; - int count = 0, i; - - for (i = 0; i < HASH_SIZE; ++i) { - spin_lock_irqsave(&dma_entry_hash[i].lock, flags); - list_for_each_entry(entry, &dma_entry_hash[i].list, list) { - if (entry->dev == dev) { - count += 1; - *out_entry = entry; - } - } - spin_unlock_irqrestore(&dma_entry_hash[i].lock, flags); - } - - return count; -} - -static int dma_debug_device_change(struct notifier_block *nb, unsigned long action, void *data) -{ - struct device *dev = data; - struct dma_debug_entry *uninitialized_var(entry); - int count; - - if (dma_debug_disabled()) - return 0; - - switch (action) { - case BUS_NOTIFY_UNBOUND_DRIVER: - count = device_dma_allocations(dev, &entry); - if (count == 0) - break; - err_printk(dev, entry, "DMA-API: device driver has pending " - "DMA allocations while released from device " - "[count=%d]\n" - "One of leaked entries details: " - "[device address=0x%016llx] [size=%llu bytes] " - "[mapped with %s] [mapped as %s]\n", - count, entry->dev_addr, entry->size, - dir2name[entry->direction], type2name[entry->type]); - break; - default: - break; - } - - return 0; -} - -void dma_debug_add_bus(struct bus_type *bus) -{ - struct notifier_block *nb; - - if (dma_debug_disabled()) - return; - - nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL); - if (nb == NULL) { - pr_err("dma_debug_add_bus: out of memory\n"); - return; - } - - nb->notifier_call = dma_debug_device_change; - - bus_register_notifier(bus, nb); -} - -static int dma_debug_init(void) -{ - int i; - - /* Do not use dma_debug_initialized here, since we really want to be - * called to set dma_debug_initialized - */ - if (global_disable) - return 0; - - for (i = 0; i < HASH_SIZE; ++i) { - INIT_LIST_HEAD(&dma_entry_hash[i].list); - spin_lock_init(&dma_entry_hash[i].lock); - } - - if (dma_debug_fs_init() != 0) { - pr_err("DMA-API: error creating debugfs entries - disabling\n"); - global_disable = true; - - return 0; - } - - if (prealloc_memory(nr_prealloc_entries) != 0) { - pr_err("DMA-API: debugging out of memory error - disabled\n"); - global_disable = true; - - return 0; - } - - nr_total_entries = num_free_entries; - - dma_debug_initialized = true; - - pr_info("DMA-API: debugging enabled by kernel config\n"); - return 0; -} -core_initcall(dma_debug_init); - -static __init int dma_debug_cmdline(char *str) -{ - if (!str) - return -EINVAL; - - if (strncmp(str, "off", 3) == 0) { - pr_info("DMA-API: debugging disabled on kernel command line\n"); - global_disable = true; - } - - return 0; -} - -static __init int dma_debug_entries_cmdline(char *str) -{ - if (!str) - return -EINVAL; - if (!get_option(&str, &nr_prealloc_entries)) - nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES; - return 0; -} - -__setup("dma_debug=", dma_debug_cmdline); -__setup("dma_debug_entries=", dma_debug_entries_cmdline); - -static void check_unmap(struct dma_debug_entry *ref) -{ - struct dma_debug_entry *entry; - struct hash_bucket *bucket; - unsigned long flags; - - bucket = get_hash_bucket(ref, &flags); - entry = bucket_find_exact(bucket, ref); - - if (!entry) { - /* must drop lock before calling dma_mapping_error */ - put_hash_bucket(bucket, &flags); - - if (dma_mapping_error(ref->dev, ref->dev_addr)) { - err_printk(ref->dev, NULL, - "DMA-API: device driver tries to free an " - "invalid DMA memory address\n"); - } else { - err_printk(ref->dev, NULL, - "DMA-API: device driver tries to free DMA " - "memory it has not allocated [device " - "address=0x%016llx] [size=%llu bytes]\n", - ref->dev_addr, ref->size); - } - return; - } - - if (ref->size != entry->size) { - err_printk(ref->dev, entry, "DMA-API: device driver frees " - "DMA memory with different size " - "[device address=0x%016llx] [map size=%llu bytes] " - "[unmap size=%llu bytes]\n", - ref->dev_addr, entry->size, ref->size); - } - - if (ref->type != entry->type) { - err_printk(ref->dev, entry, "DMA-API: device driver frees " - "DMA memory with wrong function " - "[device address=0x%016llx] [size=%llu bytes] " - "[mapped as %s] [unmapped as %s]\n", - ref->dev_addr, ref->size, - type2name[entry->type], type2name[ref->type]); - } else if ((entry->type == dma_debug_coherent) && - (phys_addr(ref) != phys_addr(entry))) { - err_printk(ref->dev, entry, "DMA-API: device driver frees " - "DMA memory with different CPU address " - "[device address=0x%016llx] [size=%llu bytes] " - "[cpu alloc address=0x%016llx] " - "[cpu free address=0x%016llx]", - ref->dev_addr, ref->size, - phys_addr(entry), - phys_addr(ref)); - } - - if (ref->sg_call_ents && ref->type == dma_debug_sg && - ref->sg_call_ents != entry->sg_call_ents) { - err_printk(ref->dev, entry, "DMA-API: device driver frees " - "DMA sg list with different entry count " - "[map count=%d] [unmap count=%d]\n", - entry->sg_call_ents, ref->sg_call_ents); - } - - /* - * This may be no bug in reality - but most implementations of the - * DMA API don't handle this properly, so check for it here - */ - if (ref->direction != entry->direction) { - err_printk(ref->dev, entry, "DMA-API: device driver frees " - "DMA memory with different direction " - "[device address=0x%016llx] [size=%llu bytes] " - "[mapped with %s] [unmapped with %s]\n", - ref->dev_addr, ref->size, - dir2name[entry->direction], - dir2name[ref->direction]); - } - - /* - * Drivers should use dma_mapping_error() to check the returned - * addresses of dma_map_single() and dma_map_page(). - * If not, print this warning message. See Documentation/DMA-API.txt. - */ - if (entry->map_err_type == MAP_ERR_NOT_CHECKED) { - err_printk(ref->dev, entry, - "DMA-API: device driver failed to check map error" - "[device address=0x%016llx] [size=%llu bytes] " - "[mapped as %s]", - ref->dev_addr, ref->size, - type2name[entry->type]); - } - - hash_bucket_del(entry); - dma_entry_free(entry); - - put_hash_bucket(bucket, &flags); -} - -static void check_for_stack(struct device *dev, - struct page *page, size_t offset) -{ - void *addr; - struct vm_struct *stack_vm_area = task_stack_vm_area(current); - - if (!stack_vm_area) { - /* Stack is direct-mapped. */ - if (PageHighMem(page)) - return; - addr = page_address(page) + offset; - if (object_is_on_stack(addr)) - err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [addr=%p]\n", addr); - } else { - /* Stack is vmalloced. */ - int i; - - for (i = 0; i < stack_vm_area->nr_pages; i++) { - if (page != stack_vm_area->pages[i]) - continue; - - addr = (u8 *)current->stack + i * PAGE_SIZE + offset; - err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [probable addr=%p]\n", addr); - break; - } - } -} - -static inline bool overlap(void *addr, unsigned long len, void *start, void *end) -{ - unsigned long a1 = (unsigned long)addr; - unsigned long b1 = a1 + len; - unsigned long a2 = (unsigned long)start; - unsigned long b2 = (unsigned long)end; - - return !(b1 <= a2 || a1 >= b2); -} - -static void check_for_illegal_area(struct device *dev, void *addr, unsigned long len) -{ - if (overlap(addr, len, _stext, _etext) || - overlap(addr, len, __start_rodata, __end_rodata)) - err_printk(dev, NULL, "DMA-API: device driver maps memory from kernel text or rodata [addr=%p] [len=%lu]\n", addr, len); -} - -static void check_sync(struct device *dev, - struct dma_debug_entry *ref, - bool to_cpu) -{ - struct dma_debug_entry *entry; - struct hash_bucket *bucket; - unsigned long flags; - - bucket = get_hash_bucket(ref, &flags); - - entry = bucket_find_contain(&bucket, ref, &flags); - - if (!entry) { - err_printk(dev, NULL, "DMA-API: device driver tries " - "to sync DMA memory it has not allocated " - "[device address=0x%016llx] [size=%llu bytes]\n", - (unsigned long long)ref->dev_addr, ref->size); - goto out; - } - - if (ref->size > entry->size) { - err_printk(dev, entry, "DMA-API: device driver syncs" - " DMA memory outside allocated range " - "[device address=0x%016llx] " - "[allocation size=%llu bytes] " - "[sync offset+size=%llu]\n", - entry->dev_addr, entry->size, - ref->size); - } - - if (entry->direction == DMA_BIDIRECTIONAL) - goto out; - - if (ref->direction != entry->direction) { - err_printk(dev, entry, "DMA-API: device driver syncs " - "DMA memory with different direction " - "[device address=0x%016llx] [size=%llu bytes] " - "[mapped with %s] [synced with %s]\n", - (unsigned long long)ref->dev_addr, entry->size, - dir2name[entry->direction], - dir2name[ref->direction]); - } - - if (to_cpu && !(entry->direction == DMA_FROM_DEVICE) && - !(ref->direction == DMA_TO_DEVICE)) - err_printk(dev, entry, "DMA-API: device driver syncs " - "device read-only DMA memory for cpu " - "[device address=0x%016llx] [size=%llu bytes] " - "[mapped with %s] [synced with %s]\n", - (unsigned long long)ref->dev_addr, entry->size, - dir2name[entry->direction], - dir2name[ref->direction]); - - if (!to_cpu && !(entry->direction == DMA_TO_DEVICE) && - !(ref->direction == DMA_FROM_DEVICE)) - err_printk(dev, entry, "DMA-API: device driver syncs " - "device write-only DMA memory to device " - "[device address=0x%016llx] [size=%llu bytes] " - "[mapped with %s] [synced with %s]\n", - (unsigned long long)ref->dev_addr, entry->size, - dir2name[entry->direction], - dir2name[ref->direction]); - - if (ref->sg_call_ents && ref->type == dma_debug_sg && - ref->sg_call_ents != entry->sg_call_ents) { - err_printk(ref->dev, entry, "DMA-API: device driver syncs " - "DMA sg list with different entry count " - "[map count=%d] [sync count=%d]\n", - entry->sg_call_ents, ref->sg_call_ents); - } - -out: - put_hash_bucket(bucket, &flags); -} - -static void check_sg_segment(struct device *dev, struct scatterlist *sg) -{ -#ifdef CONFIG_DMA_API_DEBUG_SG - unsigned int max_seg = dma_get_max_seg_size(dev); - u64 start, end, boundary = dma_get_seg_boundary(dev); - - /* - * Either the driver forgot to set dma_parms appropriately, or - * whoever generated the list forgot to check them. - */ - if (sg->length > max_seg) - err_printk(dev, NULL, "DMA-API: mapping sg segment longer than device claims to support [len=%u] [max=%u]\n", - sg->length, max_seg); - /* - * In some cases this could potentially be the DMA API - * implementation's fault, but it would usually imply that - * the scatterlist was built inappropriately to begin with. - */ - start = sg_dma_address(sg); - end = start + sg_dma_len(sg) - 1; - if ((start ^ end) & ~boundary) - err_printk(dev, NULL, "DMA-API: mapping sg segment across boundary [start=0x%016llx] [end=0x%016llx] [boundary=0x%016llx]\n", - start, end, boundary); -#endif -} - -void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, - size_t size, int direction, dma_addr_t dma_addr, - bool map_single) -{ - struct dma_debug_entry *entry; - - if (unlikely(dma_debug_disabled())) - return; - - if (dma_mapping_error(dev, dma_addr)) - return; - - entry = dma_entry_alloc(); - if (!entry) - return; - - entry->dev = dev; - entry->type = dma_debug_page; - entry->pfn = page_to_pfn(page); - entry->offset = offset, - entry->dev_addr = dma_addr; - entry->size = size; - entry->direction = direction; - entry->map_err_type = MAP_ERR_NOT_CHECKED; - - if (map_single) - entry->type = dma_debug_single; - - check_for_stack(dev, page, offset); - - if (!PageHighMem(page)) { - void *addr = page_address(page) + offset; - - check_for_illegal_area(dev, addr, size); - } - - add_dma_entry(entry); -} -EXPORT_SYMBOL(debug_dma_map_page); - -void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - struct dma_debug_entry ref; - struct dma_debug_entry *entry; - struct hash_bucket *bucket; - unsigned long flags; - - if (unlikely(dma_debug_disabled())) - return; - - ref.dev = dev; - ref.dev_addr = dma_addr; - bucket = get_hash_bucket(&ref, &flags); - - list_for_each_entry(entry, &bucket->list, list) { - if (!exact_match(&ref, entry)) - continue; - - /* - * The same physical address can be mapped multiple - * times. Without a hardware IOMMU this results in the - * same device addresses being put into the dma-debug - * hash multiple times too. This can result in false - * positives being reported. Therefore we implement a - * best-fit algorithm here which updates the first entry - * from the hash which fits the reference value and is - * not currently listed as being checked. - */ - if (entry->map_err_type == MAP_ERR_NOT_CHECKED) { - entry->map_err_type = MAP_ERR_CHECKED; - break; - } - } - - put_hash_bucket(bucket, &flags); -} -EXPORT_SYMBOL(debug_dma_mapping_error); - -void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, int direction, bool map_single) -{ - struct dma_debug_entry ref = { - .type = dma_debug_page, - .dev = dev, - .dev_addr = addr, - .size = size, - .direction = direction, - }; - - if (unlikely(dma_debug_disabled())) - return; - - if (map_single) - ref.type = dma_debug_single; - - check_unmap(&ref); -} -EXPORT_SYMBOL(debug_dma_unmap_page); - -void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, - int nents, int mapped_ents, int direction) -{ - struct dma_debug_entry *entry; - struct scatterlist *s; - int i; - - if (unlikely(dma_debug_disabled())) - return; - - for_each_sg(sg, s, mapped_ents, i) { - entry = dma_entry_alloc(); - if (!entry) - return; - - entry->type = dma_debug_sg; - entry->dev = dev; - entry->pfn = page_to_pfn(sg_page(s)); - entry->offset = s->offset, - entry->size = sg_dma_len(s); - entry->dev_addr = sg_dma_address(s); - entry->direction = direction; - entry->sg_call_ents = nents; - entry->sg_mapped_ents = mapped_ents; - - check_for_stack(dev, sg_page(s), s->offset); - - if (!PageHighMem(sg_page(s))) { - check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s)); - } - - check_sg_segment(dev, s); - - add_dma_entry(entry); - } -} -EXPORT_SYMBOL(debug_dma_map_sg); - -static int get_nr_mapped_entries(struct device *dev, - struct dma_debug_entry *ref) -{ - struct dma_debug_entry *entry; - struct hash_bucket *bucket; - unsigned long flags; - int mapped_ents; - - bucket = get_hash_bucket(ref, &flags); - entry = bucket_find_exact(bucket, ref); - mapped_ents = 0; - - if (entry) - mapped_ents = entry->sg_mapped_ents; - put_hash_bucket(bucket, &flags); - - return mapped_ents; -} - -void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, - int nelems, int dir) -{ - struct scatterlist *s; - int mapped_ents = 0, i; - - if (unlikely(dma_debug_disabled())) - return; - - for_each_sg(sglist, s, nelems, i) { - - struct dma_debug_entry ref = { - .type = dma_debug_sg, - .dev = dev, - .pfn = page_to_pfn(sg_page(s)), - .offset = s->offset, - .dev_addr = sg_dma_address(s), - .size = sg_dma_len(s), - .direction = dir, - .sg_call_ents = nelems, - }; - - if (mapped_ents && i >= mapped_ents) - break; - - if (!i) - mapped_ents = get_nr_mapped_entries(dev, &ref); - - check_unmap(&ref); - } -} -EXPORT_SYMBOL(debug_dma_unmap_sg); - -void debug_dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t dma_addr, void *virt) -{ - struct dma_debug_entry *entry; - - if (unlikely(dma_debug_disabled())) - return; - - if (unlikely(virt == NULL)) - return; - - /* handle vmalloc and linear addresses */ - if (!is_vmalloc_addr(virt) && !virt_addr_valid(virt)) - return; - - entry = dma_entry_alloc(); - if (!entry) - return; - - entry->type = dma_debug_coherent; - entry->dev = dev; - entry->offset = offset_in_page(virt); - entry->size = size; - entry->dev_addr = dma_addr; - entry->direction = DMA_BIDIRECTIONAL; - - if (is_vmalloc_addr(virt)) - entry->pfn = vmalloc_to_pfn(virt); - else - entry->pfn = page_to_pfn(virt_to_page(virt)); - - add_dma_entry(entry); -} -EXPORT_SYMBOL(debug_dma_alloc_coherent); - -void debug_dma_free_coherent(struct device *dev, size_t size, - void *virt, dma_addr_t addr) -{ - struct dma_debug_entry ref = { - .type = dma_debug_coherent, - .dev = dev, - .offset = offset_in_page(virt), - .dev_addr = addr, - .size = size, - .direction = DMA_BIDIRECTIONAL, - }; - - /* handle vmalloc and linear addresses */ - if (!is_vmalloc_addr(virt) && !virt_addr_valid(virt)) - return; - - if (is_vmalloc_addr(virt)) - ref.pfn = vmalloc_to_pfn(virt); - else - ref.pfn = page_to_pfn(virt_to_page(virt)); - - if (unlikely(dma_debug_disabled())) - return; - - check_unmap(&ref); -} -EXPORT_SYMBOL(debug_dma_free_coherent); - -void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, - int direction, dma_addr_t dma_addr) -{ - struct dma_debug_entry *entry; - - if (unlikely(dma_debug_disabled())) - return; - - entry = dma_entry_alloc(); - if (!entry) - return; - - entry->type = dma_debug_resource; - entry->dev = dev; - entry->pfn = PHYS_PFN(addr); - entry->offset = offset_in_page(addr); - entry->size = size; - entry->dev_addr = dma_addr; - entry->direction = direction; - entry->map_err_type = MAP_ERR_NOT_CHECKED; - - add_dma_entry(entry); -} -EXPORT_SYMBOL(debug_dma_map_resource); - -void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction) -{ - struct dma_debug_entry ref = { - .type = dma_debug_resource, - .dev = dev, - .dev_addr = dma_addr, - .size = size, - .direction = direction, - }; - - if (unlikely(dma_debug_disabled())) - return; - - check_unmap(&ref); -} -EXPORT_SYMBOL(debug_dma_unmap_resource); - -void debug_dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, - size_t size, int direction) -{ - struct dma_debug_entry ref; - - if (unlikely(dma_debug_disabled())) - return; - - ref.type = dma_debug_single; - ref.dev = dev; - ref.dev_addr = dma_handle; - ref.size = size; - ref.direction = direction; - ref.sg_call_ents = 0; - - check_sync(dev, &ref, true); -} -EXPORT_SYMBOL(debug_dma_sync_single_for_cpu); - -void debug_dma_sync_single_for_device(struct device *dev, - dma_addr_t dma_handle, size_t size, - int direction) -{ - struct dma_debug_entry ref; - - if (unlikely(dma_debug_disabled())) - return; - - ref.type = dma_debug_single; - ref.dev = dev; - ref.dev_addr = dma_handle; - ref.size = size; - ref.direction = direction; - ref.sg_call_ents = 0; - - check_sync(dev, &ref, false); -} -EXPORT_SYMBOL(debug_dma_sync_single_for_device); - -void debug_dma_sync_single_range_for_cpu(struct device *dev, - dma_addr_t dma_handle, - unsigned long offset, size_t size, - int direction) -{ - struct dma_debug_entry ref; - - if (unlikely(dma_debug_disabled())) - return; - - ref.type = dma_debug_single; - ref.dev = dev; - ref.dev_addr = dma_handle; - ref.size = offset + size; - ref.direction = direction; - ref.sg_call_ents = 0; - - check_sync(dev, &ref, true); -} -EXPORT_SYMBOL(debug_dma_sync_single_range_for_cpu); - -void debug_dma_sync_single_range_for_device(struct device *dev, - dma_addr_t dma_handle, - unsigned long offset, - size_t size, int direction) -{ - struct dma_debug_entry ref; - - if (unlikely(dma_debug_disabled())) - return; - - ref.type = dma_debug_single; - ref.dev = dev; - ref.dev_addr = dma_handle; - ref.size = offset + size; - ref.direction = direction; - ref.sg_call_ents = 0; - - check_sync(dev, &ref, false); -} -EXPORT_SYMBOL(debug_dma_sync_single_range_for_device); - -void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, - int nelems, int direction) -{ - struct scatterlist *s; - int mapped_ents = 0, i; - - if (unlikely(dma_debug_disabled())) - return; - - for_each_sg(sg, s, nelems, i) { - - struct dma_debug_entry ref = { - .type = dma_debug_sg, - .dev = dev, - .pfn = page_to_pfn(sg_page(s)), - .offset = s->offset, - .dev_addr = sg_dma_address(s), - .size = sg_dma_len(s), - .direction = direction, - .sg_call_ents = nelems, - }; - - if (!i) - mapped_ents = get_nr_mapped_entries(dev, &ref); - - if (i >= mapped_ents) - break; - - check_sync(dev, &ref, true); - } -} -EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu); - -void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, - int nelems, int direction) -{ - struct scatterlist *s; - int mapped_ents = 0, i; - - if (unlikely(dma_debug_disabled())) - return; - - for_each_sg(sg, s, nelems, i) { - - struct dma_debug_entry ref = { - .type = dma_debug_sg, - .dev = dev, - .pfn = page_to_pfn(sg_page(s)), - .offset = s->offset, - .dev_addr = sg_dma_address(s), - .size = sg_dma_len(s), - .direction = direction, - .sg_call_ents = nelems, - }; - if (!i) - mapped_ents = get_nr_mapped_entries(dev, &ref); - - if (i >= mapped_ents) - break; - - check_sync(dev, &ref, false); - } -} -EXPORT_SYMBOL(debug_dma_sync_sg_for_device); - -static int __init dma_debug_driver_setup(char *str) -{ - int i; - - for (i = 0; i < NAME_MAX_LEN - 1; ++i, ++str) { - current_driver_name[i] = *str; - if (*str == 0) - break; - } - - if (current_driver_name[0]) - pr_info("DMA-API: enable driver filter for driver [%s]\n", - current_driver_name); - - - return 1; -} -__setup("dma_debug_driver=", dma_debug_driver_setup); diff --git a/lib/dma-direct.c b/lib/dma-direct.c deleted file mode 100644 index 8be8106270c2..000000000000 --- a/lib/dma-direct.c +++ /dev/null @@ -1,204 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * DMA operations that map physical memory directly without using an IOMMU or - * flushing caches. - */ -#include -#include -#include -#include -#include -#include -#include - -#define DIRECT_MAPPING_ERROR 0 - -/* - * Most architectures use ZONE_DMA for the first 16 Megabytes, but - * some use it for entirely different regions: - */ -#ifndef ARCH_ZONE_DMA_BITS -#define ARCH_ZONE_DMA_BITS 24 -#endif - -/* - * For AMD SEV all DMA must be to unencrypted addresses. - */ -static inline bool force_dma_unencrypted(void) -{ - return sev_active(); -} - -static bool -check_addr(struct device *dev, dma_addr_t dma_addr, size_t size, - const char *caller) -{ - if (unlikely(dev && !dma_capable(dev, dma_addr, size))) { - if (!dev->dma_mask) { - dev_err(dev, - "%s: call on device without dma_mask\n", - caller); - return false; - } - - if (*dev->dma_mask >= DMA_BIT_MASK(32)) { - dev_err(dev, - "%s: overflow %pad+%zu of device mask %llx\n", - caller, &dma_addr, size, *dev->dma_mask); - } - return false; - } - return true; -} - -static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) -{ - dma_addr_t addr = force_dma_unencrypted() ? - __phys_to_dma(dev, phys) : phys_to_dma(dev, phys); - return addr + size - 1 <= dev->coherent_dma_mask; -} - -void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, unsigned long attrs) -{ - unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; - int page_order = get_order(size); - struct page *page = NULL; - void *ret; - - /* we always manually zero the memory once we are done: */ - gfp &= ~__GFP_ZERO; - - /* GFP_DMA32 and GFP_DMA are no ops without the corresponding zones: */ - if (dev->coherent_dma_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) - gfp |= GFP_DMA; - if (dev->coherent_dma_mask <= DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) - gfp |= GFP_DMA32; - -again: - /* CMA can be used only in the context which permits sleeping */ - if (gfpflags_allow_blocking(gfp)) { - page = dma_alloc_from_contiguous(dev, count, page_order, gfp); - if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - dma_release_from_contiguous(dev, page, count); - page = NULL; - } - } - if (!page) - page = alloc_pages_node(dev_to_node(dev), gfp, page_order); - - if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - __free_pages(page, page_order); - page = NULL; - - if (IS_ENABLED(CONFIG_ZONE_DMA32) && - dev->coherent_dma_mask < DMA_BIT_MASK(64) && - !(gfp & (GFP_DMA32 | GFP_DMA))) { - gfp |= GFP_DMA32; - goto again; - } - - if (IS_ENABLED(CONFIG_ZONE_DMA) && - dev->coherent_dma_mask < DMA_BIT_MASK(32) && - !(gfp & GFP_DMA)) { - gfp = (gfp & ~GFP_DMA32) | GFP_DMA; - goto again; - } - } - - if (!page) - return NULL; - ret = page_address(page); - if (force_dma_unencrypted()) { - set_memory_decrypted((unsigned long)ret, 1 << page_order); - *dma_handle = __phys_to_dma(dev, page_to_phys(page)); - } else { - *dma_handle = phys_to_dma(dev, page_to_phys(page)); - } - memset(ret, 0, size); - return ret; -} - -/* - * NOTE: this function must never look at the dma_addr argument, because we want - * to be able to use it as a helper for iommu implementations as well. - */ -void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, - dma_addr_t dma_addr, unsigned long attrs) -{ - unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; - unsigned int page_order = get_order(size); - - if (force_dma_unencrypted()) - set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); - if (!dma_release_from_contiguous(dev, virt_to_page(cpu_addr), count)) - free_pages((unsigned long)cpu_addr, page_order); -} - -dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - dma_addr_t dma_addr = phys_to_dma(dev, page_to_phys(page)) + offset; - - if (!check_addr(dev, dma_addr, size, __func__)) - return DIRECT_MAPPING_ERROR; - return dma_addr; -} - -int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, - enum dma_data_direction dir, unsigned long attrs) -{ - int i; - struct scatterlist *sg; - - for_each_sg(sgl, sg, nents, i) { - BUG_ON(!sg_page(sg)); - - sg_dma_address(sg) = phys_to_dma(dev, sg_phys(sg)); - if (!check_addr(dev, sg_dma_address(sg), sg->length, __func__)) - return 0; - sg_dma_len(sg) = sg->length; - } - - return nents; -} - -int dma_direct_supported(struct device *dev, u64 mask) -{ -#ifdef CONFIG_ZONE_DMA - if (mask < DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) - return 0; -#else - /* - * Because 32-bit DMA masks are so common we expect every architecture - * to be able to satisfy them - either by not supporting more physical - * memory, or by providing a ZONE_DMA32. If neither is the case, the - * architecture needs to use an IOMMU instead of the direct mapping. - */ - if (mask < DMA_BIT_MASK(32)) - return 0; -#endif - /* - * Various PCI/PCIe bridges have broken support for > 32bit DMA even - * if the device itself might support it. - */ - if (dev->dma_32bit_limit && mask > DMA_BIT_MASK(32)) - return 0; - return 1; -} - -int dma_direct_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - return dma_addr == DIRECT_MAPPING_ERROR; -} - -const struct dma_map_ops dma_direct_ops = { - .alloc = dma_direct_alloc, - .free = dma_direct_free, - .map_page = dma_direct_map_page, - .map_sg = dma_direct_map_sg, - .dma_supported = dma_direct_supported, - .mapping_error = dma_direct_mapping_error, -}; -EXPORT_SYMBOL(dma_direct_ops); diff --git a/lib/dma-noncoherent.c b/lib/dma-noncoherent.c deleted file mode 100644 index 79e9a757387f..000000000000 --- a/lib/dma-noncoherent.c +++ /dev/null @@ -1,102 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2018 Christoph Hellwig. - * - * DMA operations that map physical memory directly without providing cache - * coherence. - */ -#include -#include -#include -#include -#include - -static void dma_noncoherent_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ - arch_sync_dma_for_device(dev, dma_to_phys(dev, addr), size, dir); -} - -static void dma_noncoherent_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nents, i) - arch_sync_dma_for_device(dev, sg_phys(sg), sg->length, dir); -} - -static dma_addr_t dma_noncoherent_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - dma_addr_t addr; - - addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); - if (!dma_mapping_error(dev, addr) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(dev, page_to_phys(page) + offset, - size, dir); - return addr; -} - -static int dma_noncoherent_map_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, unsigned long attrs) -{ - nents = dma_direct_map_sg(dev, sgl, nents, dir, attrs); - if (nents > 0 && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_noncoherent_sync_sg_for_device(dev, sgl, nents, dir); - return nents; -} - -#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU -static void dma_noncoherent_sync_single_for_cpu(struct device *dev, - dma_addr_t addr, size_t size, enum dma_data_direction dir) -{ - arch_sync_dma_for_cpu(dev, dma_to_phys(dev, addr), size, dir); -} - -static void dma_noncoherent_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nents, enum dma_data_direction dir) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nents, i) - arch_sync_dma_for_cpu(dev, sg_phys(sg), sg->length, dir); -} - -static void dma_noncoherent_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_noncoherent_sync_single_for_cpu(dev, addr, size, dir); -} - -static void dma_noncoherent_unmap_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, unsigned long attrs) -{ - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_noncoherent_sync_sg_for_cpu(dev, sgl, nents, dir); -} -#endif - -const struct dma_map_ops dma_noncoherent_ops = { - .alloc = arch_dma_alloc, - .free = arch_dma_free, - .mmap = arch_dma_mmap, - .sync_single_for_device = dma_noncoherent_sync_single_for_device, - .sync_sg_for_device = dma_noncoherent_sync_sg_for_device, - .map_page = dma_noncoherent_map_page, - .map_sg = dma_noncoherent_map_sg, -#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU - .sync_single_for_cpu = dma_noncoherent_sync_single_for_cpu, - .sync_sg_for_cpu = dma_noncoherent_sync_sg_for_cpu, - .unmap_page = dma_noncoherent_unmap_page, - .unmap_sg = dma_noncoherent_unmap_sg, -#endif - .dma_supported = dma_direct_supported, - .mapping_error = dma_direct_mapping_error, - .cache_sync = arch_dma_cache_sync, -}; -EXPORT_SYMBOL(dma_noncoherent_ops); diff --git a/lib/dma-virt.c b/lib/dma-virt.c deleted file mode 100644 index 8e61a02ef9ca..000000000000 --- a/lib/dma-virt.c +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * lib/dma-virt.c - * - * DMA operations that map to virtual addresses without flushing memory. - */ -#include -#include -#include -#include - -static void *dma_virt_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, - unsigned long attrs) -{ - void *ret; - - ret = (void *)__get_free_pages(gfp, get_order(size)); - if (ret) - *dma_handle = (uintptr_t)ret; - return ret; -} - -static void dma_virt_free(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_addr, - unsigned long attrs) -{ - free_pages((unsigned long)cpu_addr, get_order(size)); -} - -static dma_addr_t dma_virt_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - return (uintptr_t)(page_address(page) + offset); -} - -static int dma_virt_map_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, - unsigned long attrs) -{ - int i; - struct scatterlist *sg; - - for_each_sg(sgl, sg, nents, i) { - BUG_ON(!sg_page(sg)); - sg_dma_address(sg) = (uintptr_t)sg_virt(sg); - sg_dma_len(sg) = sg->length; - } - - return nents; -} - -const struct dma_map_ops dma_virt_ops = { - .alloc = dma_virt_alloc, - .free = dma_virt_free, - .map_page = dma_virt_map_page, - .map_sg = dma_virt_map_sg, -}; -EXPORT_SYMBOL(dma_virt_ops); diff --git a/lib/swiotlb.c b/lib/swiotlb.c deleted file mode 100644 index 04b68d9dffac..000000000000 --- a/lib/swiotlb.c +++ /dev/null @@ -1,1087 +0,0 @@ -/* - * Dynamic DMA mapping support. - * - * This implementation is a fallback for platforms that do not support - * I/O TLBs (aka DMA address translation hardware). - * Copyright (C) 2000 Asit Mallick - * Copyright (C) 2000 Goutham Rao - * Copyright (C) 2000, 2003 Hewlett-Packard Co - * David Mosberger-Tang - * - * 03/05/07 davidm Switch from PCI-DMA to generic device DMA API. - * 00/12/13 davidm Rename to swiotlb.c and add mark_clean() to avoid - * unnecessary i-cache flushing. - * 04/07/.. ak Better overflow handling. Assorted fixes. - * 05/09/10 linville Add support for syncing ranges, support syncing for - * DMA_BIDIRECTIONAL mappings, miscellaneous cleanup. - * 08/12/11 beckyb Add highmem support - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include - -#define CREATE_TRACE_POINTS -#include - -#define OFFSET(val,align) ((unsigned long) \ - ( (val) & ( (align) - 1))) - -#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) - -/* - * Minimum IO TLB size to bother booting with. Systems with mainly - * 64bit capable cards will only lightly use the swiotlb. If we can't - * allocate a contiguous 1MB, we're probably in trouble anyway. - */ -#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) - -enum swiotlb_force swiotlb_force; - -/* - * Used to do a quick range check in swiotlb_tbl_unmap_single and - * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this - * API. - */ -static phys_addr_t io_tlb_start, io_tlb_end; - -/* - * The number of IO TLB blocks (in groups of 64) between io_tlb_start and - * io_tlb_end. This is command line adjustable via setup_io_tlb_npages. - */ -static unsigned long io_tlb_nslabs; - -/* - * When the IOMMU overflows we return a fallback buffer. This sets the size. - */ -static unsigned long io_tlb_overflow = 32*1024; - -static phys_addr_t io_tlb_overflow_buffer; - -/* - * This is a free list describing the number of free entries available from - * each index - */ -static unsigned int *io_tlb_list; -static unsigned int io_tlb_index; - -/* - * Max segment that we can provide which (if pages are contingous) will - * not be bounced (unless SWIOTLB_FORCE is set). - */ -unsigned int max_segment; - -/* - * We need to save away the original address corresponding to a mapped entry - * for the sync operations. - */ -#define INVALID_PHYS_ADDR (~(phys_addr_t)0) -static phys_addr_t *io_tlb_orig_addr; - -/* - * Protect the above data structures in the map and unmap calls - */ -static DEFINE_SPINLOCK(io_tlb_lock); - -static int late_alloc; - -static int __init -setup_io_tlb_npages(char *str) -{ - if (isdigit(*str)) { - io_tlb_nslabs = simple_strtoul(str, &str, 0); - /* avoid tail segment of size < IO_TLB_SEGSIZE */ - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); - } - if (*str == ',') - ++str; - if (!strcmp(str, "force")) { - swiotlb_force = SWIOTLB_FORCE; - } else if (!strcmp(str, "noforce")) { - swiotlb_force = SWIOTLB_NO_FORCE; - io_tlb_nslabs = 1; - } - - return 0; -} -early_param("swiotlb", setup_io_tlb_npages); -/* make io_tlb_overflow tunable too? */ - -unsigned long swiotlb_nr_tbl(void) -{ - return io_tlb_nslabs; -} -EXPORT_SYMBOL_GPL(swiotlb_nr_tbl); - -unsigned int swiotlb_max_segment(void) -{ - return max_segment; -} -EXPORT_SYMBOL_GPL(swiotlb_max_segment); - -void swiotlb_set_max_segment(unsigned int val) -{ - if (swiotlb_force == SWIOTLB_FORCE) - max_segment = 1; - else - max_segment = rounddown(val, PAGE_SIZE); -} - -/* default to 64MB */ -#define IO_TLB_DEFAULT_SIZE (64UL<<20) -unsigned long swiotlb_size_or_default(void) -{ - unsigned long size; - - size = io_tlb_nslabs << IO_TLB_SHIFT; - - return size ? size : (IO_TLB_DEFAULT_SIZE); -} - -static bool no_iotlb_memory; - -void swiotlb_print_info(void) -{ - unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; - unsigned char *vstart, *vend; - - if (no_iotlb_memory) { - pr_warn("software IO TLB: No low mem\n"); - return; - } - - vstart = phys_to_virt(io_tlb_start); - vend = phys_to_virt(io_tlb_end); - - printk(KERN_INFO "software IO TLB [mem %#010llx-%#010llx] (%luMB) mapped at [%p-%p]\n", - (unsigned long long)io_tlb_start, - (unsigned long long)io_tlb_end, - bytes >> 20, vstart, vend - 1); -} - -/* - * Early SWIOTLB allocation may be too early to allow an architecture to - * perform the desired operations. This function allows the architecture to - * call SWIOTLB when the operations are possible. It needs to be called - * before the SWIOTLB memory is used. - */ -void __init swiotlb_update_mem_attributes(void) -{ - void *vaddr; - unsigned long bytes; - - if (no_iotlb_memory || late_alloc) - return; - - vaddr = phys_to_virt(io_tlb_start); - bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT); - set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); - memset(vaddr, 0, bytes); - - vaddr = phys_to_virt(io_tlb_overflow_buffer); - bytes = PAGE_ALIGN(io_tlb_overflow); - set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); - memset(vaddr, 0, bytes); -} - -int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) -{ - void *v_overflow_buffer; - unsigned long i, bytes; - - bytes = nslabs << IO_TLB_SHIFT; - - io_tlb_nslabs = nslabs; - io_tlb_start = __pa(tlb); - io_tlb_end = io_tlb_start + bytes; - - /* - * Get the overflow emergency buffer - */ - v_overflow_buffer = memblock_virt_alloc_low_nopanic( - PAGE_ALIGN(io_tlb_overflow), - PAGE_SIZE); - if (!v_overflow_buffer) - return -ENOMEM; - - io_tlb_overflow_buffer = __pa(v_overflow_buffer); - - /* - * Allocate and initialize the free list array. This array is used - * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE - * between io_tlb_start and io_tlb_end. - */ - io_tlb_list = memblock_virt_alloc( - PAGE_ALIGN(io_tlb_nslabs * sizeof(int)), - PAGE_SIZE); - io_tlb_orig_addr = memblock_virt_alloc( - PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)), - PAGE_SIZE); - for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - } - io_tlb_index = 0; - - if (verbose) - swiotlb_print_info(); - - swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT); - return 0; -} - -/* - * Statically reserve bounce buffer space and initialize bounce buffer data - * structures for the software IO TLB used to implement the DMA API. - */ -void __init -swiotlb_init(int verbose) -{ - size_t default_size = IO_TLB_DEFAULT_SIZE; - unsigned char *vstart; - unsigned long bytes; - - if (!io_tlb_nslabs) { - io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); - } - - bytes = io_tlb_nslabs << IO_TLB_SHIFT; - - /* Get IO TLB memory from the low pages */ - vstart = memblock_virt_alloc_low_nopanic(PAGE_ALIGN(bytes), PAGE_SIZE); - if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) - return; - - if (io_tlb_start) - memblock_free_early(io_tlb_start, - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); - pr_warn("Cannot allocate SWIOTLB buffer"); - no_iotlb_memory = true; -} - -/* - * Systems with larger DMA zones (those that don't support ISA) can - * initialize the swiotlb later using the slab allocator if needed. - * This should be just like above, but with some error catching. - */ -int -swiotlb_late_init_with_default_size(size_t default_size) -{ - unsigned long bytes, req_nslabs = io_tlb_nslabs; - unsigned char *vstart = NULL; - unsigned int order; - int rc = 0; - - if (!io_tlb_nslabs) { - io_tlb_nslabs = (default_size >> IO_TLB_SHIFT); - io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); - } - - /* - * Get IO TLB memory from the low pages - */ - order = get_order(io_tlb_nslabs << IO_TLB_SHIFT); - io_tlb_nslabs = SLABS_PER_PAGE << order; - bytes = io_tlb_nslabs << IO_TLB_SHIFT; - - while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { - vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN, - order); - if (vstart) - break; - order--; - } - - if (!vstart) { - io_tlb_nslabs = req_nslabs; - return -ENOMEM; - } - if (order != get_order(bytes)) { - printk(KERN_WARNING "Warning: only able to allocate %ld MB " - "for software IO TLB\n", (PAGE_SIZE << order) >> 20); - io_tlb_nslabs = SLABS_PER_PAGE << order; - } - rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs); - if (rc) - free_pages((unsigned long)vstart, order); - - return rc; -} - -int -swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) -{ - unsigned long i, bytes; - unsigned char *v_overflow_buffer; - - bytes = nslabs << IO_TLB_SHIFT; - - io_tlb_nslabs = nslabs; - io_tlb_start = virt_to_phys(tlb); - io_tlb_end = io_tlb_start + bytes; - - set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT); - memset(tlb, 0, bytes); - - /* - * Get the overflow emergency buffer - */ - v_overflow_buffer = (void *)__get_free_pages(GFP_DMA, - get_order(io_tlb_overflow)); - if (!v_overflow_buffer) - goto cleanup2; - - set_memory_decrypted((unsigned long)v_overflow_buffer, - io_tlb_overflow >> PAGE_SHIFT); - memset(v_overflow_buffer, 0, io_tlb_overflow); - io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer); - - /* - * Allocate and initialize the free list array. This array is used - * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE - * between io_tlb_start and io_tlb_end. - */ - io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, - get_order(io_tlb_nslabs * sizeof(int))); - if (!io_tlb_list) - goto cleanup3; - - io_tlb_orig_addr = (phys_addr_t *) - __get_free_pages(GFP_KERNEL, - get_order(io_tlb_nslabs * - sizeof(phys_addr_t))); - if (!io_tlb_orig_addr) - goto cleanup4; - - for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - } - io_tlb_index = 0; - - swiotlb_print_info(); - - late_alloc = 1; - - swiotlb_set_max_segment(io_tlb_nslabs << IO_TLB_SHIFT); - - return 0; - -cleanup4: - free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * - sizeof(int))); - io_tlb_list = NULL; -cleanup3: - free_pages((unsigned long)v_overflow_buffer, - get_order(io_tlb_overflow)); - io_tlb_overflow_buffer = 0; -cleanup2: - io_tlb_end = 0; - io_tlb_start = 0; - io_tlb_nslabs = 0; - max_segment = 0; - return -ENOMEM; -} - -void __init swiotlb_exit(void) -{ - if (!io_tlb_orig_addr) - return; - - if (late_alloc) { - free_pages((unsigned long)phys_to_virt(io_tlb_overflow_buffer), - get_order(io_tlb_overflow)); - free_pages((unsigned long)io_tlb_orig_addr, - get_order(io_tlb_nslabs * sizeof(phys_addr_t))); - free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * - sizeof(int))); - free_pages((unsigned long)phys_to_virt(io_tlb_start), - get_order(io_tlb_nslabs << IO_TLB_SHIFT)); - } else { - memblock_free_late(io_tlb_overflow_buffer, - PAGE_ALIGN(io_tlb_overflow)); - memblock_free_late(__pa(io_tlb_orig_addr), - PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); - memblock_free_late(__pa(io_tlb_list), - PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); - memblock_free_late(io_tlb_start, - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); - } - io_tlb_nslabs = 0; - max_segment = 0; -} - -int is_swiotlb_buffer(phys_addr_t paddr) -{ - return paddr >= io_tlb_start && paddr < io_tlb_end; -} - -/* - * Bounce: copy the swiotlb buffer back to the original dma location - */ -static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir) -{ - unsigned long pfn = PFN_DOWN(orig_addr); - unsigned char *vaddr = phys_to_virt(tlb_addr); - - if (PageHighMem(pfn_to_page(pfn))) { - /* The buffer does not have a mapping. Map it in and copy */ - unsigned int offset = orig_addr & ~PAGE_MASK; - char *buffer; - unsigned int sz = 0; - unsigned long flags; - - while (size) { - sz = min_t(size_t, PAGE_SIZE - offset, size); - - local_irq_save(flags); - buffer = kmap_atomic(pfn_to_page(pfn)); - if (dir == DMA_TO_DEVICE) - memcpy(vaddr, buffer + offset, sz); - else - memcpy(buffer + offset, vaddr, sz); - kunmap_atomic(buffer); - local_irq_restore(flags); - - size -= sz; - pfn++; - vaddr += sz; - offset = 0; - } - } else if (dir == DMA_TO_DEVICE) { - memcpy(vaddr, phys_to_virt(orig_addr), size); - } else { - memcpy(phys_to_virt(orig_addr), vaddr, size); - } -} - -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t orig_addr, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - unsigned long flags; - phys_addr_t tlb_addr; - unsigned int nslots, stride, index, wrap; - int i; - unsigned long mask; - unsigned long offset_slots; - unsigned long max_slots; - - if (no_iotlb_memory) - panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); - - if (mem_encrypt_active()) - pr_warn_once("%s is active and system is using DMA bounce buffers\n", - sme_active() ? "SME" : "SEV"); - - mask = dma_get_seg_boundary(hwdev); - - tbl_dma_addr &= mask; - - offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; - - /* - * Carefully handle integer overflow which can occur when mask == ~0UL. - */ - max_slots = mask + 1 - ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT - : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); - - /* - * For mappings greater than or equal to a page, we limit the stride - * (and hence alignment) to a page size. - */ - nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; - if (size >= PAGE_SIZE) - stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); - else - stride = 1; - - BUG_ON(!nslots); - - /* - * Find suitable number of IO TLB entries size that will fit this - * request and allocate a buffer from that IO TLB pool. - */ - spin_lock_irqsave(&io_tlb_lock, flags); - index = ALIGN(io_tlb_index, stride); - if (index >= io_tlb_nslabs) - index = 0; - wrap = index; - - do { - while (iommu_is_span_boundary(index, nslots, offset_slots, - max_slots)) { - index += stride; - if (index >= io_tlb_nslabs) - index = 0; - if (index == wrap) - goto not_found; - } - - /* - * If we find a slot that indicates we have 'nslots' number of - * contiguous buffers, we allocate the buffers from that slot - * and mark the entries as '0' indicating unavailable. - */ - if (io_tlb_list[index] >= nslots) { - int count = 0; - - for (i = index; i < (int) (index + nslots); i++) - io_tlb_list[i] = 0; - for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) - io_tlb_list[i] = ++count; - tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT); - - /* - * Update the indices to avoid searching in the next - * round. - */ - io_tlb_index = ((index + nslots) < io_tlb_nslabs - ? (index + nslots) : 0); - - goto found; - } - index += stride; - if (index >= io_tlb_nslabs) - index = 0; - } while (index != wrap); - -not_found: - spin_unlock_irqrestore(&io_tlb_lock, flags); - if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) - dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size); - return SWIOTLB_MAP_ERROR; -found: - spin_unlock_irqrestore(&io_tlb_lock, flags); - - /* - * Save away the mapping from the original address to the DMA address. - * This is needed when we sync the memory. Then we sync the buffer if - * needed. - */ - for (i = 0; i < nslots; i++) - io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE); - - return tlb_addr; -} - -/* - * Allocates bounce buffer and returns its physical address. - */ -static phys_addr_t -map_single(struct device *hwdev, phys_addr_t phys, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - dma_addr_t start_dma_addr; - - if (swiotlb_force == SWIOTLB_NO_FORCE) { - dev_warn_ratelimited(hwdev, "Cannot do DMA to address %pa\n", - &phys); - return SWIOTLB_MAP_ERROR; - } - - start_dma_addr = __phys_to_dma(hwdev, io_tlb_start); - return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, - dir, attrs); -} - -/* - * tlb_addr is the physical address of the bounce buffer to unmap. - */ -void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - unsigned long flags; - int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; - int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; - phys_addr_t orig_addr = io_tlb_orig_addr[index]; - - /* - * First, sync the memory before unmapping the entry - */ - if (orig_addr != INVALID_PHYS_ADDR && - !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL))) - swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE); - - /* - * Return the buffer to the free list by setting the corresponding - * entries to indicate the number of contiguous entries available. - * While returning the entries to the free list, we merge the entries - * with slots below and above the pool being returned. - */ - spin_lock_irqsave(&io_tlb_lock, flags); - { - count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? - io_tlb_list[index + nslots] : 0); - /* - * Step 1: return the slots to the free list, merging the - * slots with superceeding slots - */ - for (i = index + nslots - 1; i >= index; i--) { - io_tlb_list[i] = ++count; - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - } - /* - * Step 2: merge the returned slots with the preceding slots, - * if available (non zero) - */ - for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--) - io_tlb_list[i] = ++count; - } - spin_unlock_irqrestore(&io_tlb_lock, flags); -} - -void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, - size_t size, enum dma_data_direction dir, - enum dma_sync_target target) -{ - int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; - phys_addr_t orig_addr = io_tlb_orig_addr[index]; - - if (orig_addr == INVALID_PHYS_ADDR) - return; - orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1); - - switch (target) { - case SYNC_FOR_CPU: - if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, - size, DMA_FROM_DEVICE); - else - BUG_ON(dir != DMA_TO_DEVICE); - break; - case SYNC_FOR_DEVICE: - if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(orig_addr, tlb_addr, - size, DMA_TO_DEVICE); - else - BUG_ON(dir != DMA_FROM_DEVICE); - break; - default: - BUG(); - } -} - -static inline bool dma_coherent_ok(struct device *dev, dma_addr_t addr, - size_t size) -{ - u64 mask = DMA_BIT_MASK(32); - - if (dev && dev->coherent_dma_mask) - mask = dev->coherent_dma_mask; - return addr + size - 1 <= mask; -} - -static void * -swiotlb_alloc_buffer(struct device *dev, size_t size, dma_addr_t *dma_handle, - unsigned long attrs) -{ - phys_addr_t phys_addr; - - if (swiotlb_force == SWIOTLB_NO_FORCE) - goto out_warn; - - phys_addr = swiotlb_tbl_map_single(dev, - __phys_to_dma(dev, io_tlb_start), - 0, size, DMA_FROM_DEVICE, attrs); - if (phys_addr == SWIOTLB_MAP_ERROR) - goto out_warn; - - *dma_handle = __phys_to_dma(dev, phys_addr); - if (!dma_coherent_ok(dev, *dma_handle, size)) - goto out_unmap; - - memset(phys_to_virt(phys_addr), 0, size); - return phys_to_virt(phys_addr); - -out_unmap: - dev_warn(dev, "hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n", - (unsigned long long)dev->coherent_dma_mask, - (unsigned long long)*dma_handle); - - /* - * DMA_TO_DEVICE to avoid memcpy in unmap_single. - * DMA_ATTR_SKIP_CPU_SYNC is optional. - */ - swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE, - DMA_ATTR_SKIP_CPU_SYNC); -out_warn: - if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) { - dev_warn(dev, - "swiotlb: coherent allocation failed, size=%zu\n", - size); - dump_stack(); - } - return NULL; -} - -static bool swiotlb_free_buffer(struct device *dev, size_t size, - dma_addr_t dma_addr) -{ - phys_addr_t phys_addr = dma_to_phys(dev, dma_addr); - - WARN_ON_ONCE(irqs_disabled()); - - if (!is_swiotlb_buffer(phys_addr)) - return false; - - /* - * DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single. - * DMA_ATTR_SKIP_CPU_SYNC is optional. - */ - swiotlb_tbl_unmap_single(dev, phys_addr, size, DMA_TO_DEVICE, - DMA_ATTR_SKIP_CPU_SYNC); - return true; -} - -static void -swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir, - int do_panic) -{ - if (swiotlb_force == SWIOTLB_NO_FORCE) - return; - - /* - * Ran out of IOMMU space for this operation. This is very bad. - * Unfortunately the drivers cannot handle this operation properly. - * unless they check for dma_mapping_error (most don't) - * When the mapping is small enough return a static buffer to limit - * the damage, or panic when the transfer is too big. - */ - dev_err_ratelimited(dev, "DMA: Out of SW-IOMMU space for %zu bytes\n", - size); - - if (size <= io_tlb_overflow || !do_panic) - return; - - if (dir == DMA_BIDIRECTIONAL) - panic("DMA: Random memory could be DMA accessed\n"); - if (dir == DMA_FROM_DEVICE) - panic("DMA: Random memory could be DMA written\n"); - if (dir == DMA_TO_DEVICE) - panic("DMA: Random memory could be DMA read\n"); -} - -/* - * Map a single buffer of the indicated size for DMA in streaming mode. The - * physical address to use is returned. - * - * Once the device is given the dma address, the device owns this memory until - * either swiotlb_unmap_page or swiotlb_dma_sync_single is performed. - */ -dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - phys_addr_t map, phys = page_to_phys(page) + offset; - dma_addr_t dev_addr = phys_to_dma(dev, phys); - - BUG_ON(dir == DMA_NONE); - /* - * If the address happens to be in the device's DMA window, - * we can safely return the device addr and not worry about bounce - * buffering it. - */ - if (dma_capable(dev, dev_addr, size) && swiotlb_force != SWIOTLB_FORCE) - return dev_addr; - - trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); - - /* Oh well, have to allocate and map a bounce buffer. */ - map = map_single(dev, phys, size, dir, attrs); - if (map == SWIOTLB_MAP_ERROR) { - swiotlb_full(dev, size, dir, 1); - return __phys_to_dma(dev, io_tlb_overflow_buffer); - } - - dev_addr = __phys_to_dma(dev, map); - - /* Ensure that the address returned is DMA'ble */ - if (dma_capable(dev, dev_addr, size)) - return dev_addr; - - attrs |= DMA_ATTR_SKIP_CPU_SYNC; - swiotlb_tbl_unmap_single(dev, map, size, dir, attrs); - - return __phys_to_dma(dev, io_tlb_overflow_buffer); -} - -/* - * Unmap a single streaming mode DMA translation. The dma_addr and size must - * match what was provided for in a previous swiotlb_map_page call. All - * other usages are undefined. - * - * After this call, reads by the cpu to the buffer are guaranteed to see - * whatever the device wrote there. - */ -static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); - - BUG_ON(dir == DMA_NONE); - - if (is_swiotlb_buffer(paddr)) { - swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs); - return; - } - - if (dir != DMA_FROM_DEVICE) - return; - - /* - * phys_to_virt doesn't work with hihgmem page but we could - * call dma_mark_clean() with hihgmem page here. However, we - * are fine since dma_mark_clean() is null on POWERPC. We can - * make dma_mark_clean() take a physical address if necessary. - */ - dma_mark_clean(phys_to_virt(paddr), size); -} - -void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - unmap_single(hwdev, dev_addr, size, dir, attrs); -} - -/* - * Make physical memory consistent for a single streaming mode DMA translation - * after a transfer. - * - * If you perform a swiotlb_map_page() but wish to interrogate the buffer - * using the cpu, yet do not wish to teardown the dma mapping, you must - * call this function before doing so. At the next point you give the dma - * address back to the card, you must first perform a - * swiotlb_dma_sync_for_device, and then the device again owns the buffer - */ -static void -swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - enum dma_sync_target target) -{ - phys_addr_t paddr = dma_to_phys(hwdev, dev_addr); - - BUG_ON(dir == DMA_NONE); - - if (is_swiotlb_buffer(paddr)) { - swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target); - return; - } - - if (dir != DMA_FROM_DEVICE) - return; - - dma_mark_clean(phys_to_virt(paddr), size); -} - -void -swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir) -{ - swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU); -} - -void -swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir) -{ - swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE); -} - -/* - * Map a set of buffers described by scatterlist in streaming mode for DMA. - * This is the scatter-gather version of the above swiotlb_map_page - * interface. Here the scatter gather list elements are each tagged with the - * appropriate dma address and length. They are obtained via - * sg_dma_{address,length}(SG). - * - * NOTE: An implementation may be able to use a smaller number of - * DMA address/length pairs than there are SG table elements. - * (for example via virtual mapping capabilities) - * The routine returns the number of addr/length pairs actually - * used, at most nents. - * - * Device ownership issues as mentioned above for swiotlb_map_page are the - * same here. - */ -int -swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, - enum dma_data_direction dir, unsigned long attrs) -{ - struct scatterlist *sg; - int i; - - BUG_ON(dir == DMA_NONE); - - for_each_sg(sgl, sg, nelems, i) { - phys_addr_t paddr = sg_phys(sg); - dma_addr_t dev_addr = phys_to_dma(hwdev, paddr); - - if (swiotlb_force == SWIOTLB_FORCE || - !dma_capable(hwdev, dev_addr, sg->length)) { - phys_addr_t map = map_single(hwdev, sg_phys(sg), - sg->length, dir, attrs); - if (map == SWIOTLB_MAP_ERROR) { - /* Don't panic here, we expect map_sg users - to do proper error handling. */ - swiotlb_full(hwdev, sg->length, dir, 0); - attrs |= DMA_ATTR_SKIP_CPU_SYNC; - swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, - attrs); - sg_dma_len(sgl) = 0; - return 0; - } - sg->dma_address = __phys_to_dma(hwdev, map); - } else - sg->dma_address = dev_addr; - sg_dma_len(sg) = sg->length; - } - return nelems; -} - -/* - * Unmap a set of streaming mode DMA translations. Again, cpu read rules - * concerning calls here are the same as for swiotlb_unmap_page() above. - */ -void -swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, - int nelems, enum dma_data_direction dir, - unsigned long attrs) -{ - struct scatterlist *sg; - int i; - - BUG_ON(dir == DMA_NONE); - - for_each_sg(sgl, sg, nelems, i) - unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, - attrs); -} - -/* - * Make physical memory consistent for a set of streaming mode DMA translations - * after a transfer. - * - * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules - * and usage. - */ -static void -swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, - int nelems, enum dma_data_direction dir, - enum dma_sync_target target) -{ - struct scatterlist *sg; - int i; - - for_each_sg(sgl, sg, nelems, i) - swiotlb_sync_single(hwdev, sg->dma_address, - sg_dma_len(sg), dir, target); -} - -void -swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir) -{ - swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU); -} - -void -swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir) -{ - swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); -} - -int -swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) -{ - return (dma_addr == __phys_to_dma(hwdev, io_tlb_overflow_buffer)); -} - -/* - * Return whether the given device DMA address mask can be supported - * properly. For example, if your device can only drive the low 24-bits - * during bus mastering, then you would pass 0x00ffffff as the mask to - * this function. - */ -int -swiotlb_dma_supported(struct device *hwdev, u64 mask) -{ - return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask; -} - -void *swiotlb_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp, unsigned long attrs) -{ - void *vaddr; - - /* temporary workaround: */ - if (gfp & __GFP_NOWARN) - attrs |= DMA_ATTR_NO_WARN; - - /* - * Don't print a warning when the first allocation attempt fails. - * swiotlb_alloc_coherent() will print a warning when the DMA memory - * allocation ultimately failed. - */ - gfp |= __GFP_NOWARN; - - vaddr = dma_direct_alloc(dev, size, dma_handle, gfp, attrs); - if (!vaddr) - vaddr = swiotlb_alloc_buffer(dev, size, dma_handle, attrs); - return vaddr; -} - -void swiotlb_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr, unsigned long attrs) -{ - if (!swiotlb_free_buffer(dev, size, dma_addr)) - dma_direct_free(dev, size, vaddr, dma_addr, attrs); -} - -const struct dma_map_ops swiotlb_dma_ops = { - .mapping_error = swiotlb_dma_mapping_error, - .alloc = swiotlb_alloc, - .free = swiotlb_free, - .sync_single_for_cpu = swiotlb_sync_single_for_cpu, - .sync_single_for_device = swiotlb_sync_single_for_device, - .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, - .sync_sg_for_device = swiotlb_sync_sg_for_device, - .map_sg = swiotlb_map_sg_attrs, - .unmap_sg = swiotlb_unmap_sg_attrs, - .map_page = swiotlb_map_page, - .unmap_page = swiotlb_unmap_page, - .dma_supported = dma_direct_supported, -}; -- cgit v1.2.3 From a09c591306881dfb04387c6ee7b7e2e4683fa531 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 13 Jun 2018 13:17:26 +0200 Subject: ACPI / LPSS: Avoid PM quirks on suspend and resume from S3 It is reported that commit a192aa923b66a (ACPI / LPSS: Consolidate runtime PM and system sleep handling) introduced a system suspend regression on some machines, but the only functional change made by it was to cause the PM quirks in the LPSS to also be used during system suspend and resume. While that should always work for suspend-to-idle, it turns out to be problematic for S3 (suspend-to-RAM). To address that issue restore the previous S3 suspend and resume behavior of the LPSS to avoid applying PM quirks then. Fixes: a192aa923b66a (ACPI / LPSS: Consolidate runtime PM and system sleep handling) Link: https://bugs.launchpad.net/bugs/1774950 Reported-by: Kai-Heng Feng Tested-by: Kai-Heng Feng Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Reviewed-by: Andy Shevchenko Acked-by: Mika Westerberg Cc: 4.15+ # 4.15+ --- drivers/acpi/acpi_lpss.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c index cb6ac5c65c2e..55e4577b504c 100644 --- a/drivers/acpi/acpi_lpss.c +++ b/drivers/acpi/acpi_lpss.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "internal.h" @@ -944,9 +945,10 @@ static void lpss_iosf_exit_d3_state(void) mutex_unlock(&lpss_iosf_mutex); } -static int acpi_lpss_suspend(struct device *dev, bool wakeup) +static int acpi_lpss_suspend(struct device *dev, bool runtime) { struct lpss_private_data *pdata = acpi_driver_data(ACPI_COMPANION(dev)); + bool wakeup = runtime || device_may_wakeup(dev); int ret; if (pdata->dev_desc->flags & LPSS_SAVE_CTX) @@ -959,13 +961,14 @@ static int acpi_lpss_suspend(struct device *dev, bool wakeup) * wrong status for devices being about to be powered off. See * lpss_iosf_enter_d3_state() for further information. */ - if (lpss_quirks & LPSS_QUIRK_ALWAYS_POWER_ON && iosf_mbi_available()) + if ((runtime || !pm_suspend_via_firmware()) && + lpss_quirks & LPSS_QUIRK_ALWAYS_POWER_ON && iosf_mbi_available()) lpss_iosf_enter_d3_state(); return ret; } -static int acpi_lpss_resume(struct device *dev) +static int acpi_lpss_resume(struct device *dev, bool runtime) { struct lpss_private_data *pdata = acpi_driver_data(ACPI_COMPANION(dev)); int ret; @@ -974,7 +977,8 @@ static int acpi_lpss_resume(struct device *dev) * This call is kept first to be in symmetry with * acpi_lpss_runtime_suspend() one. */ - if (lpss_quirks & LPSS_QUIRK_ALWAYS_POWER_ON && iosf_mbi_available()) + if ((runtime || !pm_resume_via_firmware()) && + lpss_quirks & LPSS_QUIRK_ALWAYS_POWER_ON && iosf_mbi_available()) lpss_iosf_exit_d3_state(); ret = acpi_dev_resume(dev); @@ -998,12 +1002,12 @@ static int acpi_lpss_suspend_late(struct device *dev) return 0; ret = pm_generic_suspend_late(dev); - return ret ? ret : acpi_lpss_suspend(dev, device_may_wakeup(dev)); + return ret ? ret : acpi_lpss_suspend(dev, false); } static int acpi_lpss_resume_early(struct device *dev) { - int ret = acpi_lpss_resume(dev); + int ret = acpi_lpss_resume(dev, false); return ret ? ret : pm_generic_resume_early(dev); } @@ -1018,7 +1022,7 @@ static int acpi_lpss_runtime_suspend(struct device *dev) static int acpi_lpss_runtime_resume(struct device *dev) { - int ret = acpi_lpss_resume(dev); + int ret = acpi_lpss_resume(dev, true); return ret ? ret : pm_generic_runtime_resume(dev); } -- cgit v1.2.3 From 47e5abfb546a3ace23a77453dc2e9db92704c5ac Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 14 Jun 2018 10:01:52 +0200 Subject: PM / core: Fix supplier device runtime PM usage counter imbalance If a device link is added via device_link_add() by the driver of the link's consumer device, the supplier's runtime PM usage counter is going to be dropped by the pm_runtime_put_suppliers() call in driver_probe_device(). However, in that case it is not incremented unless the supplier driver is already present and the link is not stateless. That leads to a runtime PM usage counter imbalance for the supplier device in a few cases. To prevent that from happening, bump up the supplier runtime PM usage counter in device_link_add() for all links with the DL_FLAG_PM_RUNTIME flag set that are added at the consumer probe time. Use pm_runtime_get_noresume() for that as the callers of device_link_add() who want the supplier to be resumed by it are expected to pass DL_FLAG_RPM_ACTIVE in flags to it anyway, but additionally resume the supplier if the link is added during consumer driver probe to retain the existing behavior for the callers depending on it. Fixes: 21d5c57b3726 (PM / runtime: Use device links) Reported-by: Ulf Hansson Reviewed-by: Ulf Hansson Tested-by: Marek Szyprowski Cc: 4.10+ # 4.10+ Signed-off-by: Rafael J. Wysocki --- drivers/base/core.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index 36622b52e419..df3e1a44707a 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -236,6 +236,13 @@ struct device_link *device_link_add(struct device *consumer, link->rpm_active = true; } pm_runtime_new_link(consumer); + /* + * If the link is being added by the consumer driver at probe + * time, balance the decrementation of the supplier's runtime PM + * usage counter after consumer probe in driver_probe_device(). + */ + if (consumer->links.status == DL_DEV_PROBING) + pm_runtime_get_noresume(supplier); } get_device(supplier); link->supplier = supplier; @@ -255,12 +262,12 @@ struct device_link *device_link_add(struct device *consumer, switch (consumer->links.status) { case DL_DEV_PROBING: /* - * Balance the decrementation of the supplier's - * runtime PM usage counter after consumer probe - * in driver_probe_device(). + * Some callers expect the link creation during + * consumer driver probe to resume the supplier + * even without DL_FLAG_RPM_ACTIVE. */ if (flags & DL_FLAG_PM_RUNTIME) - pm_runtime_get_sync(supplier); + pm_runtime_resume(supplier); link->status = DL_STATE_CONSUMER_PROBE; break; -- cgit v1.2.3 From d5681f59ee3d4a2e60b9234e94c163cbbf559d0a Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Thu, 14 Jun 2018 09:39:17 -0400 Subject: NFS: Fix an rcu deadlock in nfs_delegation_find_inode() I was able to reproduce this pretty regularily using xfstests generic/013 on NFS v4.0. Reported-by: Ross Zwisler Fixes: 6c342655022d (NFSv4: Return NFS4ERR_DELAY when a delegation recall fails due to igrab()) Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index bbd0465535eb..f033f3a69a3b 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -883,8 +883,10 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, rcu_read_lock(); list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { res = nfs_delegation_find_inode_server(server, fhandle); - if (res != ERR_PTR(-ENOENT)) + if (res != ERR_PTR(-ENOENT)) { + rcu_read_unlock(); return res; + } } rcu_read_unlock(); return ERR_PTR(-ENOENT); -- cgit v1.2.3 From 07480cbc05ef1ff7301cb11afb7d894ad3d0916a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 14 Jun 2018 11:06:55 -0700 Subject: tools: bpftool: improve accuracy of load time BPF program load time is reported from the kernel relative to boot time. If conversion to wall clock does not take nanosecond parts into account, the load time reported by bpftool may differ by one second from run to run. This means JSON object reported by bpftool for a program will randomly change. Fixes: 71bb428fe2c1 ("tools: bpf: add bpftool") Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/prog.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index a4f435203fef..05f42a46d6ed 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -90,7 +90,9 @@ static void print_boot_time(__u64 nsecs, char *buf, unsigned int size) } wallclock_secs = (real_time_ts.tv_sec - boot_time_ts.tv_sec) + - nsecs / 1000000000; + (real_time_ts.tv_nsec - boot_time_ts.tv_nsec + nsecs) / + 1000000000; + if (!localtime_r(&wallclock_secs, &load_tm)) { snprintf(buf, size, "%llu", nsecs / 1000000000); -- cgit v1.2.3 From 47cf52a246e526e2092d60ac01c54af9bd45dcc9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 14 Jun 2018 11:06:56 -0700 Subject: selftests/bpf: test offloads even with BPF programs present Modern distroes increasingly make use of BPF programs. Default Ubuntu 18.04 installation boots with a number of cgroup_skb programs loaded. test_offloads.py tries to check if programs and maps are not leaked on error paths by confirming the list of programs on the system is empty between tests. Since we can no longer expect the system to have no BPF objects at boot try to remember the programs and maps present at the start, and skip those when scanning the system. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_offload.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py index e78aad0a68bb..be800d0e7a84 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/bpf/test_offload.py @@ -163,6 +163,10 @@ def bpftool(args, JSON=True, ns="", fail=True): def bpftool_prog_list(expected=None, ns=""): _, progs = bpftool("prog show", JSON=True, ns=ns, fail=True) + # Remove the base progs + for p in base_progs: + if p in progs: + progs.remove(p) if expected is not None: if len(progs) != expected: fail(True, "%d BPF programs loaded, expected %d" % @@ -171,6 +175,10 @@ def bpftool_prog_list(expected=None, ns=""): def bpftool_map_list(expected=None, ns=""): _, maps = bpftool("map show", JSON=True, ns=ns, fail=True) + # Remove the base maps + for m in base_maps: + if m in maps: + maps.remove(m) if expected is not None: if len(maps) != expected: fail(True, "%d BPF maps loaded, expected %d" % @@ -585,8 +593,8 @@ skip(os.getuid() != 0, "test must be run as root") # Check tools ret, progs = bpftool("prog", fail=False) skip(ret != 0, "bpftool not installed") -# Check no BPF programs are loaded -skip(len(progs) != 0, "BPF programs already loaded on the system") +base_progs = progs +_, base_maps = bpftool("map") # Check netdevsim ret, out = cmd("modprobe netdevsim", fail=False) -- cgit v1.2.3 From 36ffdbc0a2d9f7280e49dbe5ea53c289ad112a8c Mon Sep 17 00:00:00 2001 From: Jian Wang Date: Fri, 15 Jun 2018 03:22:17 +0200 Subject: bpf, selftest: check tunnel type more accurately Grep tunnel type directly to make sure 'ip' command supports it. Signed-off-by: Jian Wang Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_tunnel.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh index aeb2901f21f4..c4b5fbbaa760 100755 --- a/tools/testing/selftests/bpf/test_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tunnel.sh @@ -668,7 +668,7 @@ cleanup_exit() check() { - ip link help $1 2>&1 | grep -q "^Usage:" + ip link help 2>&1 | grep -q "\s$1\s" if [ $? -ne 0 ];then echo "SKIP $1: iproute2 not support" cleanup -- cgit v1.2.3 From 26bf8a89d887c0686acef0f44eaadd49abfcab03 Mon Sep 17 00:00:00 2001 From: William Tu Date: Thu, 14 Jun 2018 05:01:06 -0700 Subject: bpf, selftests: delete xfrm tunnel when test exits. Make the printting of bpf xfrm tunnel better and cleanup xfrm state and policy when xfrm test finishes. Signed-off-by: William Tu Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_tunnel.sh | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh index c4b5fbbaa760..546aee3e9fb4 100755 --- a/tools/testing/selftests/bpf/test_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tunnel.sh @@ -608,28 +608,26 @@ setup_xfrm_tunnel() test_xfrm_tunnel() { config_device - #tcpdump -nei veth1 ip & - output=$(mktemp) - cat /sys/kernel/debug/tracing/trace_pipe | tee $output & - setup_xfrm_tunnel + > /sys/kernel/debug/tracing/trace + setup_xfrm_tunnel tc qdisc add dev veth1 clsact tc filter add dev veth1 proto ip ingress bpf da obj test_tunnel_kern.o \ sec xfrm_get_state ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 sleep 1 - grep "reqid 1" $output + grep "reqid 1" /sys/kernel/debug/tracing/trace check_err $? - grep "spi 0x1" $output + grep "spi 0x1" /sys/kernel/debug/tracing/trace check_err $? - grep "remote ip 0xac100164" $output + grep "remote ip 0xac100164" /sys/kernel/debug/tracing/trace check_err $? cleanup if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: xfrm tunnel"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: xfrm tunnel"${NC} + echo -e ${RED}"FAIL: xfrm tunnel"${NC} + return 1 + fi + echo -e ${GREEN}"PASS: xfrm tunnel"${NC} } attach_bpf() @@ -657,6 +655,10 @@ cleanup() ip link del ip6geneve11 2> /dev/null ip link del erspan11 2> /dev/null ip link del ip6erspan11 2> /dev/null + ip xfrm policy delete dir out src 10.1.1.200/32 dst 10.1.1.100/32 2> /dev/null + ip xfrm policy delete dir in src 10.1.1.100/32 dst 10.1.1.200/32 2> /dev/null + ip xfrm state delete src 172.16.1.100 dst 172.16.1.200 proto esp spi 0x1 2> /dev/null + ip xfrm state delete src 172.16.1.200 dst 172.16.1.100 proto esp spi 0x2 2> /dev/null } cleanup_exit() -- cgit v1.2.3 From 2fbb56446fde14a80790de9b182ae6f7c36a039a Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 12 Jun 2018 12:11:31 -0500 Subject: smb3: note that smb3.11 posix extensions mount option is experimental Signed-off-by: Steve French --- fs/cifs/connect.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 96645a7d8f27..267c6f70cf98 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3029,8 +3029,11 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) #ifdef CONFIG_CIFS_SMB311 if ((volume_info->linux_ext) && (ses->server->posix_ext_supported)) { - if (ses->server->vals->protocol_id == SMB311_PROT_ID) + if (ses->server->vals->protocol_id == SMB311_PROT_ID) { tcon->posix_extensions = true; + printk_once(KERN_WARNING + "SMB3.11 POSIX Extensions are experimental\n"); + } } #endif /* 311 */ -- cgit v1.2.3 From a93864d93977b99bda6c348a09b90a3d7ef8db3a Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Thu, 14 Jun 2018 06:48:35 +1000 Subject: cifs: add lease tracking to the cached root fid Use a read lease for the cached root fid so that we can detect when the content of the directory changes (via a break) at which time we close the handle. On next access to the root the handle will be reopened and cached again. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 12 +++++++++--- fs/cifs/cifsproto.h | 1 + fs/cifs/cifssmb.c | 8 ++++---- fs/cifs/misc.c | 7 ++++--- fs/cifs/smb2misc.c | 16 +++++++++++++++- fs/cifs/smb2ops.c | 34 +++++++++++++++++++++++++--------- 6 files changed, 58 insertions(+), 20 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 1efa2e65bc1a..ff71fbd619bf 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -883,6 +883,14 @@ cap_unix(struct cifs_ses *ses) return ses->server->vals->cap_unix & ses->capabilities; } +struct cached_fid { + bool is_valid:1; /* Do we have a useable root fid */ + struct cifs_fid *fid; + struct mutex fid_mutex; + struct cifs_tcon *tcon; + struct work_struct lease_break; +}; + /* * there is one of these for each connection to a resource on a particular * session @@ -987,9 +995,7 @@ struct cifs_tcon { struct fscache_cookie *fscache; /* cookie for share */ #endif struct list_head pending_opens; /* list of incomplete opens */ - bool valid_root_fid:1; /* Do we have a useable root fid */ - struct mutex prfid_mutex; /* prevents reopen race after dead ses*/ - struct cifs_fid *prfid; /* handle to the directory at top of share */ + struct cached_fid crfid; /* Cached root fid */ /* BB add field for back pointer to sb struct(s)? */ }; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 4e0d183c3d10..3a13b44069fe 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -552,6 +552,7 @@ enum securityEnum cifs_select_sectype(struct TCP_Server_Info *, struct cifs_aio_ctx *cifs_aio_ctx_alloc(void); void cifs_aio_ctx_release(struct kref *refcount); int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw); +void smb2_cached_lease_break(struct work_struct *work); int cifs_alloc_hash(const char *name, struct crypto_shash **shash, struct sdesc **sdesc); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 42329b25877d..d352da325de3 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -107,10 +107,10 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) } spin_unlock(&tcon->open_file_lock); - mutex_lock(&tcon->prfid_mutex); - tcon->valid_root_fid = false; - memset(tcon->prfid, 0, sizeof(struct cifs_fid)); - mutex_unlock(&tcon->prfid_mutex); + mutex_lock(&tcon->crfid.fid_mutex); + tcon->crfid.is_valid = false; + memset(tcon->crfid.fid, 0, sizeof(struct cifs_fid)); + mutex_unlock(&tcon->crfid.fid_mutex); /* * BB Add call to invalidate_inodes(sb) for all superblocks mounted diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index af29ade195c0..bb40f3c050f6 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -117,8 +117,9 @@ tconInfoAlloc(void) INIT_LIST_HEAD(&ret_buf->openFileList); INIT_LIST_HEAD(&ret_buf->tcon_list); spin_lock_init(&ret_buf->open_file_lock); - mutex_init(&ret_buf->prfid_mutex); - ret_buf->prfid = kzalloc(sizeof(struct cifs_fid), GFP_KERNEL); + mutex_init(&ret_buf->crfid.fid_mutex); + ret_buf->crfid.fid = kzalloc(sizeof(struct cifs_fid), + GFP_KERNEL); #ifdef CONFIG_CIFS_STATS spin_lock_init(&ret_buf->stat_lock); #endif @@ -136,7 +137,7 @@ tconInfoFree(struct cifs_tcon *buf_to_free) atomic_dec(&tconInfoAllocCount); kfree(buf_to_free->nativeFileSystem); kzfree(buf_to_free->password); - kfree(buf_to_free->prfid); + kfree(buf_to_free->crfid.fid); kfree(buf_to_free); } diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index e2bec47c6845..0de87ca33e2e 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -492,10 +492,11 @@ cifs_ses_oplock_break(struct work_struct *work) { struct smb2_lease_break_work *lw = container_of(work, struct smb2_lease_break_work, lease_break); - int rc; + int rc = 0; rc = SMB2_lease_break(0, tlink_tcon(lw->tlink), lw->lease_key, lw->lease_state); + cifs_dbg(FYI, "Lease release rc %d\n", rc); cifs_put_tlink(lw->tlink); kfree(lw); @@ -561,6 +562,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, open->oplock = lease_state; } + return found; } @@ -603,6 +605,18 @@ smb2_is_valid_lease_break(char *buffer) return true; } spin_unlock(&tcon->open_file_lock); + + if (tcon->crfid.is_valid && + !memcmp(rsp->LeaseKey, + tcon->crfid.fid->lease_key, + SMB2_LEASE_KEY_SIZE)) { + INIT_WORK(&tcon->crfid.lease_break, + smb2_cached_lease_break); + queue_work(cifsiod_wq, + &tcon->crfid.lease_break); + spin_unlock(&cifs_tcp_ses_lock); + return true; + } } } } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index b15f5957d645..682bcfa246be 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -323,6 +323,21 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) } #endif /* STATS2 */ +void +smb2_cached_lease_break(struct work_struct *work) +{ + struct cached_fid *cfid = container_of(work, + struct cached_fid, lease_break); + mutex_lock(&cfid->fid_mutex); + if (cfid->is_valid) { + cifs_dbg(FYI, "clear cached root file handle\n"); + SMB2_close(0, cfid->tcon, cfid->fid->persistent_fid, + cfid->fid->volatile_fid); + cfid->is_valid = false; + } + mutex_unlock(&cfid->fid_mutex); +} + /* * Open the directory at the root of a share */ @@ -331,13 +346,13 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) struct cifs_open_parms oparams; int rc; __le16 srch_path = 0; /* Null - since an open of top of share */ - u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + u8 oplock = SMB2_OPLOCK_LEVEL_II; - mutex_lock(&tcon->prfid_mutex); - if (tcon->valid_root_fid) { + mutex_lock(&tcon->crfid.fid_mutex); + if (tcon->crfid.is_valid) { cifs_dbg(FYI, "found a cached root file handle\n"); - memcpy(pfid, tcon->prfid, sizeof(struct cifs_fid)); - mutex_unlock(&tcon->prfid_mutex); + memcpy(pfid, tcon->crfid.fid, sizeof(struct cifs_fid)); + mutex_unlock(&tcon->crfid.fid_mutex); return 0; } @@ -350,10 +365,11 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) rc = SMB2_open(xid, &oparams, &srch_path, &oplock, NULL, NULL, NULL); if (rc == 0) { - memcpy(tcon->prfid, pfid, sizeof(struct cifs_fid)); - tcon->valid_root_fid = true; + memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid)); + tcon->crfid.tcon = tcon; + tcon->crfid.is_valid = true; } - mutex_unlock(&tcon->prfid_mutex); + mutex_unlock(&tcon->crfid.fid_mutex); return rc; } @@ -436,7 +452,7 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_fid fid; - if ((*full_path == 0) && tcon->valid_root_fid) + if ((*full_path == 0) && tcon->crfid.is_valid) return 0; utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); -- cgit v1.2.3 From 290c3982f66ab750e85863efcb1fdd736985e5d2 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 13 Jun 2018 16:46:56 -0500 Subject: cifs: minor documentation updates Various minor cifs/smb3 documentation updates Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- Documentation/filesystems/cifs/AUTHORS | 7 ++++--- Documentation/filesystems/cifs/CHANGES | 3 +++ Documentation/filesystems/cifs/TODO | 17 +++++++++-------- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/Documentation/filesystems/cifs/AUTHORS b/Documentation/filesystems/cifs/AUTHORS index 9f4f87e16240..75865da2ce14 100644 --- a/Documentation/filesystems/cifs/AUTHORS +++ b/Documentation/filesystems/cifs/AUTHORS @@ -42,9 +42,11 @@ Jeff Layton (many, many fixes, as well as great work on the cifs Kerberos code) Scott Lovenberg Pavel Shilovsky (for great work adding SMB2 support, and various SMB3 features) Aurelien Aptel (for DFS SMB3 work and some key bug fixes) -Ronnie Sahlberg (for SMB3 xattr work and bug fixes) +Ronnie Sahlberg (for SMB3 xattr work, bug fixes, and lots of great work on compounding) Shirish Pargaonkar (for many ACL patches over the years) Sachin Prabhu (many bug fixes, including for reconnect, copy offload and security) +Paulo Alcantara +Long Li (some great work on RDMA, SMB Direct) Test case and Bug Report contributors @@ -58,5 +60,4 @@ mention to the Stanford Checker (SWAT) which pointed out many minor bugs in error paths. Valuable suggestions also have come from Al Viro and Dave Miller. -And thanks to the IBM LTC and Power test teams and SuSE testers for -finding multiple bugs during excellent stress test runs. +And thanks to the IBM LTC and Power test teams and SuSE and Citrix and RedHat testers for finding multiple bugs during excellent stress test runs. diff --git a/Documentation/filesystems/cifs/CHANGES b/Documentation/filesystems/cifs/CHANGES index bc0025cdd1c9..455e1cc494a9 100644 --- a/Documentation/filesystems/cifs/CHANGES +++ b/Documentation/filesystems/cifs/CHANGES @@ -1,3 +1,6 @@ +See https://wiki.samba.org/index.php/LinuxCIFSKernel for +more current information. + Version 1.62 ------------ Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened diff --git a/Documentation/filesystems/cifs/TODO b/Documentation/filesystems/cifs/TODO index c5adf149b57f..852499aed64b 100644 --- a/Documentation/filesystems/cifs/TODO +++ b/Documentation/filesystems/cifs/TODO @@ -9,14 +9,14 @@ is a partial list of the known problems and missing features: a) SMB3 (and SMB3.02) missing optional features: - multichannel (started), integration with RDMA - - directory leases (improved metadata caching) - - T10 copy offload (copy chunk, and "Duplicate Extents" ioctl + - directory leases (improved metadata caching), started (root dir only) + - T10 copy offload ie "ODX" (copy chunk, and "Duplicate Extents" ioctl currently the only two server side copy mechanisms supported) b) improved sparse file support c) Directory entry caching relies on a 1 second timer, rather than -using Directory Leases +using Directory Leases, currently only the root file handle is cached longer d) quota support (needs minor kernel change since quota calls to make it to network filesystems or deviceless filesystems) @@ -42,6 +42,8 @@ mount or a per server basis to client UIDs or nobody if no mapping exists. Also better integration with winbind for resolving SID owners k) Add tools to take advantage of more smb3 specific ioctls and features +(passthrough ioctl/fsctl for sending various SMB3 fsctls to the server +is in progress) l) encrypted file support @@ -71,9 +73,8 @@ t) split cifs and smb3 support into separate modules so legacy (and less secure) CIFS dialect can be disabled in environments that don't need it and simplify the code. -u) Finish up SMB3.1.1 dialect support - -v) POSIX Extensions for SMB3.1.1 +v) POSIX Extensions for SMB3.1.1 (started, create and mkdir support added +so far). KNOWN BUGS ==================================== @@ -92,8 +93,8 @@ Misc testing to do 1) check out max path names and max path name components against various server types. Try nested symlinks (8 deep). Return max path name in stat -f information -2) Improve xfstest's cifs enablement and adapt xfstests where needed to test -cifs better +2) Improve xfstest's cifs/smb3 enablement and adapt xfstests where needed to test +cifs/smb3 better 3) Additional performance testing and optimization using iozone and similar - there are some easy changes that can be done to parallelize sequential writes, -- cgit v1.2.3 From d409014e4feeab486fb36b350abfc4c94de8be37 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 13 Jun 2018 17:05:58 -0500 Subject: smb3: increase initial number of credits requested to allow write Compared to other clients the Linux smb3 client ramps up credits very slowly, taking more than 128 operations before a maximum size write could be sent (since the number of credits requested is only 2 per small operation, causing the credit limit to grow very slowly). This lack of credits initially would impact large i/o performance, when large i/o is tried early before enough credits are built up. Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- fs/cifs/smb2pdu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index af032e1a3eac..328e23abd241 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -990,8 +990,9 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) req->PreviousSessionId = sess_data->previous_session; req->Flags = 0; /* MBZ */ - /* to enable echos and oplocks */ - req->sync_hdr.CreditRequest = cpu_to_le16(3); + + /* enough to enable echos and oplocks and one max size write */ + req->sync_hdr.CreditRequest = cpu_to_le16(130); /* only one of SMB2 signing flags may be set in SMB2 request */ if (server->sign) -- cgit v1.2.3 From c713c8770fa5bfbeaac088cc7b959c7a6ba79f93 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 12 Jun 2018 08:00:58 +1000 Subject: cifs: push rfc1002 generation down the stack Move the generation of the 4 byte length field down the stack and generate it immediately before we start writing the data to the socket. Signed-off-by: Ronnie Sahlberg Signed-off-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/cifsencrypt.c | 23 +++++++++++--- fs/cifs/cifsproto.h | 2 +- fs/cifs/smb2ops.c | 71 +++++++++++++++++------------------------- fs/cifs/smb2pdu.c | 38 ++++++++--------------- fs/cifs/smb2transport.c | 18 +++++------ fs/cifs/transport.c | 82 +++++++++++++++++++------------------------------ 6 files changed, 99 insertions(+), 135 deletions(-) diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 937251cc61c0..f23ff848b158 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -37,7 +37,6 @@ #include int __cifs_calc_signature(struct smb_rqst *rqst, - int start, struct TCP_Server_Info *server, char *signature, struct shash_desc *shash) { @@ -45,16 +44,30 @@ int __cifs_calc_signature(struct smb_rqst *rqst, int rc; struct kvec *iov = rqst->rq_iov; int n_vec = rqst->rq_nvec; + int is_smb2 = server->vals->header_preamble_size == 0; - for (i = start; i < n_vec; i++) { + /* iov[0] is actual data and not the rfc1002 length for SMB2+ */ + if (is_smb2) { + rc = crypto_shash_update(shash, + iov[0].iov_base, iov[0].iov_len); + } else { + if (n_vec < 2 || iov[0].iov_len != 4) + return -EIO; + } + + for (i = 1; i < n_vec; i++) { if (iov[i].iov_len == 0) continue; if (iov[i].iov_base == NULL) { cifs_dbg(VFS, "null iovec entry\n"); return -EIO; } - if (i == 1 && iov[1].iov_len <= 4) - break; /* nothing to sign or corrupt header */ + if (is_smb2) { + if (i == 0 && iov[0].iov_len <= 4) + break; /* nothing to sign or corrupt header */ + } else + if (i == 1 && iov[1].iov_len <= 4) + break; /* nothing to sign or corrupt header */ rc = crypto_shash_update(shash, iov[i].iov_base, iov[i].iov_len); if (rc) { @@ -118,7 +131,7 @@ static int cifs_calc_signature(struct smb_rqst *rqst, return rc; } - return __cifs_calc_signature(rqst, 1, server, signature, + return __cifs_calc_signature(rqst, server, signature, &server->secmech.sdescmd5->shash); } diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 3a13b44069fe..4f9218281ff3 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -544,7 +544,7 @@ int cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const unsigned char *path, char *pbuf, unsigned int *pbytes_written); -int __cifs_calc_signature(struct smb_rqst *rqst, int start, +int __cifs_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, char *signature, struct shash_desc *shash); enum securityEnum cifs_select_sectype(struct TCP_Server_Info *, diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 682bcfa246be..9153407f97e8 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -2167,7 +2167,7 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, struct smb_rqst *old_rq) { struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)old_rq->rq_iov[1].iov_base; + (struct smb2_sync_hdr *)old_rq->rq_iov[0].iov_base; memset(tr_hdr, 0, sizeof(struct smb2_transform_hdr)); tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM; @@ -2187,14 +2187,13 @@ static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf, } /* Assumes: - * rqst->rq_iov[0] is rfc1002 length - * rqst->rq_iov[1] is tranform header - * rqst->rq_iov[2+] data to be encrypted/decrypted + * rqst->rq_iov[0] is tranform header + * rqst->rq_iov[1+] data to be encrypted/decrypted */ static struct scatterlist * init_sg(struct smb_rqst *rqst, u8 *sign) { - unsigned int sg_len = rqst->rq_nvec + rqst->rq_npages; + unsigned int sg_len = rqst->rq_nvec + rqst->rq_npages + 1; unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20; struct scatterlist *sg; unsigned int i; @@ -2205,10 +2204,10 @@ init_sg(struct smb_rqst *rqst, u8 *sign) return NULL; sg_init_table(sg, sg_len); - smb2_sg_set_buf(&sg[0], rqst->rq_iov[1].iov_base + 20, assoc_data_len); - for (i = 1; i < rqst->rq_nvec - 1; i++) - smb2_sg_set_buf(&sg[i], rqst->rq_iov[i+1].iov_base, - rqst->rq_iov[i+1].iov_len); + smb2_sg_set_buf(&sg[0], rqst->rq_iov[0].iov_base + 20, assoc_data_len); + for (i = 1; i < rqst->rq_nvec; i++) + smb2_sg_set_buf(&sg[i], rqst->rq_iov[i].iov_base, + rqst->rq_iov[i].iov_len); for (j = 0; i < sg_len - 1; i++, j++) { unsigned int len, offset; @@ -2240,11 +2239,10 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) return 1; } /* - * Encrypt or decrypt @rqst message. @rqst has the following format: - * iov[0] - rfc1002 length - * iov[1] - transform header (associate data), - * iov[2-N] and pages - data to encrypt. - * On success return encrypted data in iov[2-N] and pages, leave iov[0-1] + * Encrypt or decrypt @rqst message. @rqst[0] has the following format: + * iov[0] - transform header (associate data), + * iov[1-N] - SMB2 header and pages - data to encrypt. + * On success return encrypted data in iov[1-N] and pages, leave iov[0] * untouched. */ static int @@ -2339,10 +2337,6 @@ free_req: return rc; } -/* - * This is called from smb_send_rqst. At this point we have the rfc1002 - * header as the first element in the vector. - */ static int smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq, struct smb_rqst *old_rq) @@ -2351,7 +2345,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq, struct page **pages; struct smb2_transform_hdr *tr_hdr; unsigned int npages = old_rq->rq_npages; - unsigned int orig_len = get_rfc1002_length(old_rq->rq_iov[0].iov_base); + unsigned int orig_len = 0; int i; int rc = -ENOMEM; @@ -2365,24 +2359,23 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq, new_rq->rq_pagesz = old_rq->rq_pagesz; new_rq->rq_tailsz = old_rq->rq_tailsz; + for (i = 0; i < old_rq->rq_nvec; i++) + orig_len += old_rq->rq_iov[i].iov_len; + for (i = 0; i < npages; i++) { pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); if (!pages[i]) goto err_free_pages; } - /* Make space for one extra iov to hold the transform header */ iov = kmalloc_array(old_rq->rq_nvec + 1, sizeof(struct kvec), GFP_KERNEL); if (!iov) goto err_free_pages; - /* copy all iovs from the old except the 1st one (rfc1002 length) */ - memcpy(&iov[2], &old_rq->rq_iov[1], - sizeof(struct kvec) * (old_rq->rq_nvec - 1)); - /* copy the rfc1002 iov */ - iov[0].iov_base = old_rq->rq_iov[0].iov_base; - iov[0].iov_len = old_rq->rq_iov[0].iov_len; + /* copy all iovs from the old */ + memcpy(&iov[1], &old_rq->rq_iov[0], + sizeof(struct kvec) * old_rq->rq_nvec); new_rq->rq_iov = iov; new_rq->rq_nvec = old_rq->rq_nvec + 1; @@ -2393,12 +2386,8 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq, /* fill the 2nd iov with a transform header */ fill_transform_hdr(tr_hdr, orig_len, old_rq); - new_rq->rq_iov[1].iov_base = tr_hdr; - new_rq->rq_iov[1].iov_len = sizeof(struct smb2_transform_hdr); - - /* Update rfc1002 header */ - inc_rfc1001_len(new_rq->rq_iov[0].iov_base, - sizeof(struct smb2_transform_hdr)); + new_rq->rq_iov[0].iov_base = tr_hdr; + new_rq->rq_iov[0].iov_len = sizeof(struct smb2_transform_hdr); /* copy pages form the old */ for (i = 0; i < npages; i++) { @@ -2442,7 +2431,7 @@ smb3_free_transform_rq(struct smb_rqst *rqst) put_page(rqst->rq_pages[i]); kfree(rqst->rq_pages); /* free transform header */ - kfree(rqst->rq_iov[1].iov_base); + kfree(rqst->rq_iov[0].iov_base); kfree(rqst->rq_iov); } @@ -2459,19 +2448,17 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, unsigned int buf_data_size, struct page **pages, unsigned int npages, unsigned int page_data_size) { - struct kvec iov[3]; + struct kvec iov[2]; struct smb_rqst rqst = {NULL}; int rc; - iov[0].iov_base = NULL; - iov[0].iov_len = 0; - iov[1].iov_base = buf; - iov[1].iov_len = sizeof(struct smb2_transform_hdr); - iov[2].iov_base = buf + sizeof(struct smb2_transform_hdr); - iov[2].iov_len = buf_data_size; + iov[0].iov_base = buf; + iov[0].iov_len = sizeof(struct smb2_transform_hdr); + iov[1].iov_base = buf + sizeof(struct smb2_transform_hdr); + iov[1].iov_len = buf_data_size; rqst.rq_iov = iov; - rqst.rq_nvec = 3; + rqst.rq_nvec = 2; rqst.rq_pages = pages; rqst.rq_npages = npages; rqst.rq_pagesz = PAGE_SIZE; @@ -2483,7 +2470,7 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, if (rc) return rc; - memmove(buf, iov[2].iov_base, buf_data_size); + memmove(buf, iov[1].iov_base, buf_data_size); server->total_read = buf_data_size + page_data_size; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 328e23abd241..c48608c5a0fb 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2595,11 +2595,10 @@ SMB2_echo(struct TCP_Server_Info *server) { struct smb2_echo_req *req; int rc = 0; - struct kvec iov[2]; + struct kvec iov[1]; struct smb_rqst rqst = { .rq_iov = iov, - .rq_nvec = 2 }; + .rq_nvec = 1 }; unsigned int total_len; - __be32 rfc1002_marker; cifs_dbg(FYI, "In echo request\n"); @@ -2615,11 +2614,8 @@ SMB2_echo(struct TCP_Server_Info *server) req->sync_hdr.CreditRequest = cpu_to_le16(1); - iov[0].iov_len = 4; - rfc1002_marker = cpu_to_be32(total_len); - iov[0].iov_base = &rfc1002_marker; - iov[1].iov_len = total_len; - iov[1].iov_base = (char *)req; + iov[0].iov_len = total_len; + iov[0].iov_base = (char *)req; rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, NULL, server, CIFS_ECHO_OP); @@ -2849,10 +2845,9 @@ smb2_async_readv(struct cifs_readdata *rdata) struct smb2_sync_hdr *shdr; struct cifs_io_parms io_parms; struct smb_rqst rqst = { .rq_iov = rdata->iov, - .rq_nvec = 2 }; + .rq_nvec = 1 }; struct TCP_Server_Info *server; unsigned int total_len; - __be32 req_len; cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", __func__, rdata->offset, rdata->bytes); @@ -2883,12 +2878,8 @@ smb2_async_readv(struct cifs_readdata *rdata) if (smb3_encryption_required(io_parms.tcon)) flags |= CIFS_TRANSFORM_REQ; - req_len = cpu_to_be32(total_len); - - rdata->iov[0].iov_base = &req_len; - rdata->iov[0].iov_len = sizeof(__be32); - rdata->iov[1].iov_base = buf; - rdata->iov[1].iov_len = total_len; + rdata->iov[0].iov_base = buf; + rdata->iov[0].iov_len = total_len; shdr = (struct smb2_sync_hdr *)buf; @@ -3063,10 +3054,9 @@ smb2_async_writev(struct cifs_writedata *wdata, struct smb2_sync_hdr *shdr; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; - struct kvec iov[2]; + struct kvec iov[1]; struct smb_rqst rqst = { }; unsigned int total_len; - __be32 rfc1002_marker; rc = smb2_plain_req_init(SMB2_WRITE, tcon, (void **) &req, &total_len); if (rc) { @@ -3138,15 +3128,11 @@ smb2_async_writev(struct cifs_writedata *wdata, v1->length = cpu_to_le32(wdata->mr->mr->length); } #endif - /* 4 for rfc1002 length field and 1 for Buffer */ - iov[0].iov_len = 4; - rfc1002_marker = cpu_to_be32(total_len - 1 + wdata->bytes); - iov[0].iov_base = &rfc1002_marker; - iov[1].iov_len = total_len - 1; - iov[1].iov_base = (char *)req; + iov[0].iov_len = total_len - 1; + iov[0].iov_base = (char *)req; rqst.rq_iov = iov; - rqst.rq_nvec = 2; + rqst.rq_nvec = 1; rqst.rq_pages = wdata->pages; rqst.rq_offset = wdata->page_offset; rqst.rq_npages = wdata->nr_pages; @@ -3154,7 +3140,7 @@ smb2_async_writev(struct cifs_writedata *wdata, rqst.rq_tailsz = wdata->tailsz; #ifdef CONFIG_CIFS_SMB_DIRECT if (wdata->mr) { - iov[1].iov_len += sizeof(struct smbd_buffer_descriptor_v1); + iov[0].iov_len += sizeof(struct smbd_buffer_descriptor_v1); rqst.rq_npages = 0; } #endif diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 349d5ccf854c..51b9437c3c7b 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -171,9 +171,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) unsigned char smb2_signature[SMB2_HMACSHA256_SIZE]; unsigned char *sigptr = smb2_signature; struct kvec *iov = rqst->rq_iov; - int iov_hdr_index = rqst->rq_nvec > 1 ? 1 : 0; - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)iov[iov_hdr_index].iov_base; + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base; struct cifs_ses *ses; ses = smb2_find_smb_ses(server, shdr->SessionId); @@ -204,7 +202,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) return rc; } - rc = __cifs_calc_signature(rqst, iov_hdr_index, server, sigptr, + rc = __cifs_calc_signature(rqst, server, sigptr, &server->secmech.sdeschmacsha256->shash); if (!rc) @@ -414,9 +412,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) unsigned char smb3_signature[SMB2_CMACAES_SIZE]; unsigned char *sigptr = smb3_signature; struct kvec *iov = rqst->rq_iov; - int iov_hdr_index = rqst->rq_nvec > 1 ? 1 : 0; - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)iov[iov_hdr_index].iov_base; + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base; struct cifs_ses *ses; ses = smb2_find_smb_ses(server, shdr->SessionId); @@ -447,7 +443,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) return rc; } - rc = __cifs_calc_signature(rqst, iov_hdr_index, server, sigptr, + rc = __cifs_calc_signature(rqst, server, sigptr, &server->secmech.sdesccmacaes->shash); if (!rc) @@ -462,7 +458,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) { int rc = 0; struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base; + (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; if (!(shdr->Flags & SMB2_FLAGS_SIGNED) || server->tcpStatus == CifsNeedNegotiate) @@ -635,7 +631,7 @@ smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst) { int rc; struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base; + (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; smb2_seq_num_into_buf(ses->server, shdr); @@ -656,7 +652,7 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) { int rc; struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base; + (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; smb2_seq_num_into_buf(server, shdr); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 1f1a68f89110..63f25f919b24 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -241,13 +241,14 @@ __smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) int rc; struct kvec *iov = rqst->rq_iov; int n_vec = rqst->rq_nvec; - unsigned int smb_buf_length = get_rfc1002_length(iov[0].iov_base); - unsigned long send_length; + unsigned int send_length; unsigned int i; size_t total_len = 0, sent, size; struct socket *ssocket = server->ssocket; struct msghdr smb_msg; int val = 1; + __be32 rfc1002_marker; + if (cifs_rdma_enabled(server) && server->smbd_conn) { rc = smbd_send(server->smbd_conn, rqst); goto smbd_done; @@ -255,26 +256,34 @@ __smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) if (ssocket == NULL) return -ENOTSOCK; - /* sanity check send length */ send_length = rqst_len(rqst); - if (send_length != smb_buf_length + 4) { - WARN(1, "Send length mismatch(send_length=%lu smb_buf_length=%u)\n", - send_length, smb_buf_length); - return -EIO; - } - - if (n_vec < 2) - return -EIO; - - cifs_dbg(FYI, "Sending smb: smb_len=%u\n", smb_buf_length); - dump_smb(iov[0].iov_base, iov[0].iov_len); - dump_smb(iov[1].iov_base, iov[1].iov_len); + rfc1002_marker = cpu_to_be32(send_length); /* cork the socket */ kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, (char *)&val, sizeof(val)); size = 0; + /* Generate a rfc1002 marker for SMB2+ */ + if (server->vals->header_preamble_size == 0) { + struct kvec hiov = { + .iov_base = &rfc1002_marker, + .iov_len = 4 + }; + iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC, &hiov, + 1, 4); + rc = smb_send_kvec(server, &smb_msg, &sent); + if (rc < 0) + goto uncork; + + total_len += sent; + send_length += 4; + } + + cifs_dbg(FYI, "Sending smb: smb_len=%u\n", send_length); + dump_smb(iov[0].iov_base, iov[0].iov_len); + dump_smb(iov[1].iov_base, iov[1].iov_len); + for (i = 0; i < n_vec; i++) size += iov[i].iov_len; @@ -308,9 +317,9 @@ uncork: kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, (char *)&val, sizeof(val)); - if ((total_len > 0) && (total_len != smb_buf_length + 4)) { + if ((total_len > 0) && (total_len != send_length)) { cifs_dbg(FYI, "partial send (wanted=%u sent=%zu): terminating session\n", - smb_buf_length + 4, total_len); + send_length, total_len); /* * If we have only sent part of an SMB then the next SMB could * be taken as the remainder of this one. We need to kill the @@ -730,7 +739,6 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, * to the same server. We may make this configurable later or * use ses->maxReq. */ - rc = wait_for_free_request(ses->server, timeout, optype); if (rc) return rc; @@ -766,8 +774,8 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, #ifdef CONFIG_CIFS_SMB311 if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) - smb311_update_preauth_hash(ses, rqst->rq_iov+1, - rqst->rq_nvec-1); + smb311_update_preauth_hash(ses, rqst->rq_iov, + rqst->rq_nvec); #endif if (timeout == CIFS_ASYNC_OP) @@ -812,8 +820,8 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses, #ifdef CONFIG_CIFS_SMB311 if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) { struct kvec iov = { - .iov_base = buf, - .iov_len = midQ->resp_buf_size + .iov_base = resp_iov->iov_base, + .iov_len = resp_iov->iov_len }; smb311_update_preauth_hash(ses, &iov, 1); } @@ -879,39 +887,13 @@ smb2_send_recv(const unsigned int xid, struct cifs_ses *ses, const int flags, struct kvec *resp_iov) { struct smb_rqst rqst; - struct kvec s_iov[CIFS_MAX_IOV_SIZE], *new_iov; int rc; - int i; - __u32 count; - __be32 rfc1002_marker; - - if (n_vec + 1 > CIFS_MAX_IOV_SIZE) { - new_iov = kmalloc_array(n_vec + 1, sizeof(struct kvec), - GFP_KERNEL); - if (!new_iov) - return -ENOMEM; - } else - new_iov = s_iov; - - /* 1st iov is an RFC1002 Session Message length */ - memcpy(new_iov + 1, iov, (sizeof(struct kvec) * n_vec)); - - count = 0; - for (i = 1; i < n_vec + 1; i++) - count += new_iov[i].iov_len; - - rfc1002_marker = cpu_to_be32(count); - - new_iov[0].iov_base = &rfc1002_marker; - new_iov[0].iov_len = 4; memset(&rqst, 0, sizeof(struct smb_rqst)); - rqst.rq_iov = new_iov; - rqst.rq_nvec = n_vec + 1; + rqst.rq_iov = iov; + rqst.rq_nvec = n_vec; rc = cifs_send_recv(xid, ses, &rqst, resp_buf_type, flags, resp_iov); - if (n_vec + 1 > CIFS_MAX_IOV_SIZE) - kfree(new_iov); return rc; } -- cgit v1.2.3 From 40eff45b5dc7df73fc8926c9bd81bde2d4c8ccca Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 12 Jun 2018 08:00:59 +1000 Subject: cifs: remove smb2_send_recv() Now that we have the plumbing to pass request without an rfc1002 header all the way down to the point we write to the socket we no longer need the smb2_send_recv() function. Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 4 -- fs/cifs/smb2pdu.c | 137 ++++++++++++++++++++++++++++++++++++++++++++-------- fs/cifs/transport.c | 17 ------- 3 files changed, 116 insertions(+), 42 deletions(-) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 4f9218281ff3..03018be17283 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -112,10 +112,6 @@ extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, struct kvec *, int /* nvec to send */, int * /* type of buf returned */, const int flags, struct kvec * /* resp vec */); -extern int smb2_send_recv(const unsigned int xid, struct cifs_ses *pses, - struct kvec *pkvec, int nvec_to_send, - int *pbuftype, const int flags, - struct kvec *presp); extern int SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *ptcon, struct smb_hdr *in_buf , diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index c48608c5a0fb..7daf38ab814a 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -602,6 +602,7 @@ static void assemble_neg_contexts(struct smb2_negotiate_req *req, int SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) { + struct smb_rqst rqst; struct smb2_negotiate_req *req; struct smb2_negotiate_rsp *rsp; struct kvec iov[1]; @@ -673,7 +674,11 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_negotiate_rsp *)rsp_iov.iov_base; /* @@ -1028,6 +1033,7 @@ static int SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) { int rc; + struct smb_rqst rqst; struct smb2_sess_setup_req *req = sess_data->iov[0].iov_base; struct kvec rsp_iov = { NULL, 0 }; @@ -1036,10 +1042,13 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) cpu_to_le16(sizeof(struct smb2_sess_setup_req) - 1 /* pad */); req->SecurityBufferLength = cpu_to_le16(sess_data->iov[1].iov_len); - /* BB add code to build os and lm fields */ + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = sess_data->iov; + rqst.rq_nvec = 2; - rc = smb2_send_recv(sess_data->xid, sess_data->ses, - sess_data->iov, 2, + /* BB add code to build os and lm fields */ + rc = cifs_send_recv(sess_data->xid, sess_data->ses, + &rqst, &sess_data->buf0_type, CIFS_LOG_ERROR | CIFS_NEG_OP, &rsp_iov); cifs_small_buf_release(sess_data->iov[0].iov_base); @@ -1377,6 +1386,7 @@ out: int SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) { + struct smb_rqst rqst; struct smb2_logoff_req *req; /* response is also trivial struct */ int rc = 0; struct TCP_Server_Info *server; @@ -1414,7 +1424,11 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); /* * No tcon so can't do @@ -1444,6 +1458,7 @@ int SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, struct cifs_tcon *tcon, const struct nls_table *cp) { + struct smb_rqst rqst; struct smb2_tree_connect_req *req; struct smb2_tree_connect_rsp *rsp = NULL; struct kvec iov[2]; @@ -1500,7 +1515,11 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, !smb3_encryption_required(tcon)) req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; - rc = smb2_send_recv(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 2; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base; @@ -1564,6 +1583,7 @@ tcon_error_exit: int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) { + struct smb_rqst rqst; struct smb2_tree_disconnect_req *req; /* response is trivial */ int rc = 0; struct cifs_ses *ses = tcon->ses; @@ -1594,7 +1614,11 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) cifs_stats_fail_inc(tcon, SMB2_TREE_DISCONNECT_HE); @@ -1892,6 +1916,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, __u8 *oplock, struct smb2_file_all_info *buf, struct kvec *err_iov, int *buftype) { + struct smb_rqst rqst; struct smb2_create_req *req; struct smb2_create_rsp *rsp; struct TCP_Server_Info *server; @@ -2044,7 +2069,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, } #endif /* SMB311 */ - rc = smb2_send_recv(xid, ses, iov, n_iov, &resp_buftype, flags, + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = n_iov; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_create_rsp *)rsp_iov.iov_base; @@ -2100,6 +2129,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, char *in_data, u32 indatalen, char **out_data, u32 *plen /* returned data len */) { + struct smb_rqst rqst; struct smb2_ioctl_req *req; struct smb2_ioctl_rsp *rsp; struct cifs_ses *ses; @@ -2190,7 +2220,11 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; - rc = smb2_send_recv(xid, ses, iov, n_iov, &resp_buftype, flags, + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = n_iov; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base; @@ -2275,6 +2309,7 @@ int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, int flags) { + struct smb_rqst rqst; struct smb2_close_req *req; struct smb2_close_rsp *rsp; struct cifs_ses *ses = tcon->ses; @@ -2302,7 +2337,11 @@ SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_close_rsp *)rsp_iov.iov_base; @@ -2388,6 +2427,7 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, u32 additional_info, size_t output_len, size_t min_len, void **data, u32 *dlen) { + struct smb_rqst rqst; struct smb2_query_info_req *req; struct smb2_query_info_rsp *rsp = NULL; struct kvec iov[2]; @@ -2428,7 +2468,11 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon, /* 1 for Buffer */ iov[0].iov_len = total_len - 1; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; @@ -2630,6 +2674,7 @@ int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid) { + struct smb_rqst rqst; struct smb2_flush_req *req; struct cifs_ses *ses = tcon->ses; struct kvec iov[1]; @@ -2657,7 +2702,11 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); if (rc != 0) { @@ -2918,6 +2967,7 @@ int SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, char **buf, int *buf_type) { + struct smb_rqst rqst; int resp_buftype, rc = -EACCES; struct smb2_read_plain_req *req = NULL; struct smb2_read_rsp *rsp = NULL; @@ -2938,7 +2988,11 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_read_rsp *)rsp_iov.iov_base; @@ -3197,6 +3251,7 @@ int SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, unsigned int *nbytes, struct kvec *iov, int n_vec) { + struct smb_rqst rqst; int rc = 0; struct smb2_write_req *req = NULL; struct smb2_write_rsp *rsp = NULL; @@ -3238,7 +3293,11 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, /* 1 for Buffer */ iov[0].iov_len = total_len - 1; - rc = smb2_send_recv(xid, io_parms->tcon->ses, iov, n_vec + 1, + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = n_vec + 1; + + rc = cifs_send_recv(xid, io_parms->tcon->ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_write_rsp *)rsp_iov.iov_base; @@ -3310,6 +3369,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, int index, struct cifs_search_info *srch_inf) { + struct smb_rqst rqst; struct smb2_query_directory_req *req; struct smb2_query_directory_rsp *rsp = NULL; struct kvec iov[2]; @@ -3382,7 +3442,11 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, iov[1].iov_base = (char *)(req->Buffer); iov[1].iov_len = len; - rc = smb2_send_recv(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 2; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base; @@ -3441,6 +3505,7 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, u8 info_type, u32 additional_info, unsigned int num, void **data, unsigned int *size) { + struct smb_rqst rqst; struct smb2_set_info_req *req; struct smb2_set_info_rsp *rsp = NULL; struct kvec *iov; @@ -3496,7 +3561,11 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon, iov[i].iov_len = size[i]; } - rc = smb2_send_recv(xid, ses, iov, num, &resp_buftype, flags, + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = num; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(req); rsp = (struct smb2_set_info_rsp *)rsp_iov.iov_base; @@ -3651,6 +3720,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, const u64 persistent_fid, const u64 volatile_fid, __u8 oplock_level) { + struct smb_rqst rqst; int rc; struct smb2_oplock_break *req = NULL; struct cifs_ses *ses = tcon->ses; @@ -3679,7 +3749,11 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) { @@ -3742,6 +3816,7 @@ int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, struct kstatfs *fsdata) { + struct smb_rqst rqst; struct smb2_query_info_rsp *rsp = NULL; struct kvec iov; struct kvec rsp_iov; @@ -3760,7 +3835,11 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - rc = smb2_send_recv(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = &iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(iov.iov_base); if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); @@ -3785,6 +3864,7 @@ int SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, int level) { + struct smb_rqst rqst; struct smb2_query_info_rsp *rsp = NULL; struct kvec iov; struct kvec rsp_iov; @@ -3816,7 +3896,11 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - rc = smb2_send_recv(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = &iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); cifs_small_buf_release(iov.iov_base); if (rc) { cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); @@ -3855,6 +3939,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, const __u64 persist_fid, const __u64 volatile_fid, const __u32 pid, const __u32 num_lock, struct smb2_lock_element *buf) { + struct smb_rqst rqst; int rc = 0; struct smb2_lock_req *req = NULL; struct kvec iov[2]; @@ -3887,7 +3972,12 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, iov[1].iov_len = count; cifs_stats_inc(&tcon->stats.cifs_stats.num_locks); - rc = smb2_send_recv(xid, tcon->ses, iov, 2, &resp_buf_type, flags, + + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 2; + + rc = cifs_send_recv(xid, tcon->ses, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) { @@ -3921,6 +4011,7 @@ int SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, __u8 *lease_key, const __le32 lease_state) { + struct smb_rqst rqst; int rc; struct smb2_lease_ack *req = NULL; struct cifs_ses *ses = tcon->ses; @@ -3951,7 +4042,11 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; - rc = smb2_send_recv(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov); + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = 1; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buf_type, flags, &rsp_iov); cifs_small_buf_release(req); if (rc) { diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 63f25f919b24..8b922ffc2345 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -880,23 +880,6 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, return rc; } -/* Like SendReceive2 but iov[0] does not contain an rfc1002 header */ -int -smb2_send_recv(const unsigned int xid, struct cifs_ses *ses, - struct kvec *iov, int n_vec, int *resp_buf_type /* ret */, - const int flags, struct kvec *resp_iov) -{ - struct smb_rqst rqst; - int rc; - - memset(&rqst, 0, sizeof(struct smb_rqst)); - rqst.rq_iov = iov; - rqst.rq_nvec = n_vec; - - rc = cifs_send_recv(xid, ses, &rqst, resp_buf_type, flags, resp_iov); - return rc; -} - int SendReceive(const unsigned int xid, struct cifs_ses *ses, struct smb_hdr *in_buf, struct smb_hdr *out_buf, -- cgit v1.2.3 From 07cd952f3ad51bc7f87dcdba0faac979ee559fd3 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 12 Jun 2018 08:01:00 +1000 Subject: cifs: update __smb_send_rqst() to take an array of requests Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/transport.c | 79 +++++++++++++++++++++++++++++------------------------ 1 file changed, 44 insertions(+), 35 deletions(-) diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 8b922ffc2345..13c244dfb3c1 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -202,7 +202,7 @@ smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg, } static unsigned long -rqst_len(struct smb_rqst *rqst) +smb2_rqst_len(struct smb_rqst *rqst) { unsigned int i; struct kvec *iov = rqst->rq_iov; @@ -236,13 +236,14 @@ rqst_len(struct smb_rqst *rqst) } static int -__smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) +__smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, + struct smb_rqst *rqst) { - int rc; - struct kvec *iov = rqst->rq_iov; - int n_vec = rqst->rq_nvec; - unsigned int send_length; - unsigned int i; + int rc = 0; + struct kvec *iov; + int n_vec; + unsigned int send_length = 0; + unsigned int i, j; size_t total_len = 0, sent, size; struct socket *ssocket = server->ssocket; struct msghdr smb_msg; @@ -256,14 +257,14 @@ __smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) if (ssocket == NULL) return -ENOTSOCK; - send_length = rqst_len(rqst); - rfc1002_marker = cpu_to_be32(send_length); - /* cork the socket */ kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK, (char *)&val, sizeof(val)); - size = 0; + for (j = 0; j < num_rqst; j++) + send_length += smb2_rqst_len(&rqst[j]); + rfc1002_marker = cpu_to_be32(send_length); + /* Generate a rfc1002 marker for SMB2+ */ if (server->vals->header_preamble_size == 0) { struct kvec hiov = { @@ -280,35 +281,43 @@ __smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) send_length += 4; } - cifs_dbg(FYI, "Sending smb: smb_len=%u\n", send_length); - dump_smb(iov[0].iov_base, iov[0].iov_len); - dump_smb(iov[1].iov_base, iov[1].iov_len); + for (j = 0; j < num_rqst; j++) { + iov = rqst[j].rq_iov; + n_vec = rqst[j].rq_nvec; - for (i = 0; i < n_vec; i++) - size += iov[i].iov_len; + cifs_dbg(FYI, "Sending smb: smb_len=%u\n", send_length); + dump_smb(iov[0].iov_base, iov[0].iov_len); + dump_smb(iov[1].iov_base, iov[1].iov_len); - iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC, iov, n_vec, size); - - rc = smb_send_kvec(server, &smb_msg, &sent); - if (rc < 0) - goto uncork; + size = 0; + for (i = 0; i < n_vec; i++) + size += iov[i].iov_len; - total_len += sent; + iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC, + iov, n_vec, size); - /* now walk the page array and send each page in it */ - for (i = 0; i < rqst->rq_npages; i++) { - struct bio_vec bvec; - - bvec.bv_page = rqst->rq_pages[i]; - rqst_page_get_length(rqst, i, &bvec.bv_len, &bvec.bv_offset); - - iov_iter_bvec(&smb_msg.msg_iter, WRITE | ITER_BVEC, - &bvec, 1, bvec.bv_len); rc = smb_send_kvec(server, &smb_msg, &sent); if (rc < 0) - break; + goto uncork; total_len += sent; + + /* now walk the page array and send each page in it */ + for (i = 0; i < rqst[j].rq_npages; i++) { + struct bio_vec bvec; + + bvec.bv_page = rqst[j].rq_pages[i]; + rqst_page_get_length(&rqst[j], i, &bvec.bv_len, + &bvec.bv_offset); + + iov_iter_bvec(&smb_msg.msg_iter, WRITE | ITER_BVEC, + &bvec, 1, bvec.bv_len); + rc = smb_send_kvec(server, &smb_msg, &sent); + if (rc < 0) + break; + + total_len += sent; + } } uncork: @@ -344,7 +353,7 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst, int flags) int rc; if (!(flags & CIFS_TRANSFORM_REQ)) - return __smb_send_rqst(server, rqst); + return __smb_send_rqst(server, 1, rqst); if (!server->ops->init_transform_rq || !server->ops->free_transform_rq) { @@ -356,7 +365,7 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst, int flags) if (rc) return rc; - rc = __smb_send_rqst(server, &cur_rqst); + rc = __smb_send_rqst(server, 1, &cur_rqst); server->ops->free_transform_rq(&cur_rqst); return rc; } @@ -374,7 +383,7 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer, iov[1].iov_base = (char *)smb_buffer + 4; iov[1].iov_len = smb_buf_length; - return __smb_send_rqst(server, &rqst); + return __smb_send_rqst(server, 1, &rqst); } static int -- cgit v1.2.3 From cd2dca60be6f4742b63373fd7e4b445c8a634e51 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Wed, 13 Jun 2018 13:54:14 -0300 Subject: cifs: Fix encryption/signing Since the rfc1002 generation was moved down to __smb_send_rqst(), the transform header is now in rqst->rq_iov[0]. Correctly assign the transform header pointer in crypt_message(). Signed-off-by: Paulo Alcantara Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 9153407f97e8..c3648e9b5ec7 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -2187,7 +2187,7 @@ static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf, } /* Assumes: - * rqst->rq_iov[0] is tranform header + * rqst->rq_iov[0] is transform header * rqst->rq_iov[1+] data to be encrypted/decrypted */ static struct scatterlist * @@ -2249,7 +2249,7 @@ static int crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc) { struct smb2_transform_hdr *tr_hdr = - (struct smb2_transform_hdr *)rqst->rq_iov[1].iov_base; + (struct smb2_transform_hdr *)rqst->rq_iov[0].iov_base; unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20; int rc = 0; struct scatterlist *sg; -- cgit v1.2.3 From e2292430c49dbbe2d54438a4c05dd937a8eeecdd Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Thu, 14 Jun 2018 15:43:16 +0200 Subject: CIFS: move default port definitions to cifsglob.h Signed-off-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 3 +++ fs/cifs/connect.c | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index ff71fbd619bf..ff280447dae5 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -33,6 +33,9 @@ #define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ +#define CIFS_PORT 445 +#define RFC1001_PORT 139 + /* * The sizes of various internal tables and strings */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 267c6f70cf98..a57da1b88bdf 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -57,9 +57,6 @@ #include "smb2proto.h" #include "smbdirect.h" -#define CIFS_PORT 445 -#define RFC1001_PORT 139 - extern mempool_t *cifs_req_poolp; extern bool disable_legacy_dialects; -- cgit v1.2.3 From bead042cccca5a7c5626b851b66a30698aa0ac36 Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Thu, 14 Jun 2018 15:43:17 +0200 Subject: CIFS: complete PDU definitions for interface queries Signed-off-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/smb2pdu.h | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index a345560001ce..824dddeee3f2 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -851,8 +851,11 @@ struct validate_negotiate_info_rsp { __le16 Dialect; /* Dialect in use for the connection */ } __packed; -#define RSS_CAPABLE 0x00000001 -#define RDMA_CAPABLE 0x00000002 +#define RSS_CAPABLE cpu_to_le32(0x00000001) +#define RDMA_CAPABLE cpu_to_le32(0x00000002) + +#define INTERNETWORK cpu_to_le16(0x0002) +#define INTERNETWORKV6 cpu_to_le16(0x0017) struct network_interface_info_ioctl_rsp { __le32 Next; /* next interface. zero if this is last one */ @@ -860,7 +863,21 @@ struct network_interface_info_ioctl_rsp { __le32 Capability; /* RSS or RDMA Capable */ __le32 Reserved; __le64 LinkSpeed; - char SockAddr_Storage[128]; + __le16 Family; + __u8 Buffer[126]; +} __packed; + +struct iface_info_ipv4 { + __be16 Port; + __be32 IPv4Address; + __be64 Reserved; +} __packed; + +struct iface_info_ipv6 { + __be16 Port; + __be32 FlowInfo; + __u8 IPv6Address[16]; + __be32 ScopeId; } __packed; #define NO_FILE_ID 0xFFFFFFFFFFFFFFFFULL /* general ioctls to srv not to file */ -- cgit v1.2.3 From b6f0dd5d75f9689d57c7ea49e52c80cabb876cb4 Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Thu, 14 Jun 2018 15:43:18 +0200 Subject: CIFS: add iface info to struct cifs_ses Signed-off-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 21 +++++++++++++++++++++ fs/cifs/misc.c | 2 ++ 2 files changed, 23 insertions(+) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index ff280447dae5..9dd5f1a3d64b 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -841,6 +841,13 @@ static inline void cifs_set_net_ns(struct TCP_Server_Info *srv, struct net *net) #endif +struct cifs_server_iface { + size_t speed; + unsigned int rdma_capable : 1; + unsigned int rss_capable : 1; + struct sockaddr_storage sockaddr; +}; + /* * Session structure. One of these for each uid session with a particular host */ @@ -878,6 +885,20 @@ struct cifs_ses { #ifdef CONFIG_CIFS_SMB311 __u8 preauth_sha_hash[SMB2_PREAUTH_HASH_SIZE]; #endif /* 3.1.1 */ + + /* + * Network interfaces available on the server this session is + * connected to. + * + * Other channels can be opened by connecting and binding this + * session to interfaces from this list. + * + * iface_lock should be taken when accessing any of these fields + */ + spinlock_t iface_lock; + struct cifs_server_iface *iface_list; + size_t iface_count; + unsigned long iface_last_update; /* jiffies */ }; static inline bool diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index bb40f3c050f6..53e8362cbc4a 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -82,6 +82,7 @@ sesInfoAlloc(void) INIT_LIST_HEAD(&ret_buf->smb_ses_list); INIT_LIST_HEAD(&ret_buf->tcon_list); mutex_init(&ret_buf->session_mutex); + spin_lock_init(&ret_buf->iface_lock); } return ret_buf; } @@ -102,6 +103,7 @@ sesInfoFree(struct cifs_ses *buf_to_free) kfree(buf_to_free->user_name); kfree(buf_to_free->domainName); kzfree(buf_to_free->auth_key.response); + kfree(buf_to_free->iface_list); kzfree(buf_to_free); } -- cgit v1.2.3 From fe856be475f7cf5ffcde57341d175ce9fd09434b Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Thu, 14 Jun 2018 17:04:51 +0200 Subject: CIFS: parse and store info on iface queries Signed-off-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 170 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 155 insertions(+), 15 deletions(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index c3648e9b5ec7..b2390e9a6843 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -294,34 +294,176 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info) return rsize; } -#ifdef CONFIG_CIFS_STATS2 + +static int +parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf, + size_t buf_len, + struct cifs_server_iface **iface_list, + size_t *iface_count) +{ + struct network_interface_info_ioctl_rsp *p; + struct sockaddr_in *addr4; + struct sockaddr_in6 *addr6; + struct iface_info_ipv4 *p4; + struct iface_info_ipv6 *p6; + struct cifs_server_iface *info; + ssize_t bytes_left; + size_t next = 0; + int nb_iface = 0; + int rc = 0; + + *iface_list = NULL; + *iface_count = 0; + + /* + * Fist pass: count and sanity check + */ + + bytes_left = buf_len; + p = buf; + while (bytes_left >= sizeof(*p)) { + nb_iface++; + next = le32_to_cpu(p->Next); + if (!next) { + bytes_left -= sizeof(*p); + break; + } + p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next); + bytes_left -= next; + } + + if (!nb_iface) { + cifs_dbg(VFS, "%s: malformed interface info\n", __func__); + rc = -EINVAL; + goto out; + } + + if (bytes_left || p->Next) + cifs_dbg(VFS, "%s: incomplete interface info\n", __func__); + + + /* + * Second pass: extract info to internal structure + */ + + *iface_list = kcalloc(nb_iface, sizeof(**iface_list), GFP_KERNEL); + if (!*iface_list) { + rc = -ENOMEM; + goto out; + } + + info = *iface_list; + bytes_left = buf_len; + p = buf; + while (bytes_left >= sizeof(*p)) { + info->speed = le64_to_cpu(p->LinkSpeed); + info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE); + info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE); + + cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, *iface_count); + cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed); + cifs_dbg(FYI, "%s: capabilities 0x%08x\n", __func__, + le32_to_cpu(p->Capability)); + + switch (p->Family) { + /* + * The kernel and wire socket structures have the same + * layout and use network byte order but make the + * conversion explicit in case either one changes. + */ + case INTERNETWORK: + addr4 = (struct sockaddr_in *)&info->sockaddr; + p4 = (struct iface_info_ipv4 *)p->Buffer; + addr4->sin_family = AF_INET; + memcpy(&addr4->sin_addr, &p4->IPv4Address, 4); + + /* [MS-SMB2] 2.2.32.5.1.1 Clients MUST ignore these */ + addr4->sin_port = cpu_to_be16(CIFS_PORT); + + cifs_dbg(FYI, "%s: ipv4 %pI4\n", __func__, + &addr4->sin_addr); + break; + case INTERNETWORKV6: + addr6 = (struct sockaddr_in6 *)&info->sockaddr; + p6 = (struct iface_info_ipv6 *)p->Buffer; + addr6->sin6_family = AF_INET6; + memcpy(&addr6->sin6_addr, &p6->IPv6Address, 16); + + /* [MS-SMB2] 2.2.32.5.1.2 Clients MUST ignore these */ + addr6->sin6_flowinfo = 0; + addr6->sin6_scope_id = 0; + addr6->sin6_port = cpu_to_be16(CIFS_PORT); + + cifs_dbg(FYI, "%s: ipv6 %pI6\n", __func__, + &addr6->sin6_addr); + break; + default: + cifs_dbg(VFS, + "%s: skipping unsupported socket family\n", + __func__); + goto next_iface; + } + + (*iface_count)++; + info++; +next_iface: + next = le32_to_cpu(p->Next); + if (!next) + break; + p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next); + bytes_left -= next; + } + + if (!*iface_count) { + rc = -EINVAL; + goto out; + } + +out: + if (rc) { + kfree(*iface_list); + *iface_count = 0; + *iface_list = NULL; + } + return rc; +} + + static int SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) { int rc; unsigned int ret_data_len = 0; - struct network_interface_info_ioctl_rsp *out_buf; + struct network_interface_info_ioctl_rsp *out_buf = NULL; + struct cifs_server_iface *iface_list; + size_t iface_count; + struct cifs_ses *ses = tcon->ses; rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID, FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */, NULL /* no data input */, 0 /* no data input */, (char **)&out_buf, &ret_data_len); - if (rc != 0) + if (rc != 0) { cifs_dbg(VFS, "error %d on ioctl to get interface list\n", rc); - else if (ret_data_len < sizeof(struct network_interface_info_ioctl_rsp)) { - cifs_dbg(VFS, "server returned bad net interface info buf\n"); - rc = -EINVAL; - } else { - /* Dump info on first interface */ - cifs_dbg(FYI, "Adapter Capability 0x%x\t", - le32_to_cpu(out_buf->Capability)); - cifs_dbg(FYI, "Link Speed %lld\n", - le64_to_cpu(out_buf->LinkSpeed)); + goto out; } + + rc = parse_server_interfaces(out_buf, ret_data_len, + &iface_list, &iface_count); + if (rc) + goto out; + + spin_lock(&ses->iface_lock); + kfree(ses->iface_list); + ses->iface_list = iface_list; + ses->iface_count = iface_count; + ses->iface_last_update = jiffies; + spin_unlock(&ses->iface_lock); + +out: kfree(out_buf); return rc; } -#endif /* STATS2 */ void smb2_cached_lease_break(struct work_struct *work) @@ -399,9 +541,7 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon) if (rc) return; -#ifdef CONFIG_CIFS_STATS2 SMB3_request_interfaces(xid, tcon); -#endif /* STATS2 */ SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, FS_ATTRIBUTE_INFORMATION); -- cgit v1.2.3 From bc0fe8b207a14a6d7ecf8812edb92c8bdd5c9b2d Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Thu, 14 Jun 2018 15:43:20 +0200 Subject: CIFS: dump every session iface info Signed-off-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/cifs_debug.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 116146022aa1..e8db245194e7 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -126,6 +126,25 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) seq_putc(m, '\n'); } +static void +cifs_dump_iface(struct seq_file *m, struct cifs_server_iface *iface) +{ + struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr; + struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&iface->sockaddr; + + seq_printf(m, "\t\tSpeed: %zu bps\n", iface->speed); + seq_puts(m, "\t\tCapabilities: "); + if (iface->rdma_capable) + seq_puts(m, "rdma "); + if (iface->rss_capable) + seq_puts(m, "rss "); + seq_putc(m, '\n'); + if (iface->sockaddr.ss_family == AF_INET) + seq_printf(m, "\t\tIPv4: %pI4\n", &ipv4->sin_addr); + else if (iface->sockaddr.ss_family == AF_INET6) + seq_printf(m, "\t\tIPv6: %pI6\n", &ipv6->sin6_addr); +} + static int cifs_debug_data_proc_show(struct seq_file *m, void *v) { struct list_head *tmp1, *tmp2, *tmp3; @@ -312,6 +331,14 @@ skip_rdma: mid_entry->mid); } spin_unlock(&GlobalMid_Lock); + + spin_lock(&ses->iface_lock); + seq_printf(m, "\n\tServer interfaces: %zu\n", ses->iface_count); + for (j = 0; j < ses->iface_count; j++) { + seq_printf(m, "\t%d)\n", j); + cifs_dump_iface(m, &ses->iface_list[j]); + } + spin_unlock(&ses->iface_lock); } } spin_unlock(&cifs_tcp_ses_lock); -- cgit v1.2.3 From 662bf5bc0a6f7b2abf7f9125c6319f06bb2efcf9 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Thu, 14 Jun 2018 17:34:08 -0300 Subject: cifs: Fix kernel oops when traceSMB is enabled When traceSMB is enabled through 'echo 1 > /proc/fs/cifs/traceSMB', after a mount, the following oops is triggered: [ 27.137943] BUG: unable to handle kernel paging request at ffff8800f80c268b [ 27.143396] PGD 2c6b067 P4D 2c6b067 PUD 0 [ 27.145386] Oops: 0000 [#1] SMP PTI [ 27.146186] CPU: 2 PID: 2655 Comm: mount.cifs Not tainted 4.17.0+ #39 [ 27.147174] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014 [ 27.148969] RIP: 0010:hex_dump_to_buffer+0x413/0x4b0 [ 27.149738] Code: 48 8b 44 24 08 31 db 45 31 d2 48 89 6c 24 18 44 89 6c 24 24 48 c7 c1 78 b5 23 82 4c 89 64 24 10 44 89 d5 41 89 dc 4c 8d 58 02 <44> 0f b7 00 4d 89 dd eb 1f 83 c5 01 41 01 c4 41 39 ef 0f 84 48 fe [ 27.152396] RSP: 0018:ffffc9000058f8c0 EFLAGS: 00010246 [ 27.153129] RAX: ffff8800f80c268b RBX: 0000000000000000 RCX: ffffffff8223b578 [ 27.153867] RDX: 0000000000000000 RSI: ffffffff81a55496 RDI: 0000000000000008 [ 27.154612] RBP: 0000000000000000 R08: 0000000000000020 R09: 0000000000000083 [ 27.155355] R10: 0000000000000000 R11: ffff8800f80c268d R12: 0000000000000000 [ 27.156101] R13: 0000000000000002 R14: ffffc9000058f94d R15: 0000000000000008 [ 27.156838] FS: 00007f1693a6b740(0000) GS:ffff88007fd00000(0000) knlGS:0000000000000000 [ 27.158354] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 27.159093] CR2: ffff8800f80c268b CR3: 00000000798fa001 CR4: 0000000000360ee0 [ 27.159892] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 27.160661] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 27.161464] Call Trace: [ 27.162123] print_hex_dump+0xd3/0x160 [ 27.162814] journal-offline (2658) used greatest stack depth: 13144 bytes left [ 27.162824] ? __release_sock+0x60/0xd0 [ 27.165344] ? tcp_sendmsg+0x31/0x40 [ 27.166177] dump_smb+0x39/0x40 [ 27.166972] ? vsnprintf+0x236/0x490 [ 27.167807] __smb_send_rqst.constprop.12+0x103/0x430 [ 27.168554] ? apic_timer_interrupt+0xa/0x20 [ 27.169306] smb_send_rqst+0x48/0xc0 [ 27.169984] cifs_send_recv+0xda/0x420 [ 27.170639] SMB2_negotiate+0x23d/0xfa0 [ 27.171301] ? vsnprintf+0x236/0x490 [ 27.171961] ? smb2_negotiate+0x19/0x30 [ 27.172586] smb2_negotiate+0x19/0x30 [ 27.173257] cifs_negotiate_protocol+0x70/0xd0 [ 27.173935] ? kstrdup+0x43/0x60 [ 27.174551] cifs_get_smb_ses+0x295/0xbe0 [ 27.175260] ? lock_timer_base+0x67/0x80 [ 27.175936] ? __internal_add_timer+0x1a/0x50 [ 27.176575] ? add_timer+0x10f/0x230 [ 27.177267] cifs_mount+0x101/0x1190 [ 27.177940] ? cifs_smb3_do_mount+0x144/0x5c0 [ 27.178575] cifs_smb3_do_mount+0x144/0x5c0 [ 27.179270] mount_fs+0x35/0x150 [ 27.179930] vfs_kern_mount.part.28+0x54/0xf0 [ 27.180567] do_mount+0x5ad/0xc40 [ 27.181234] ? kmem_cache_alloc_trace+0xed/0x1a0 [ 27.181916] ksys_mount+0x80/0xd0 [ 27.182535] __x64_sys_mount+0x21/0x30 [ 27.183220] do_syscall_64+0x4e/0x100 [ 27.183882] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 27.184535] RIP: 0033:0x7f169339055a [ 27.185192] Code: 48 8b 0d 41 d9 2b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 0e d9 2b 00 f7 d8 64 89 01 48 [ 27.187268] RSP: 002b:00007fff7b44eb58 EFLAGS: 00000202 ORIG_RAX: 00000000000000a5 [ 27.188515] RAX: ffffffffffffffda RBX: 00007f1693a7e70e RCX: 00007f169339055a [ 27.189244] RDX: 000055b9f97f64e5 RSI: 000055b9f97f652c RDI: 00007fff7b45074f [ 27.189974] RBP: 000055b9fb8c9260 R08: 000055b9fb8ca8f0 R09: 0000000000000000 [ 27.190721] R10: 0000000000000000 R11: 0000000000000202 R12: 000055b9fb8ca8f0 [ 27.191429] R13: 0000000000000000 R14: 00007f1693a7c000 R15: 00007f1693a7e91d [ 27.192167] Modules linked in: [ 27.192797] CR2: ffff8800f80c268b [ 27.193435] ---[ end trace 67404c618badf323 ]--- The problem was that dump_smb() had been called with an invalid pointer, that is, in __smb_send_rqst(), iov[1] doesn't exist (n_vec == 1). This patch fixes it by relying on the n_vec value to dump out the smb packets. Signed-off-by: Paulo Alcantara Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- fs/cifs/transport.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 13c244dfb3c1..a3ea42a4cb98 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -281,17 +281,17 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, send_length += 4; } + cifs_dbg(FYI, "Sending smb: smb_len=%u\n", send_length); + for (j = 0; j < num_rqst; j++) { iov = rqst[j].rq_iov; n_vec = rqst[j].rq_nvec; - cifs_dbg(FYI, "Sending smb: smb_len=%u\n", send_length); - dump_smb(iov[0].iov_base, iov[0].iov_len); - dump_smb(iov[1].iov_base, iov[1].iov_len); - size = 0; - for (i = 0; i < n_vec; i++) + for (i = 0; i < n_vec; i++) { + dump_smb(iov[i].iov_base, iov[i].iov_len); size += iov[i].iov_len; + } iov_iter_kvec(&smb_msg.msg_iter, WRITE | ITER_KVEC, iov, n_vec, size); -- cgit v1.2.3 From bea851b8babe6c87c36e97c9de0dd0bea0dd5802 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 14 Jun 2018 21:56:32 -0500 Subject: smb3: Fix mode on mkdir on smb311 mounts mkdir was not passing the mode on smb3.11 mounts with posix extensions Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 4 ++ fs/cifs/inode.c | 13 ++++- fs/cifs/smb2ops.c | 1 + fs/cifs/smb2pdu.c | 153 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/cifs/smb2proto.h | 4 ++ fs/cifs/trace.h | 3 +- 6 files changed, 175 insertions(+), 3 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 9dd5f1a3d64b..bd78da59a4fd 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -315,6 +315,10 @@ struct smb_version_operations { /* send echo request */ int (*echo)(struct TCP_Server_Info *); /* create directory */ + int (*posix_mkdir)(const unsigned int xid, struct inode *inode, + umode_t mode, struct cifs_tcon *tcon, + const char *full_path, + struct cifs_sb_info *cifs_sb); int (*mkdir)(const unsigned int, struct cifs_tcon *, const char *, struct cifs_sb_info *); /* set info on created directory */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index f4697f548a39..a2cfb33e85c1 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1575,6 +1575,17 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) goto mkdir_out; } + server = tcon->ses->server; + +#ifdef CONFIG_CIFS_SMB311 + if ((server->ops->posix_mkdir) && (tcon->posix_extensions)) { + rc = server->ops->posix_mkdir(xid, inode, mode, tcon, full_path, + cifs_sb); + d_drop(direntry); /* for time being always refresh inode info */ + goto mkdir_out; + } +#endif /* SMB311 */ + if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { rc = cifs_posix_mkdir(inode, direntry, mode, full_path, cifs_sb, @@ -1583,8 +1594,6 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) goto mkdir_out; } - server = tcon->ses->server; - if (!server->ops->mkdir) { rc = -ENOSYS; goto mkdir_out; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index b2390e9a6843..badcfb2f3c22 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3313,6 +3313,7 @@ struct smb_version_operations smb311_operations = { .set_compression = smb2_set_compression, .mkdir = smb2_mkdir, .mkdir_setinfo = smb2_mkdir_setinfo, + .posix_mkdir = smb311_posix_mkdir, .rmdir = smb2_rmdir, .unlink = smb2_unlink, .rename = smb2_rename_path, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 7daf38ab814a..810b85787c91 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1911,6 +1911,159 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len, return 0; } +#ifdef CONFIG_CIFS_SMB311 +int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, + umode_t mode, struct cifs_tcon *tcon, + const char *full_path, + struct cifs_sb_info *cifs_sb) +{ + struct smb_rqst rqst; + struct smb2_create_req *req; + struct smb2_create_rsp *rsp; + struct TCP_Server_Info *server; + struct cifs_ses *ses = tcon->ses; + struct kvec iov[3]; /* make sure at least one for each open context */ + struct kvec rsp_iov = {NULL, 0}; + int resp_buftype; + int uni_path_len; + __le16 *copy_path = NULL; + int copy_size; + int rc = 0; + unsigned int n_iov = 2; + __u32 file_attributes = 0; + char *pc_buf = NULL; + int flags = 0; + unsigned int total_len; + __le16 *path = cifs_convert_path_to_utf16(full_path, cifs_sb); + + if (!path) + return -ENOMEM; + + cifs_dbg(FYI, "mkdir\n"); + + if (ses && (ses->server)) + server = ses->server; + else + return -EIO; + + rc = smb2_plain_req_init(SMB2_CREATE, tcon, (void **) &req, &total_len); + + if (rc) + return rc; + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + + req->ImpersonationLevel = IL_IMPERSONATION; + req->DesiredAccess = cpu_to_le32(FILE_WRITE_ATTRIBUTES); + /* File attributes ignored on open (used in create though) */ + req->FileAttributes = cpu_to_le32(file_attributes); + req->ShareAccess = FILE_SHARE_ALL_LE; + req->CreateDisposition = cpu_to_le32(FILE_CREATE); + req->CreateOptions = cpu_to_le32(CREATE_NOT_FILE); + + iov[0].iov_base = (char *)req; + /* -1 since last byte is buf[0] which is sent below (path) */ + iov[0].iov_len = total_len - 1; + + req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req)); + + /* [MS-SMB2] 2.2.13 NameOffset: + * If SMB2_FLAGS_DFS_OPERATIONS is set in the Flags field of + * the SMB2 header, the file name includes a prefix that will + * be processed during DFS name normalization as specified in + * section 3.3.5.9. Otherwise, the file name is relative to + * the share that is identified by the TreeId in the SMB2 + * header. + */ + if (tcon->share_flags & SHI1005_FLAGS_DFS) { + int name_len; + + req->sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; + rc = alloc_path_with_tree_prefix(©_path, ©_size, + &name_len, + tcon->treeName, path); + if (rc) { + cifs_small_buf_release(req); + return rc; + } + req->NameLength = cpu_to_le16(name_len * 2); + uni_path_len = copy_size; + path = copy_path; + } else { + uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2; + /* MUST set path len (NameLength) to 0 opening root of share */ + req->NameLength = cpu_to_le16(uni_path_len - 2); + if (uni_path_len % 8 != 0) { + copy_size = roundup(uni_path_len, 8); + copy_path = kzalloc(copy_size, GFP_KERNEL); + if (!copy_path) { + cifs_small_buf_release(req); + return -ENOMEM; + } + memcpy((char *)copy_path, (const char *)path, + uni_path_len); + uni_path_len = copy_size; + path = copy_path; + } + } + + iov[1].iov_len = uni_path_len; + iov[1].iov_base = path; + req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_NONE; + + if (tcon->posix_extensions) { + if (n_iov > 2) { + struct create_context *ccontext = + (struct create_context *)iov[n_iov-1].iov_base; + ccontext->Next = + cpu_to_le32(iov[n_iov-1].iov_len); + } + + rc = add_posix_context(iov, &n_iov, mode); + if (rc) { + cifs_small_buf_release(req); + kfree(copy_path); + return rc; + } + pc_buf = iov[n_iov-1].iov_base; + } + + + memset(&rqst, 0, sizeof(struct smb_rqst)); + rqst.rq_iov = iov; + rqst.rq_nvec = n_iov; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, + &rsp_iov); + + cifs_small_buf_release(req); + rsp = (struct smb2_create_rsp *)rsp_iov.iov_base; + + if (rc != 0) { + cifs_stats_fail_inc(tcon, SMB2_CREATE_HE); + trace_smb3_posix_mkdir_err(xid, tcon->tid, ses->Suid, + CREATE_NOT_FILE, FILE_WRITE_ATTRIBUTES, rc); + goto smb311_mkdir_exit; + } else + trace_smb3_posix_mkdir_done(xid, rsp->PersistentFileId, tcon->tid, + ses->Suid, CREATE_NOT_FILE, + FILE_WRITE_ATTRIBUTES); + + SMB2_close(xid, tcon, rsp->PersistentFileId, rsp->VolatileFileId); + + /* Eventually save off posix specific response info and timestaps */ + +smb311_mkdir_exit: + kfree(copy_path); + kfree(pc_buf); + free_rsp_buf(resp_buftype, rsp); + return rc; + +} +#endif /* SMB311 */ + int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, __u8 *oplock, struct smb2_file_all_info *buf, diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index c84020057bd8..78371c1a6503 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -79,6 +79,10 @@ extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, bool set_alloc); extern int smb2_set_file_info(struct inode *inode, const char *full_path, FILE_BASIC_INFO *buf, const unsigned int xid); +extern int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, + umode_t mode, struct cifs_tcon *tcon, + const char *full_path, + struct cifs_sb_info *cifs_sb); extern int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path, diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index 61e74d455d90..67e413f6ee4d 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -378,7 +378,7 @@ DEFINE_EVENT(smb3_open_err_class, smb3_##name, \ TP_ARGS(xid, tid, sesid, create_options, desired_access, rc)) DEFINE_SMB3_OPEN_ERR_EVENT(open_err); - +DEFINE_SMB3_OPEN_ERR_EVENT(posix_mkdir_err); DECLARE_EVENT_CLASS(smb3_open_done_class, TP_PROTO(unsigned int xid, @@ -420,6 +420,7 @@ DEFINE_EVENT(smb3_open_done_class, smb3_##name, \ TP_ARGS(xid, fid, tid, sesid, create_options, desired_access)) DEFINE_SMB3_OPEN_DONE_EVENT(open_done); +DEFINE_SMB3_OPEN_DONE_EVENT(posix_mkdir_done); #endif /* _CIFS_TRACE_H */ -- cgit v1.2.3 From 115d5d288dc3368e3d6e7eb9ee213b342f072c23 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 14 Jun 2018 21:59:31 -0500 Subject: smb3: do not display empty interface list If server does not support listing interfaces then do not display empty "Server interfaces" line to avoid confusing users. Signed-off-by: Steve French CC: Aurelien Aptel --- fs/cifs/cifs_debug.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index e8db245194e7..bfe999505815 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -333,7 +333,9 @@ skip_rdma: spin_unlock(&GlobalMid_Lock); spin_lock(&ses->iface_lock); - seq_printf(m, "\n\tServer interfaces: %zu\n", ses->iface_count); + if (ses->iface_count) + seq_printf(m, "\n\tServer interfaces: %zu\n", + ses->iface_count); for (j = 0; j < ses->iface_count; j++) { seq_printf(m, "\t%d)\n", j); cifs_dump_iface(m, &ses->iface_list[j]); -- cgit v1.2.3 From d819d298c7258849d56eb400be436aff3ba2aae2 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 14 Jun 2018 22:30:56 -0500 Subject: smb3: fix corrupt path in subdirs on smb311 with posix Signed-off-by: Steve French --- fs/cifs/smb2misc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 0de87ca33e2e..3ff7cec2da81 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -454,7 +454,8 @@ cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb) #ifdef CONFIG_CIFS_SMB311 /* SMB311 POSIX extensions paths do not include leading slash */ else if (cifs_sb_master_tlink(cifs_sb) && - cifs_sb_master_tcon(cifs_sb)->posix_extensions) { + cifs_sb_master_tcon(cifs_sb)->posix_extensions && + (from[0] == '/')) { start_of_path = from + 1; } #endif /* 311 */ -- cgit v1.2.3 From f70b359b3830b7a5b72c78136edc740839b67acd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 1 Jun 2018 10:59:25 +0300 Subject: crypto: chtls - use after free in chtls_pt_recvmsg() We call chtls_free_skb() but then we dereference it on the next lines. Also "skb" can't be NULL, we just dereferenced it on the line before. I have moved the free down a couple lines to fix this issue. Fixes: 17a7d24aa89d ("crypto: chtls - generic handling of data and hdr") Signed-off-by: Dan Carpenter Signed-off-by: Herbert Xu --- drivers/crypto/chelsio/chtls/chtls_io.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/crypto/chelsio/chtls/chtls_io.c b/drivers/crypto/chelsio/chtls/chtls_io.c index 51fc6821cbbf..708e232e3cdf 100644 --- a/drivers/crypto/chelsio/chtls/chtls_io.c +++ b/drivers/crypto/chelsio/chtls/chtls_io.c @@ -1548,15 +1548,14 @@ skip_copy: tp->urg_data = 0; if ((avail + offset) >= skb->len) { - if (likely(skb)) - chtls_free_skb(sk, skb); - buffers_freed++; if (ULP_SKB_CB(skb)->flags & ULPCB_FLAG_TLS_HDR) { tp->copied_seq += skb->len; hws->rcvpld = skb->hdr_len; } else { tp->copied_seq += hws->rcvpld; } + chtls_free_skb(sk, skb); + buffers_freed++; hws->copied_seq = 0; if (copied >= target && !skb_peek(&sk->sk_receive_queue)) -- cgit v1.2.3 From 6e88f01206edab0e5bc105d8f35fac10f4ee14c5 Mon Sep 17 00:00:00 2001 From: Jia He Date: Fri, 8 Jun 2018 15:41:44 +0800 Subject: crypto: arm64/aes-blk - fix and move skcipher_walk_done out of kernel_neon_begin, _end In a arm64 server(QDF2400),I met a similar might-sleep warning as [1]: [ 7.019116] BUG: sleeping function called from invalid context at ./include/crypto/algapi.h:416 [ 7.027863] in_atomic(): 1, irqs_disabled(): 0, pid: 410, name: cryptomgr_test [ 7.035106] 1 lock held by cryptomgr_test/410: [ 7.039549] #0: (ptrval) (&drbg->drbg_mutex){+.+.}, at: drbg_instantiate+0x34/0x398 [ 7.048038] CPU: 9 PID: 410 Comm: cryptomgr_test Not tainted 4.17.0-rc6+ #27 [ 7.068228] dump_backtrace+0x0/0x1c0 [ 7.071890] show_stack+0x24/0x30 [ 7.075208] dump_stack+0xb0/0xec [ 7.078523] ___might_sleep+0x160/0x238 [ 7.082360] skcipher_walk_done+0x118/0x2c8 [ 7.086545] ctr_encrypt+0x98/0x130 [ 7.090035] simd_skcipher_encrypt+0x68/0xc0 [ 7.094304] drbg_kcapi_sym_ctr+0xd4/0x1f8 [ 7.098400] drbg_ctr_update+0x98/0x330 [ 7.102236] drbg_seed+0x1b8/0x2f0 [ 7.105637] drbg_instantiate+0x2ac/0x398 [ 7.109646] drbg_kcapi_seed+0xbc/0x188 [ 7.113482] crypto_rng_reset+0x4c/0xb0 [ 7.117319] alg_test_drbg+0xec/0x330 [ 7.120981] alg_test.part.6+0x1c8/0x3c8 [ 7.124903] alg_test+0x58/0xa0 [ 7.128044] cryptomgr_test+0x50/0x58 [ 7.131708] kthread+0x134/0x138 [ 7.134936] ret_from_fork+0x10/0x1c Seems there is a bug in Ard Biesheuvel's commit. Fixes: 683381747270 ("crypto: arm64/aes-blk - move kernel mode neon en/disable into loop") [1] https://www.spinics.net/lists/linux-crypto/msg33103.html Signed-off-by: jia.he@hxt-semitech.com Acked-by: Ard Biesheuvel Cc: # 4.17 Acked-by: Will Deacon Signed-off-by: Herbert Xu --- arch/arm64/crypto/aes-glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 253188fb8cb0..e3e50950a863 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -223,8 +223,8 @@ static int ctr_encrypt(struct skcipher_request *req) kernel_neon_begin(); aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr, (u8 *)ctx->key_enc, rounds, blocks, walk.iv); - err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); kernel_neon_end(); + err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE); } if (walk.nbytes) { u8 __aligned(8) tail[AES_BLOCK_SIZE]; -- cgit v1.2.3 From f044a84e040b85cd609851ac88ae8b54b2cc0b75 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Fri, 8 Jun 2018 11:53:41 +0200 Subject: crypto: don't optimize keccakf() keccakf() is the only function in kernel that uses __optimize() macro. __optimize() breaks frame pointer unwinder as optimized code uses RBP, and amusingly this always lead to degraded performance as gcc does not inline across different optimizations levels, so keccakf() wasn't inlined into its callers and keccakf_round() wasn't inlined into keccakf(). Drop __optimize() to resolve both problems. Signed-off-by: Dmitry Vyukov Fixes: 83dee2ce1ae7 ("crypto: sha3-generic - rewrite KECCAK transform to help the compiler optimize") Reported-by: syzbot+37035ccfa9a0a017ffcf@syzkaller.appspotmail.com Reported-by: syzbot+e073e4740cfbb3ae200b@syzkaller.appspotmail.com Cc: linux-crypto@vger.kernel.org Cc: "David S. Miller" Cc: Herbert Xu Cc: Ard Biesheuvel Acked-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/sha3_generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/sha3_generic.c b/crypto/sha3_generic.c index 264ec12c0b9c..7f6735d9003f 100644 --- a/crypto/sha3_generic.c +++ b/crypto/sha3_generic.c @@ -152,7 +152,7 @@ static SHA3_INLINE void keccakf_round(u64 st[25]) st[24] ^= bc[ 4]; } -static void __optimize("O3") keccakf(u64 st[25]) +static void keccakf(u64 st[25]) { int round; -- cgit v1.2.3 From a81ae8095712d1513fe8d58527c92c439b43233e Mon Sep 17 00:00:00 2001 From: Ondrej Mosnáček Date: Wed, 13 Jun 2018 16:44:17 +0200 Subject: crypto: morus640 - Fix out-of-bounds access We must load the block from the temporary variable here, not directly from the input. Also add forgotten zeroing-out of the uninitialized part of the temporary block (as is done correctly in morus1280.c). Fixes: 396be41f16fd ("crypto: morus - Add generic MORUS AEAD implementations") Reported-by: syzbot+1fafa9c4cf42df33f716@syzkaller.appspotmail.com Reported-by: syzbot+d82643ba80bf6937cd44@syzkaller.appspotmail.com Signed-off-by: Ondrej Mosnacek Signed-off-by: Herbert Xu --- crypto/morus640.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crypto/morus640.c b/crypto/morus640.c index 9fbcde307daf..5eede3749e64 100644 --- a/crypto/morus640.c +++ b/crypto/morus640.c @@ -274,8 +274,9 @@ static void crypto_morus640_decrypt_chunk(struct morus640_state *state, u8 *dst, union morus640_block_in tail; memcpy(tail.bytes, src, size); + memset(tail.bytes + size, 0, MORUS640_BLOCK_SIZE - size); - crypto_morus640_load_a(&m, src); + crypto_morus640_load_a(&m, tail.bytes); crypto_morus640_core(state, &m); crypto_morus640_store_a(tail.bytes, &m); memset(tail.bytes + size, 0, MORUS640_BLOCK_SIZE - size); -- cgit v1.2.3 From 837bf7cc3b7504385ae0e829c72e470dfc27cf6c Mon Sep 17 00:00:00 2001 From: Michael Büsch Date: Thu, 14 Jun 2018 20:08:11 +0200 Subject: hwrng: core - Always drop the RNG in hwrng_unregister() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit enable_best_rng() is used in hwrng_unregister() to switch away from the currently active RNG, if that is the one currently being removed. However enable_best_rng() might fail, if the next RNG's init routine fails. In that case enable_best_rng() will return an error code and the currently active RNG will remain active. After unregistering this might lead to crashes due to use-after-free. Fix this by dropping the currently active RNG, if enable_best_rng() failed. This will result in no RNG to be active, if the next-best one failed to initialize. This problem was introduced by 142a27f0a731ddcf467546960a5585970ca98e21 Fixes: 142a27f0a731 ("hwrng: core - Reset user selected rng by...") Reported-by: Wirz Tested-by: Wirz Signed-off-by: Michael Büsch Cc: stable@vger.kernel.org Signed-off-by: Herbert Xu --- drivers/char/hw_random/core.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 91bb98c42a1c..aaf9e5afaad4 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -516,11 +516,18 @@ EXPORT_SYMBOL_GPL(hwrng_register); void hwrng_unregister(struct hwrng *rng) { + int err; + mutex_lock(&rng_mutex); list_del(&rng->list); - if (current_rng == rng) - enable_best_rng(); + if (current_rng == rng) { + err = enable_best_rng(); + if (err) { + drop_current_rng(); + cur_rng_set_by_user = 0; + } + } if (list_empty(&rng_list)) { mutex_unlock(&rng_mutex); -- cgit v1.2.3 From 7d1982b4e335c1b184406b7566f6041bfe313c35 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 15 Jun 2018 02:30:47 +0200 Subject: bpf: fix panic in prog load calls cleanup While testing I found that when hitting error path in bpf_prog_load() where we jump to free_used_maps and prog contained BPF to BPF calls that were JITed earlier, then we never clean up the bpf_prog_kallsyms_add() done under jit_subprogs(). Add proper API to make BPF kallsyms deletion more clear and fix that. Fixes: 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs") Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 +++ kernel/bpf/core.c | 14 ++++++++++++++ kernel/bpf/syscall.c | 8 ++------ 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 45fc0f5000d8..297c56fa9cee 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -961,6 +961,9 @@ static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp) } #endif /* CONFIG_BPF_JIT */ +void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp); +void bpf_prog_kallsyms_del_all(struct bpf_prog *fp); + #define BPF_ANC BIT(15) static inline bool bpf_needs_clear_a(const struct sock_filter *first) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9f1493705f40..1061968adcc1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -350,6 +350,20 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, return prog_adj; } +void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) +{ + int i; + + for (i = 0; i < fp->aux->func_cnt; i++) + bpf_prog_kallsyms_del(fp->aux->func[i]); +} + +void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) +{ + bpf_prog_kallsyms_del_subprogs(fp); + bpf_prog_kallsyms_del(fp); +} + #ifdef CONFIG_BPF_JIT /* All BPF JIT sysctl knobs here. */ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0fa20624707f..0f62692fe635 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1034,14 +1034,9 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { - int i; - /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); - - for (i = 0; i < prog->aux->func_cnt; i++) - bpf_prog_kallsyms_del(prog->aux->func[i]); - bpf_prog_kallsyms_del(prog); + bpf_prog_kallsyms_del_all(prog); call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } @@ -1384,6 +1379,7 @@ static int bpf_prog_load(union bpf_attr *attr) return err; free_used_maps: + bpf_prog_kallsyms_del_subprogs(prog); free_used_maps(prog->aux); free_prog: bpf_prog_uncharge_memlock(prog); -- cgit v1.2.3 From 9facc336876f7ecf9edba4c67b90426fde4ec898 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 15 Jun 2018 02:30:48 +0200 Subject: bpf: reject any prog that failed read-only lock We currently lock any JITed image as read-only via bpf_jit_binary_lock_ro() as well as the BPF image as read-only through bpf_prog_lock_ro(). In the case any of these would fail we throw a WARN_ON_ONCE() in order to yell loudly to the log. Perhaps, to some extend, this may be comparable to an allocation where __GFP_NOWARN is explicitly not set. Added via 65869a47f348 ("bpf: improve read-only handling"), this behavior is slightly different compared to any of the other in-kernel set_memory_ro() users who do not check the return code of set_memory_ro() and friends /at all/ (e.g. in the case of module_enable_ro() / module_disable_ro()). Given in BPF this is mandatory hardening step, we want to know whether there are any issues that would leave both BPF data writable. So it happens that syzkaller enabled fault injection and it triggered memory allocation failure deep inside x86's change_page_attr_set_clr() which was triggered from set_memory_ro(). Now, there are two options: i) leaving everything as is, and ii) reworking the image locking code in order to have a final checkpoint out of the central bpf_prog_select_runtime() which probes whether any of the calls during prog setup weren't successful, and then bailing out with an error. Option ii) is a better approach since this additional paranoia avoids altogether leaving any potential W+X pages from BPF side in the system. Therefore, lets be strict about it, and reject programs in such unlikely occasion. While testing I noticed also that one bpf_prog_lock_ro() call was missing on the outer dummy prog in case of calls, e.g. in the destructor we call bpf_prog_free_deferred() on the main prog where we try to bpf_prog_unlock_free() the program, and since we go via bpf_prog_select_runtime() do that as well. Reported-by: syzbot+3b889862e65a98317058@syzkaller.appspotmail.com Reported-by: syzbot+9e762b52dd17e616a7a5@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 60 ++++++++++++++++++++++++++++++++------------------ kernel/bpf/core.c | 55 +++++++++++++++++++++++++++++++++++++++------ kernel/bpf/syscall.c | 4 +--- 3 files changed, 87 insertions(+), 32 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 297c56fa9cee..108f9812e196 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -469,7 +469,8 @@ struct sock_fprog_kern { }; struct bpf_binary_header { - unsigned int pages; + u16 pages; + u16 locked:1; u8 image[]; }; @@ -671,15 +672,18 @@ bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) -#ifdef CONFIG_ARCH_HAS_SET_MEMORY static inline void bpf_prog_lock_ro(struct bpf_prog *fp) { +#ifdef CONFIG_ARCH_HAS_SET_MEMORY fp->locked = 1; - WARN_ON_ONCE(set_memory_ro((unsigned long)fp, fp->pages)); + if (set_memory_ro((unsigned long)fp, fp->pages)) + fp->locked = 0; +#endif } static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) { +#ifdef CONFIG_ARCH_HAS_SET_MEMORY if (fp->locked) { WARN_ON_ONCE(set_memory_rw((unsigned long)fp, fp->pages)); /* In case set_memory_rw() fails, we want to be the first @@ -687,34 +691,30 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) */ fp->locked = 0; } +#endif } static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { - WARN_ON_ONCE(set_memory_ro((unsigned long)hdr, hdr->pages)); -} - -static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) -{ - WARN_ON_ONCE(set_memory_rw((unsigned long)hdr, hdr->pages)); -} -#else -static inline void bpf_prog_lock_ro(struct bpf_prog *fp) -{ -} - -static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) -{ -} - -static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) -{ +#ifdef CONFIG_ARCH_HAS_SET_MEMORY + hdr->locked = 1; + if (set_memory_ro((unsigned long)hdr, hdr->pages)) + hdr->locked = 0; +#endif } static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) { +#ifdef CONFIG_ARCH_HAS_SET_MEMORY + if (hdr->locked) { + WARN_ON_ONCE(set_memory_rw((unsigned long)hdr, hdr->pages)); + /* In case set_memory_rw() fails, we want to be the first + * to crash here instead of some random place later on. + */ + hdr->locked = 0; + } +#endif } -#endif /* CONFIG_ARCH_HAS_SET_MEMORY */ static inline struct bpf_binary_header * bpf_jit_binary_hdr(const struct bpf_prog *fp) @@ -725,6 +725,22 @@ bpf_jit_binary_hdr(const struct bpf_prog *fp) return (void *)addr; } +#ifdef CONFIG_ARCH_HAS_SET_MEMORY +static inline int bpf_prog_check_pages_ro_single(const struct bpf_prog *fp) +{ + if (!fp->locked) + return -ENOLCK; + if (fp->jited) { + const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); + + if (!hdr->locked) + return -ENOLCK; + } + + return 0; +} +#endif + int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); static inline int sk_filter(struct sock *sk, struct sk_buff *skb) { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1061968adcc1..a9e6c04d0f4a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -598,6 +598,8 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, bpf_fill_ill_insns(hdr, size); hdr->pages = size / PAGE_SIZE; + hdr->locked = 0; + hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -1448,6 +1450,33 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) return 0; } +static int bpf_prog_check_pages_ro_locked(const struct bpf_prog *fp) +{ +#ifdef CONFIG_ARCH_HAS_SET_MEMORY + int i, err; + + for (i = 0; i < fp->aux->func_cnt; i++) { + err = bpf_prog_check_pages_ro_single(fp->aux->func[i]); + if (err) + return err; + } + + return bpf_prog_check_pages_ro_single(fp); +#endif + return 0; +} + +static void bpf_prog_select_func(struct bpf_prog *fp) +{ +#ifndef CONFIG_BPF_JIT_ALWAYS_ON + u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); + + fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; +#else + fp->bpf_func = __bpf_prog_ret0_warn; +#endif +} + /** * bpf_prog_select_runtime - select exec runtime for BPF program * @fp: bpf_prog populated with internal BPF program @@ -1458,13 +1487,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { -#ifndef CONFIG_BPF_JIT_ALWAYS_ON - u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); + /* In case of BPF to BPF calls, verifier did all the prep + * work with regards to JITing, etc. + */ + if (fp->bpf_func) + goto finalize; - fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; -#else - fp->bpf_func = __bpf_prog_ret0_warn; -#endif + bpf_prog_select_func(fp); /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during @@ -1485,6 +1514,8 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) if (*err) return fp; } + +finalize: bpf_prog_lock_ro(fp); /* The tail call compatibility check can only be done at @@ -1493,7 +1524,17 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) * all eBPF JITs might immediately support all features. */ *err = bpf_check_tail_call(fp); - + if (*err) + return fp; + + /* Checkpoint: at this point onwards any cBPF -> eBPF or + * native eBPF program is read-only. If we failed to change + * the page attributes (e.g. allocation failure from + * splitting large pages), then reject the whole program + * in order to guarantee not ending up with any W+X pages + * from BPF side in kernel. + */ + *err = bpf_prog_check_pages_ro_locked(fp); return fp; } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0f62692fe635..35dc466641f2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1353,9 +1353,7 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_used_maps; - /* eBPF program is ready to be JITed */ - if (!prog->bpf_func) - prog = bpf_prog_select_runtime(prog, &err); + prog = bpf_prog_select_runtime(prog, &err); if (err < 0) goto free_used_maps; -- cgit v1.2.3 From 6d5fc1957989266006db6ef3dfb9159b42cf0189 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Thu, 14 Jun 2018 11:07:42 +0900 Subject: xdp: Fix handling of devmap in generic XDP Commit 67f29e07e131 ("bpf: devmap introduce dev_map_enqueue") changed the return value type of __devmap_lookup_elem() from struct net_device * to struct bpf_dtab_netdev * but forgot to modify generic XDP code accordingly. Thus generic XDP incorrectly used struct bpf_dtab_netdev where struct net_device is expected, then skb->dev was set to invalid value. v2: - Fix compiler warning without CONFIG_BPF_SYSCALL. Fixes: 67f29e07e131 ("bpf: devmap introduce dev_map_enqueue") Signed-off-by: Toshiaki Makita Acked-by: Yonghong Song Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 12 ++++++++++++ include/linux/filter.h | 16 ++++++++++++++++ kernel/bpf/devmap.c | 14 ++++++++++++++ net/core/filter.c | 21 ++++----------------- 4 files changed, 46 insertions(+), 17 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 995c3b1e59bf..7df32a3200f7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -488,12 +488,15 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); /* Map specifics */ struct xdp_buff; +struct sk_buff; struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); +int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, + struct bpf_prog *xdp_prog); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); @@ -586,6 +589,15 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, return 0; } +struct sk_buff; + +static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, + struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + return 0; +} + static inline struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) { diff --git a/include/linux/filter.h b/include/linux/filter.h index 108f9812e196..b615df57b7d5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -802,6 +803,21 @@ static inline bool bpf_dump_raw_ok(void) struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); +static inline int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, + struct net_device *fwd) +{ + unsigned int len; + + if (unlikely(!(fwd->flags & IFF_UP))) + return -ENETDOWN; + + len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; + if (skb->len > len) + return -EMSGSIZE; + + return 0; +} + /* The pair of xdp_do_redirect and xdp_do_flush_map MUST be called in the * same cpu context. Further for best results no more than a single map * for the do_redirect/do_flush pair should be used. This limitation is diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a7cc7b3494a9..642c97f6d1b8 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -345,6 +345,20 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, return bq_enqueue(dst, xdpf, dev_rx); } +int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + int err; + + err = __xdp_generic_ok_fwd_dev(skb, dst->dev); + if (unlikely(err)) + return err; + skb->dev = dst->dev; + generic_xdp_tx(skb, xdp_prog); + + return 0; +} + static void *dev_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); diff --git a/net/core/filter.c b/net/core/filter.c index 3d9ba7e5965a..e7f12e9f598c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3214,20 +3214,6 @@ err: } EXPORT_SYMBOL_GPL(xdp_do_redirect); -static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) -{ - unsigned int len; - - if (unlikely(!(fwd->flags & IFF_UP))) - return -ENETDOWN; - - len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; - if (skb->len > len) - return -EMSGSIZE; - - return 0; -} - static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, @@ -3256,10 +3242,11 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, } if (map->map_type == BPF_MAP_TYPE_DEVMAP) { - if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) + struct bpf_dtab_netdev *dst = fwd; + + err = dev_map_generic_redirect(dst, skb, xdp_prog); + if (unlikely(err)) goto err; - skb->dev = fwd; - generic_xdp_tx(skb, xdp_prog); } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { struct xdp_sock *xs = fwd; -- cgit v1.2.3 From 35e2cc1ba755cf9dbd042e308b2928c868767a98 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 15 Jun 2018 10:22:44 -0300 Subject: cifs: Use correct packet length in SMB2_TRANSFORM header In smb3_init_transform_rq(), 'orig_len' was only counting the request length, but forgot to count any data pages in the request. Writing or creating files with the 'seal' mount option was broken. In addition, do some code refactoring by exporting smb2_rqst_len() to calculate the appropriate packet size and avoid duplicating the same calculation all over the code. The start of the io vector is either the rfc1002 length (4 bytes) or a SMB2 header which is always > 4. Use this fact to check and skip the rfc1002 length if requested. Signed-off-by: Paulo Alcantara Reviewed-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 7 +++---- fs/cifs/smb2proto.h | 2 ++ fs/cifs/smbdirect.c | 19 +++++-------------- fs/cifs/transport.c | 19 ++++++++++++++----- 4 files changed, 24 insertions(+), 23 deletions(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index badcfb2f3c22..0356b5559c71 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -2485,7 +2485,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq, struct page **pages; struct smb2_transform_hdr *tr_hdr; unsigned int npages = old_rq->rq_npages; - unsigned int orig_len = 0; + unsigned int orig_len; int i; int rc = -ENOMEM; @@ -2499,9 +2499,6 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq, new_rq->rq_pagesz = old_rq->rq_pagesz; new_rq->rq_tailsz = old_rq->rq_tailsz; - for (i = 0; i < old_rq->rq_nvec; i++) - orig_len += old_rq->rq_iov[i].iov_len; - for (i = 0; i < npages; i++) { pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM); if (!pages[i]) @@ -2524,6 +2521,8 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq, if (!tr_hdr) goto err_free_iov; + orig_len = smb2_rqst_len(old_rq, false); + /* fill the 2nd iov with a transform header */ fill_transform_hdr(tr_hdr, orig_len, old_rq); new_rq->rq_iov[0].iov_base = tr_hdr; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 78371c1a6503..3ae208ac2a77 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -113,6 +113,8 @@ extern int smb2_unlock_range(struct cifsFileInfo *cfile, extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile); extern void smb2_reconnect_server(struct work_struct *work); extern int smb3_crypto_aead_allocate(struct TCP_Server_Info *server); +extern unsigned long +smb2_rqst_len(struct smb_rqst *rqst, bool skip_rfc1002_marker); /* * SMB2 Worker functions - most of protocol specific implementation details diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c index e459c97151b3..6fd94d9ffac2 100644 --- a/fs/cifs/smbdirect.c +++ b/fs/cifs/smbdirect.c @@ -18,6 +18,7 @@ #include "smbdirect.h" #include "cifs_debug.h" #include "cifsproto.h" +#include "smb2proto.h" static struct smbd_response *get_empty_queue_buffer( struct smbd_connection *info); @@ -2087,7 +2088,7 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) struct kvec vec; int nvecs; int size; - unsigned int buflen = 0, remaining_data_length; + unsigned int buflen, remaining_data_length; int start, i, j; int max_iov_size = info->max_send_size - sizeof(struct smbd_data_transfer); @@ -2111,25 +2112,13 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) log_write(ERR, "expected the pdu length in 1st iov, but got %zu\n", rqst->rq_iov[0].iov_len); return -EINVAL; } - iov = &rqst->rq_iov[1]; - - /* total up iov array first */ - for (i = 0; i < rqst->rq_nvec-1; i++) { - buflen += iov[i].iov_len; - } /* * Add in the page array if there is one. The caller needs to set * rq_tailsz to PAGE_SIZE when the buffer has multiple pages and * ends at page boundary */ - if (rqst->rq_npages) { - if (rqst->rq_npages == 1) - buflen += rqst->rq_tailsz; - else - buflen += rqst->rq_pagesz * (rqst->rq_npages - 1) - - rqst->rq_offset + rqst->rq_tailsz; - } + buflen = smb2_rqst_len(rqst, true); if (buflen + sizeof(struct smbd_data_transfer) > info->max_fragmented_send_size) { @@ -2139,6 +2128,8 @@ int smbd_send(struct smbd_connection *info, struct smb_rqst *rqst) goto done; } + iov = &rqst->rq_iov[1]; + cifs_dbg(FYI, "Sending smb (RDMA): smb_len=%u\n", buflen); for (i = 0; i < rqst->rq_nvec-1; i++) dump_smb(iov[i].iov_base, iov[i].iov_len); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index a3ea42a4cb98..fb57dfbfb749 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -201,15 +201,24 @@ smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg, return 0; } -static unsigned long -smb2_rqst_len(struct smb_rqst *rqst) +unsigned long +smb2_rqst_len(struct smb_rqst *rqst, bool skip_rfc1002_marker) { unsigned int i; - struct kvec *iov = rqst->rq_iov; + struct kvec *iov; + int nvec; unsigned long buflen = 0; + if (skip_rfc1002_marker && rqst->rq_iov[0].iov_len == 4) { + iov = &rqst->rq_iov[1]; + nvec = rqst->rq_nvec - 1; + } else { + iov = rqst->rq_iov; + nvec = rqst->rq_nvec; + } + /* total up iov array first */ - for (i = 0; i < rqst->rq_nvec; i++) + for (i = 0; i < nvec; i++) buflen += iov[i].iov_len; /* @@ -262,7 +271,7 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, (char *)&val, sizeof(val)); for (j = 0; j < num_rqst; j++) - send_length += smb2_rqst_len(&rqst[j]); + send_length += smb2_rqst_len(&rqst[j], true); rfc1002_marker = cpu_to_be32(send_length); /* Generate a rfc1002 marker for SMB2+ */ -- cgit v1.2.3 From 83ffdeadb46b61580c4c9a5319bd76d258a2963d Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Fri, 15 Jun 2018 15:58:00 -0300 Subject: cifs: Fix invalid check in __cifs_calc_signature() The following check would never evaluate to true: > if (i == 0 && iov[0].iov_len <= 4) Because 'i' always starts at 1. This patch fixes it and also move the header checks outside the for loop - which makes more sense. Signed-off-by: Paulo Alcantara Signed-off-by: Steve French --- fs/cifs/cifsencrypt.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index f23ff848b158..ee2a8ec70056 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -48,26 +48,23 @@ int __cifs_calc_signature(struct smb_rqst *rqst, /* iov[0] is actual data and not the rfc1002 length for SMB2+ */ if (is_smb2) { - rc = crypto_shash_update(shash, - iov[0].iov_base, iov[0].iov_len); + if (iov[0].iov_len <= 4) + return -EIO; + i = 0; } else { if (n_vec < 2 || iov[0].iov_len != 4) return -EIO; + i = 1; /* skip rfc1002 length */ } - for (i = 1; i < n_vec; i++) { + for (; i < n_vec; i++) { if (iov[i].iov_len == 0) continue; if (iov[i].iov_base == NULL) { cifs_dbg(VFS, "null iovec entry\n"); return -EIO; } - if (is_smb2) { - if (i == 0 && iov[0].iov_len <= 4) - break; /* nothing to sign or corrupt header */ - } else - if (i == 1 && iov[1].iov_len <= 4) - break; /* nothing to sign or corrupt header */ + rc = crypto_shash_update(shash, iov[i].iov_base, iov[i].iov_len); if (rc) { -- cgit v1.2.3 From 9bbe60a67be5a1c6f79b3c9be5003481a50529ff Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 16 Jun 2018 11:55:44 +0100 Subject: atm: Preserve value of skb->truesize when accounting to vcc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ATM accounts for in-flight TX packets in sk_wmem_alloc of the VCC on which they are to be sent. But it doesn't take ownership of those packets from the sock (if any) which originally owned them. They should remain owned by their actual sender until they've left the box. There's a hack in pskb_expand_head() to avoid adjusting skb->truesize for certain skbs, precisely to avoid messing up sk_wmem_alloc accounting. Ideally that hack would cover the ATM use case too, but it doesn't — skbs which aren't owned by any sock, for example PPP control frames, still get their truesize adjusted when the low-level ATM driver adds headroom. This has always been an issue, it seems. The truesize of a packet increases, and sk_wmem_alloc on the VCC goes negative. But this wasn't for normal traffic, only for control frames. So I think we just got away with it, and we probably needed to send 2GiB of LCP echo frames before the misaccounting would ever have caused a problem and caused atm_may_send() to start refusing packets. Commit 14afee4b609 ("net: convert sock.sk_wmem_alloc from atomic_t to refcount_t") did exactly what it was intended to do, and turned this mostly-theoretical problem into a real one, causing PPPoATM to fail immediately as sk_wmem_alloc underflows and atm_may_send() *immediately* starts refusing to allow new packets. The least intrusive solution to this problem is to stash the value of skb->truesize that was accounted to the VCC, in a new member of the ATM_SKB(skb) structure. Then in atm_pop_raw() subtract precisely that value instead of the then-current value of skb->truesize. Fixes: 158f323b9868 ("net: adjust skb->truesize in pskb_expand_head()") Signed-off-by: David Woodhouse Tested-by: Kevin Darbyshire-Bryant Signed-off-by: David S. Miller --- include/linux/atmdev.h | 15 +++++++++++++++ net/atm/br2684.c | 3 +-- net/atm/clip.c | 3 +-- net/atm/common.c | 3 +-- net/atm/lec.c | 3 +-- net/atm/mpc.c | 3 +-- net/atm/pppoatm.c | 3 +-- net/atm/raw.c | 4 ++-- 8 files changed, 23 insertions(+), 14 deletions(-) diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h index 0c27515d2cf6..8124815eb121 100644 --- a/include/linux/atmdev.h +++ b/include/linux/atmdev.h @@ -214,6 +214,7 @@ struct atmphy_ops { struct atm_skb_data { struct atm_vcc *vcc; /* ATM VCC */ unsigned long atm_options; /* ATM layer options */ + unsigned int acct_truesize; /* truesize accounted to vcc */ }; #define VCC_HTABLE_SIZE 32 @@ -241,6 +242,20 @@ void vcc_insert_socket(struct sock *sk); void atm_dev_release_vccs(struct atm_dev *dev); +static inline void atm_account_tx(struct atm_vcc *vcc, struct sk_buff *skb) +{ + /* + * Because ATM skbs may not belong to a sock (and we don't + * necessarily want to), skb->truesize may be adjusted, + * escaping the hack in pskb_expand_head() which avoids + * doing so for some cases. So stash the value of truesize + * at the time we accounted it, and atm_pop_raw() can use + * that value later, in case it changes. + */ + refcount_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc); + ATM_SKB(skb)->acct_truesize = skb->truesize; + ATM_SKB(skb)->atm_options = vcc->atm_options; +} static inline void atm_force_charge(struct atm_vcc *vcc,int truesize) { diff --git a/net/atm/br2684.c b/net/atm/br2684.c index 36b3adacc0dd..10462de734ea 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -252,8 +252,7 @@ static int br2684_xmit_vcc(struct sk_buff *skb, struct net_device *dev, ATM_SKB(skb)->vcc = atmvcc = brvcc->atmvcc; pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, atmvcc, atmvcc->dev); - refcount_add(skb->truesize, &sk_atm(atmvcc)->sk_wmem_alloc); - ATM_SKB(skb)->atm_options = atmvcc->atm_options; + atm_account_tx(atmvcc, skb); dev->stats.tx_packets++; dev->stats.tx_bytes += skb->len; diff --git a/net/atm/clip.c b/net/atm/clip.c index 66caa48a27c2..d795b9c5aea4 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -381,8 +381,7 @@ static netdev_tx_t clip_start_xmit(struct sk_buff *skb, memcpy(here, llc_oui, sizeof(llc_oui)); ((__be16 *) here)[3] = skb->protocol; } - refcount_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc); - ATM_SKB(skb)->atm_options = vcc->atm_options; + atm_account_tx(vcc, skb); entry->vccs->last_use = jiffies; pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, vcc, vcc->dev); old = xchg(&entry->vccs->xoff, 1); /* assume XOFF ... */ diff --git a/net/atm/common.c b/net/atm/common.c index 1f2af59935db..ff5748b2190f 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -630,10 +630,9 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size) goto out; } pr_debug("%d += %d\n", sk_wmem_alloc_get(sk), skb->truesize); - refcount_add(skb->truesize, &sk->sk_wmem_alloc); + atm_account_tx(vcc, skb); skb->dev = NULL; /* for paths shared with net_device interfaces */ - ATM_SKB(skb)->atm_options = vcc->atm_options; if (!copy_from_iter_full(skb_put(skb, size), size, &m->msg_iter)) { kfree_skb(skb); error = -EFAULT; diff --git a/net/atm/lec.c b/net/atm/lec.c index 5a95fcf6f9b6..d7f5cf5b7594 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -182,9 +182,8 @@ lec_send(struct atm_vcc *vcc, struct sk_buff *skb) struct net_device *dev = skb->dev; ATM_SKB(skb)->vcc = vcc; - ATM_SKB(skb)->atm_options = vcc->atm_options; + atm_account_tx(vcc, skb); - refcount_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc); if (vcc->send(vcc, skb) < 0) { dev->stats.tx_dropped++; return; diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 75620c2f2617..24b53c4c39c6 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -555,8 +555,7 @@ static int send_via_shortcut(struct sk_buff *skb, struct mpoa_client *mpc) sizeof(struct llc_snap_hdr)); } - refcount_add(skb->truesize, &sk_atm(entry->shortcut)->sk_wmem_alloc); - ATM_SKB(skb)->atm_options = entry->shortcut->atm_options; + atm_account_tx(entry->shortcut, skb); entry->shortcut->send(entry->shortcut, skb); entry->packets_fwded++; mpc->in_ops->put(entry); diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c index 21d9d341a619..af8c4b38b746 100644 --- a/net/atm/pppoatm.c +++ b/net/atm/pppoatm.c @@ -350,8 +350,7 @@ static int pppoatm_send(struct ppp_channel *chan, struct sk_buff *skb) return 1; } - refcount_add(skb->truesize, &sk_atm(ATM_SKB(skb)->vcc)->sk_wmem_alloc); - ATM_SKB(skb)->atm_options = ATM_SKB(skb)->vcc->atm_options; + atm_account_tx(vcc, skb); pr_debug("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, ATM_SKB(skb)->vcc, ATM_SKB(skb)->vcc->dev); ret = ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb) diff --git a/net/atm/raw.c b/net/atm/raw.c index ee10e8d46185..b3ba44aab0ee 100644 --- a/net/atm/raw.c +++ b/net/atm/raw.c @@ -35,8 +35,8 @@ static void atm_pop_raw(struct atm_vcc *vcc, struct sk_buff *skb) struct sock *sk = sk_atm(vcc); pr_debug("(%d) %d -= %d\n", - vcc->vci, sk_wmem_alloc_get(sk), skb->truesize); - WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)); + vcc->vci, sk_wmem_alloc_get(sk), ATM_SKB(skb)->acct_truesize); + WARN_ON(refcount_sub_and_test(ATM_SKB(skb)->acct_truesize, &sk->sk_wmem_alloc)); dev_kfree_skb_any(skb); sk->sk_write_space(sk); } -- cgit v1.2.3 From a9122886d9848d00a01888116a58624b9ba95cdc Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 17 Jun 2018 08:38:55 +0900 Subject: bluetooth: hci_nokia: Don't include linux/unaligned/le_struct.h directly. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This breaks the build as this header is not meant to be used in this way. ./include/linux/unaligned/access_ok.h:8:28: error: redefinition of ‘get_unaligned_le16’ static __always_inline u16 get_unaligned_le16(const void *p) ^~~~~~~~~~~~~~~~~~ In file included from drivers/bluetooth/hci_nokia.c:32: ./include/linux/unaligned/le_struct.h:7:19: note: previous definition of ‘get_unaligned_le16’ was here static inline u16 get_unaligned_le16(const void *p) Use asm/unaligned.h instead. Signed-off-by: David S. Miller --- drivers/bluetooth/hci_nokia.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/bluetooth/hci_nokia.c b/drivers/bluetooth/hci_nokia.c index 14d159e2042d..2dc33e65d2d0 100644 --- a/drivers/bluetooth/hci_nokia.c +++ b/drivers/bluetooth/hci_nokia.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include -- cgit v1.2.3 From 536e0019b7da4eb3badb4da5acbb70ae29e1b5ef Mon Sep 17 00:00:00 2001 From: Helge Eichelberg Date: Tue, 5 Jun 2018 19:38:32 +0200 Subject: hwmon: (dell-smm) Disable fan support for Dell XPS13 9333 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calling fan related SMM functions implemented by Dell BIOS firmware on Dell XPS13 9333 freeze kernel for about 500ms. Until Dell fixes it we need to disable fan support for Dell XPS13 9333. Via "force" module param fan support can be enabled. Link: https://bugzilla.kernel.org/show_bug.cgi?id=195751 Signed-off-by: Helge Eichelberg Reviewed-by: Pali Rohár Signed-off-by: Guenter Roeck --- drivers/hwmon/dell-smm-hwmon.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/hwmon/dell-smm-hwmon.c b/drivers/hwmon/dell-smm-hwmon.c index bf3bb7e1adab..9d3ef879dc51 100644 --- a/drivers/hwmon/dell-smm-hwmon.c +++ b/drivers/hwmon/dell-smm-hwmon.c @@ -1074,6 +1074,13 @@ static struct dmi_system_id i8k_blacklist_fan_support_dmi_table[] __initdata = { DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "Vostro 3360"), }, }, + { + .ident = "Dell XPS13 9333", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "XPS13 9333"), + }, + }, { } }; -- cgit v1.2.3 From 91bb8f45f73f19a0150c233c0f11cdeb6d71d1e9 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 12 Jun 2018 15:19:35 -0700 Subject: hwmon: (nct6775) Fix loop limit Commit cc66b3038254 ("hwmon: (nct6775) Rework temperature source and label handling") changed a loop limit from "data->temp_label_num - 1" to "32", as part of moving from a string array to a bit mask. This results in the following error, reported by UBSAN. UBSAN: Undefined behaviour in drivers/hwmon/nct6775.c:4179:27 shift exponent 32 is too large for 32-bit type 'long unsigned int' Similar to the original loop, the limit has to be one less than the number of bits. Fixes: cc66b3038254 ("hwmon: (nct6775) Rework temperature source and label handling") Reported-by: Paul Menzel Cc: Paul Menzel Tested-by: Paul Menzel Signed-off-by: Guenter Roeck --- drivers/hwmon/nct6775.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/nct6775.c b/drivers/hwmon/nct6775.c index 155d4d1d1585..f9d1349c3286 100644 --- a/drivers/hwmon/nct6775.c +++ b/drivers/hwmon/nct6775.c @@ -4175,7 +4175,7 @@ static int nct6775_probe(struct platform_device *pdev) * The temperature is already monitored if the respective bit in * is set. */ - for (i = 0; i < 32; i++) { + for (i = 0; i < 31; i++) { if (!(data->temp_mask & BIT(i + 1))) continue; if (!reg_temp_alternate[i]) -- cgit v1.2.3 From 7e85dc8cb35abf16455f1511f0670b57c1a84608 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 15 Jun 2018 13:27:31 +0300 Subject: net_sched: blackhole: tell upper qdisc about dropped packets When blackhole is used on top of classful qdisc like hfsc it breaks qlen and backlog counters because packets are disappear without notice. In HFSC non-zero qlen while all classes are inactive triggers warning: WARNING: ... at net/sched/sch_hfsc.c:1393 hfsc_dequeue+0xba4/0xe90 [sch_hfsc] and schedules watchdog work endlessly. This patch return __NET_XMIT_BYPASS in addition to NET_XMIT_SUCCESS, this flag tells upper layer: this packet is gone and isn't queued. Signed-off-by: Konstantin Khlebnikov Signed-off-by: David S. Miller --- net/sched/sch_blackhole.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_blackhole.c b/net/sched/sch_blackhole.c index c98a61e980ba..9c4c2bb547d7 100644 --- a/net/sched/sch_blackhole.c +++ b/net/sched/sch_blackhole.c @@ -21,7 +21,7 @@ static int blackhole_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { qdisc_drop(skb, sch, to_free); - return NET_XMIT_SUCCESS; + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } static struct sk_buff *blackhole_dequeue(struct Qdisc *sch) -- cgit v1.2.3 From 9fcf2b3c1c0276650fea537c71b513d27d929b05 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Sun, 17 Jun 2018 10:48:22 +0200 Subject: drm/atmel-hlcdc: check stride values in the first plane The statement always evaluates to true since the struct fields are arrays. This has shown up as a warning when compiling with clang: warning: address of array 'desc->layout.xstride' will always evaluate to 'true' [-Wpointer-bool-conversion] Check for values in the first plane instead. Fixes: 1a396789f65a ("drm: add Atmel HLCDC Display Controller support") Cc: Signed-off-by: Stefan Agner Signed-off-by: Boris Brezillon Link: https://patchwork.freedesktop.org/patch/msgid/20180617084826.31885-1-stefan@agner.ch --- drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_plane.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_plane.c b/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_plane.c index e18800ed7cd1..7b8191eae68a 100644 --- a/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_plane.c +++ b/drivers/gpu/drm/atmel-hlcdc/atmel_hlcdc_plane.c @@ -875,7 +875,7 @@ static int atmel_hlcdc_plane_init_properties(struct atmel_hlcdc_plane *plane, drm_object_attach_property(&plane->base.base, props->alpha, 255); - if (desc->layout.xstride && desc->layout.pstride) { + if (desc->layout.xstride[0] && desc->layout.pstride[0]) { int ret; ret = drm_plane_create_rotation_property(&plane->base, -- cgit v1.2.3 From b23908d3c48a37c46c6a26df2cdeab1610b360ba Mon Sep 17 00:00:00 2001 From: Simon Glass Date: Sun, 17 Jun 2018 14:09:42 +0200 Subject: firmware: dmi: Add access to the SKU ID string This is used in some systems from user space for determining the identity of the device. Expose this as a file so that that user-space tools don't need to read from /sys/firmware/dmi/tables/DMI Signed-off-by: Simon Glass Signed-off-by: Jean Delvare --- drivers/firmware/dmi-id.c | 2 ++ drivers/firmware/dmi_scan.c | 1 + include/linux/mod_devicetable.h | 1 + 3 files changed, 4 insertions(+) diff --git a/drivers/firmware/dmi-id.c b/drivers/firmware/dmi-id.c index 951b6c79f166..624a11cb07e2 100644 --- a/drivers/firmware/dmi-id.c +++ b/drivers/firmware/dmi-id.c @@ -47,6 +47,7 @@ DEFINE_DMI_ATTR_WITH_SHOW(product_name, 0444, DMI_PRODUCT_NAME); DEFINE_DMI_ATTR_WITH_SHOW(product_version, 0444, DMI_PRODUCT_VERSION); DEFINE_DMI_ATTR_WITH_SHOW(product_serial, 0400, DMI_PRODUCT_SERIAL); DEFINE_DMI_ATTR_WITH_SHOW(product_uuid, 0400, DMI_PRODUCT_UUID); +DEFINE_DMI_ATTR_WITH_SHOW(product_sku, 0444, DMI_PRODUCT_SKU); DEFINE_DMI_ATTR_WITH_SHOW(product_family, 0444, DMI_PRODUCT_FAMILY); DEFINE_DMI_ATTR_WITH_SHOW(board_vendor, 0444, DMI_BOARD_VENDOR); DEFINE_DMI_ATTR_WITH_SHOW(board_name, 0444, DMI_BOARD_NAME); @@ -193,6 +194,7 @@ static void __init dmi_id_init_attr_table(void) ADD_DMI_ATTR(product_serial, DMI_PRODUCT_SERIAL); ADD_DMI_ATTR(product_uuid, DMI_PRODUCT_UUID); ADD_DMI_ATTR(product_family, DMI_PRODUCT_FAMILY); + ADD_DMI_ATTR(product_sku, DMI_PRODUCT_SKU); ADD_DMI_ATTR(board_vendor, DMI_BOARD_VENDOR); ADD_DMI_ATTR(board_name, DMI_BOARD_NAME); ADD_DMI_ATTR(board_version, DMI_BOARD_VERSION); diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index 54e66adef252..f2483548cde9 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -447,6 +447,7 @@ static void __init dmi_decode(const struct dmi_header *dm, void *dummy) dmi_save_ident(dm, DMI_PRODUCT_VERSION, 6); dmi_save_ident(dm, DMI_PRODUCT_SERIAL, 7); dmi_save_uuid(dm, DMI_PRODUCT_UUID, 8); + dmi_save_ident(dm, DMI_PRODUCT_SKU, 25); dmi_save_ident(dm, DMI_PRODUCT_FAMILY, 26); break; case 2: /* Base Board Information */ diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 7d361be2e24f..61fbfbc03e5b 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -490,6 +490,7 @@ enum dmi_field { DMI_PRODUCT_VERSION, DMI_PRODUCT_SERIAL, DMI_PRODUCT_UUID, + DMI_PRODUCT_SKU, DMI_PRODUCT_FAMILY, DMI_BOARD_VENDOR, DMI_BOARD_NAME, -- cgit v1.2.3 From 3ed1d012ac3e60e0e95cda6fbd59352ec6dcbb88 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 16 Jun 2018 17:09:41 -0700 Subject: Fix Documentation build due to rename of main.c to mtrr.c This fixes this documentation build error that is due to a file rename: Error: Cannot open file ../arch/x86/kernel/cpu/mtrr/main.c Fixes: 0afe832e55a7 ("Merge branch 'x86-cleanups-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip") Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- Documentation/core-api/kernel-api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index 8e44aea366c2..76fe2d0f5e7d 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -284,7 +284,7 @@ Resources Management MTRR Handling ------------- -.. kernel-doc:: arch/x86/kernel/cpu/mtrr/main.c +.. kernel-doc:: arch/x86/kernel/cpu/mtrr/mtrr.c :export: Security Framework -- cgit v1.2.3 From fb7298e1669ee84aa76f3483c91f1de4814aac11 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 31 May 2018 11:39:42 +0000 Subject: pinctrl: mediatek: remove redundant return value check of platform_get_resource() Remove unneeded error handling on the result of a call to platform_get_resource() when the value is passed to devm_ioremap_resource(). Signed-off-by: Wei Yongjun Signed-off-by: Linus Walleij --- drivers/pinctrl/mediatek/pinctrl-mtk-common.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/pinctrl/mediatek/pinctrl-mtk-common.c b/drivers/pinctrl/mediatek/pinctrl-mtk-common.c index b3799695d8db..16ff56f93501 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mtk-common.c +++ b/drivers/pinctrl/mediatek/pinctrl-mtk-common.c @@ -1000,11 +1000,6 @@ static int mtk_eint_init(struct mtk_pinctrl *pctl, struct platform_device *pdev) return -ENOMEM; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { - dev_err(&pdev->dev, "Unable to get eint resource\n"); - return -ENODEV; - } - pctl->eint->base = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(pctl->eint->base)) return PTR_ERR(pctl->eint->base); -- cgit v1.2.3 From bc3322bc166a2905bc91f774d7b22773dc7c063a Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Thu, 7 Jun 2018 13:51:33 -0300 Subject: pinctrl: devicetree: Fix pctldev pointer overwrite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit b89405b6102f ("pinctrl: devicetree: Fix dt_to_map_one_config handling of hogs") causes the pinctrl hog pins to not get initialized on i.MX platforms leaving them with the IOMUX settings untouched. This causes several regressions on i.MX such as: - OV5640 camera driver can not be probed anymore on imx6qdl-sabresd because the camera clock pin is in a pinctrl_hog group and since its pinctrl initialization is skipped, the camera clock is kept in GPIO functionality instead of CLK_CKO function. - Audio stopped working on imx6qdl-wandboard and imx53-qsb for the same reason. Richard Fitzgerald explains the problem: "I see the bug. If the hog node isn't a 1st level child of the pinctrl parent node it will go around the for(;;) loop again but on the first pass I overwrite pctldev with the result of get_pinctrl_dev_from_of_node() so it doesn't point to the pinctrl driver any more." Fix the issue by stashing the original pctldev so it doesn't get overwritten. Fixes: b89405b6102f ("pinctrl: devicetree: Fix dt_to_map_one_config handling of hogs") Cc: Reported-by: Mika Penttilä Reported-by: Steve Longerbeam Suggested-by: Richard Fitzgerald Signed-off-by: Fabio Estevam Reviewed-by: Dong Aisheng Reviewed-by: Richard Fitzgerald Signed-off-by: Linus Walleij --- drivers/pinctrl/devicetree.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/devicetree.c b/drivers/pinctrl/devicetree.c index b601039d6c69..c4aa411f5935 100644 --- a/drivers/pinctrl/devicetree.c +++ b/drivers/pinctrl/devicetree.c @@ -101,10 +101,11 @@ struct pinctrl_dev *of_pinctrl_get(struct device_node *np) } static int dt_to_map_one_config(struct pinctrl *p, - struct pinctrl_dev *pctldev, + struct pinctrl_dev *hog_pctldev, const char *statename, struct device_node *np_config) { + struct pinctrl_dev *pctldev = NULL; struct device_node *np_pctldev; const struct pinctrl_ops *ops; int ret; @@ -123,8 +124,10 @@ static int dt_to_map_one_config(struct pinctrl *p, return -EPROBE_DEFER; } /* If we're creating a hog we can use the passed pctldev */ - if (pctldev && (np_pctldev == p->dev->of_node)) + if (hog_pctldev && (np_pctldev == p->dev->of_node)) { + pctldev = hog_pctldev; break; + } pctldev = get_pinctrl_dev_from_of_node(np_pctldev); if (pctldev) break; -- cgit v1.2.3 From 7f57871f39912978e95db920ddbbfb2304a4bfbf Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 6 Jun 2018 14:43:38 +0100 Subject: pinctrl: single: Add allocation failure checking of saved_vals Currently saved_vals is being allocated and there is no check for failed allocation (which is more likely than normal when using GFP_ATOMIC). Fix this by checking for a failed allocation and propagating this error return down the the caller chain. Detected by CoverityScan, CID#1469841 ("Dereference null return value") Fixes: 88a1dbdec682 ("pinctrl: pinctrl-single: Add functions to save and restore pinctrl context") Signed-off-by: Colin Ian King Reviewed-by: Johan Hovold Acked-by: Tony Lindgren Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-single.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c index b3153c095199..e5647dac0818 100644 --- a/drivers/pinctrl/pinctrl-single.c +++ b/drivers/pinctrl/pinctrl-single.c @@ -1590,8 +1590,11 @@ static int pcs_save_context(struct pcs_device *pcs) mux_bytes = pcs->width / BITS_PER_BYTE; - if (!pcs->saved_vals) + if (!pcs->saved_vals) { pcs->saved_vals = devm_kzalloc(pcs->dev, pcs->size, GFP_ATOMIC); + if (!pcs->saved_vals) + return -ENOMEM; + } switch (pcs->width) { case 64: @@ -1651,8 +1654,13 @@ static int pinctrl_single_suspend(struct platform_device *pdev, if (!pcs) return -EINVAL; - if (pcs->flags & PCS_CONTEXT_LOSS_OFF) - pcs_save_context(pcs); + if (pcs->flags & PCS_CONTEXT_LOSS_OFF) { + int ret; + + ret = pcs_save_context(pcs); + if (ret < 0) + return ret; + } return pinctrl_force_sleep(pcs->pctl); } -- cgit v1.2.3 From 71fd5d07b791b90587f695f048ad82e8d4c1c67e Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 8 Jun 2018 12:05:47 +0200 Subject: pinctrl: actions: Fix uninitialized error in owl_pin_config_set() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With gcc 4.1.2: drivers/pinctrl/actions/pinctrl-owl.c: In function ‘owl_pin_config_set’: drivers/pinctrl/actions/pinctrl-owl.c:336: warning: ‘ret’ may be used uninitialized in this function Indeed, if num_configs is zero, the uninitialized value will be returned as an error code. Fix this by preinitializing it to zero. Fixes: 2242ddfbf4d699b5 ("pinctrl: actions: Add Actions S900 pinctrl driver") Signed-off-by: Geert Uytterhoeven Acked-by: Manivannan Sadhasivam Signed-off-by: Linus Walleij --- drivers/pinctrl/actions/pinctrl-owl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/actions/pinctrl-owl.c b/drivers/pinctrl/actions/pinctrl-owl.c index 76243caa08c6..b5c880b50bb3 100644 --- a/drivers/pinctrl/actions/pinctrl-owl.c +++ b/drivers/pinctrl/actions/pinctrl-owl.c @@ -333,7 +333,7 @@ static int owl_pin_config_set(struct pinctrl_dev *pctrldev, unsigned long flags; unsigned int param; u32 reg, bit, width, arg; - int ret, i; + int ret = 0, i; info = &pctrl->soc->padinfo[pin]; -- cgit v1.2.3 From 5f591543a937310e48baf0ee23680be09cdedfb8 Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Thu, 14 Jun 2018 16:55:49 +0800 Subject: pinctrl: mt7622: fix a kernel panic when pio don't work as EINT controller The function, external interrupt controller, is made as an optional to mt7622 pinctrl. But if we don't want pio behaves as an external interrupt controller, it would lead to hw->eint not be created properly and then will cause 'kernel NULL pointer' issue when gpiochip try to call .to_irq or .set_config. To fix it, check hw->eint before accessing the member. [ 1.339494] Unable to handle kernel NULL pointer dereference at virtual address 00000010 [ 1.347857] Mem abort info: [ 1.350742] ESR = 0x96000005 [ 1.353905] Exception class = DABT (current EL), IL = 32 bits [ 1.360024] SET = 0, FnV = 0 [ 1.363185] EA = 0, S1PTW = 0 [ 1.366431] Data abort info: [ 1.369405] ISV = 0, ISS = 0x00000005 [ 1.373363] CM = 0, WnR = 0 [ 1.376437] [0000000000000010] user address but active_mm is swapper [ 1.383005] Internal error: Oops: 96000005 [#1] PREEMPT SMP [ 1.388748] Modules linked in: [ 1.391897] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.16.0-rc1+ #344 [ 1.398625] Hardware name: MediaTek MT7622 RFB1 board (DT) [ 1.404279] pstate: 80000005 (Nzcv daif -PAN -UAO) [ 1.409221] pc : mtk_eint_find_irq+0x8/0x24 [ 1.413532] lr : mtk_gpio_to_irq+0x20/0x28 [ 1.417749] sp : ffffff800801baf0 [ 1.421161] x29: ffffff800801baf0 x28: ffffff8008792f40 [ 1.426637] x27: ffffff800886b000 x26: ffffff8008615620 [ 1.432113] x25: ffffffc00e4dbdc8 x24: ffffff80087b8000 [ 1.437589] x23: ffffffc00325a000 x22: ffffffc00325a010 [ 1.443066] x21: ffffffc0033dec18 x20: 00000000ffffffea [ 1.448542] x19: ffffffc00e4db800 x18: 0000000000000130 [ 1.454018] x17: 000000000000000e x16: 0000000000000007 [ 1.459494] x15: ffffff80085ee000 x14: 0000000000000001 [ 1.464970] x13: 0000000000000001 x12: 0000000000000010 [ 1.470446] x11: 0101010101010101 x10: 0000000000000880 [ 1.475922] x9 : ffffff800801b990 x8 : ffffffc0030688e0 [ 1.481399] x7 : ffffff80080c0660 x6 : ffffffc00e4dbbb0 [ 1.486875] x5 : 0000000000000000 x4 : 0000000000000000 [ 1.492351] x3 : ffffff80082a92f4 x2 : 00000000fffffffa [ 1.497826] x1 : 0000000000000051 x0 : 0000000000000000 [ 1.503305] Process swapper/0 (pid: 1, stack limit = 0x0000000054e053bd) [ 1.510210] Call trace: [ 1.512727] mtk_eint_find_irq+0x8/0x24 [ 1.516677] mtk_gpio_to_irq+0x20/0x28 [ 1.520539] gpiod_to_irq+0x48/0x60 [ 1.524135] mmc_gpiod_request_cd_irq+0x3c/0xc4 [ 1.528804] mmc_start_host+0x6c/0x8c [ 1.532575] mmc_add_host+0x58/0x7c [ 1.536168] msdc_drv_probe+0x4fc/0x67c [ 1.540121] platform_drv_probe+0x58/0xa4 [ 1.544251] driver_probe_device+0x204/0x44c [ 1.548649] __driver_attach+0x84/0xf8 [ 1.552512] bus_for_each_dev+0x68/0xa0 [ 1.556461] driver_attach+0x20/0x28 [ 1.560142] bus_add_driver+0xec/0x240 [ 1.564002] driver_register+0x98/0xe4 [ 1.567863] __platform_driver_register+0x48/0x50 [ 1.572711] mt_msdc_driver_init+0x18/0x20 [ 1.576932] do_one_initcall+0x98/0x130 [ 1.580886] kernel_init_freeable+0x13c/0x1d4 [ 1.585375] kernel_init+0x10/0xf8 [ 1.588879] ret_from_fork+0x10/0x18 [ 1.592564] Code: a8c67bfd d65f03c0 a9bf7bfd 910003fd (f9400800) [ 1.598849] ---[ end trace 4bbcb7bc30e98492 ]--- [ 1.603677] Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b [ 1.603677] cc: Kevin Hilman Cc: stable@vger.kernel.org Fixes: e6dabd38d8e7 ("pinctrl: mediatek: add EINT support to MT7622 SoC") Signed-off-by: Sean Wang Signed-off-by: Linus Walleij --- drivers/pinctrl/mediatek/pinctrl-mt7622.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/pinctrl/mediatek/pinctrl-mt7622.c b/drivers/pinctrl/mediatek/pinctrl-mt7622.c index ad6da1184c9f..e3f1ab2290fc 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mt7622.c +++ b/drivers/pinctrl/mediatek/pinctrl-mt7622.c @@ -1459,6 +1459,9 @@ static int mtk_gpio_to_irq(struct gpio_chip *chip, unsigned int offset) struct mtk_pinctrl *hw = gpiochip_get_data(chip); unsigned long eint_n; + if (!hw->eint) + return -ENOTSUPP; + eint_n = offset; return mtk_eint_find_irq(hw->eint, eint_n); @@ -1471,7 +1474,8 @@ static int mtk_gpio_set_config(struct gpio_chip *chip, unsigned int offset, unsigned long eint_n; u32 debounce; - if (pinconf_to_config_param(config) != PIN_CONFIG_INPUT_DEBOUNCE) + if (!hw->eint || + pinconf_to_config_param(config) != PIN_CONFIG_INPUT_DEBOUNCE) return -ENOTSUPP; debounce = pinconf_to_config_argument(config); -- cgit v1.2.3 From 58b3d02f066e5b1480d80bd308c545526eea3250 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Wed, 13 Jun 2018 10:16:47 +0200 Subject: Revert "drm/sun4i: Handle DRM_BUS_FLAG_PIXDATA_*EDGE" This reverts commit 2c17a4368aad2b88b68e4390c819e226cf320f70. The offending commit triggers a run-time fault when accessing the panel element of the sun4i_tcon structure when no such panel is attached. It was apparently assumed in said commit that a panel is always used with the TCON. Although it is often the case, this is not always true. For instance a bridge might be used instead of a panel. This issue was discovered using an A13-OLinuXino, that uses the TCON in RGB mode for a simple DAC-based VGA bridge. Cc: stable@vger.kernel.org Signed-off-by: Paul Kocialkowski Signed-off-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20180613081647.31183-1-paul.kocialkowski@bootlin.com --- drivers/gpu/drm/sun4i/sun4i_tcon.c | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/drivers/gpu/drm/sun4i/sun4i_tcon.c b/drivers/gpu/drm/sun4i/sun4i_tcon.c index c3d92d537240..8045871335b5 100644 --- a/drivers/gpu/drm/sun4i/sun4i_tcon.c +++ b/drivers/gpu/drm/sun4i/sun4i_tcon.c @@ -17,7 +17,6 @@ #include #include #include -#include #include @@ -350,9 +349,6 @@ static void sun4i_tcon0_mode_set_lvds(struct sun4i_tcon *tcon, static void sun4i_tcon0_mode_set_rgb(struct sun4i_tcon *tcon, const struct drm_display_mode *mode) { - struct drm_panel *panel = tcon->panel; - struct drm_connector *connector = panel->connector; - struct drm_display_info display_info = connector->display_info; unsigned int bp, hsync, vsync; u8 clk_delay; u32 val = 0; @@ -410,27 +406,6 @@ static void sun4i_tcon0_mode_set_rgb(struct sun4i_tcon *tcon, if (mode->flags & DRM_MODE_FLAG_PVSYNC) val |= SUN4I_TCON0_IO_POL_VSYNC_POSITIVE; - /* - * On A20 and similar SoCs, the only way to achieve Positive Edge - * (Rising Edge), is setting dclk clock phase to 2/3(240°). - * By default TCON works in Negative Edge(Falling Edge), - * this is why phase is set to 0 in that case. - * Unfortunately there's no way to logically invert dclk through - * IO_POL register. - * The only acceptable way to work, triple checked with scope, - * is using clock phase set to 0° for Negative Edge and set to 240° - * for Positive Edge. - * On A33 and similar SoCs there would be a 90° phase option, - * but it divides also dclk by 2. - * Following code is a way to avoid quirks all around TCON - * and DOTCLOCK drivers. - */ - if (display_info.bus_flags & DRM_BUS_FLAG_PIXDATA_POSEDGE) - clk_set_phase(tcon->dclk, 240); - - if (display_info.bus_flags & DRM_BUS_FLAG_PIXDATA_NEGEDGE) - clk_set_phase(tcon->dclk, 0); - regmap_update_bits(tcon->regs, SUN4I_TCON0_IO_POL_REG, SUN4I_TCON0_IO_POL_HSYNC_POSITIVE | SUN4I_TCON0_IO_POL_VSYNC_POSITIVE, val); -- cgit v1.2.3 From 8195a655e5ce09550aff81b2573d9b015d520cb9 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 18 Jun 2018 14:17:16 +0300 Subject: ACPI / EC: Use ec_no_wakeup on Thinkpad X1 Carbon 6th On this system EC interrupt triggers constantly kicking devices out of low power states and thus blocking power management. The system also has a PCIe root port hosting Alpine Ridge Thunderbolt controller and it never gets a chance to go to D3cold because of this. Since the power button works the same regardless if EC interrupt is enabled or not during s2idle, add a quirk for this machine that sets ec_no_wakeup=true preventing spurious wakeups. Signed-off-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/acpi/ec.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index bb94cf0731fe..442a9e24f439 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -2037,6 +2037,17 @@ static inline void acpi_ec_query_exit(void) } } +static const struct dmi_system_id acpi_ec_no_wakeup[] = { + { + .ident = "Thinkpad X1 Carbon 6th", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_NAME, "20KGS3JF01"), + }, + }, + { }, +}; + int __init acpi_ec_init(void) { int result; @@ -2047,6 +2058,15 @@ int __init acpi_ec_init(void) if (result) return result; + /* + * Disable EC wakeup on following systems to prevent periodic + * wakeup from EC GPE. + */ + if (dmi_check_system(acpi_ec_no_wakeup)) { + ec_no_wakeup = true; + pr_debug("Disabling EC wakeup on suspend-to-idle\n"); + } + /* Drivers must be started after acpi_ec_query_init() */ dsdt_fail = acpi_bus_register_driver(&acpi_ec_driver); /* -- cgit v1.2.3 From 856e7c4b619af622d56b3b454f7bec32a170ac99 Mon Sep 17 00:00:00 2001 From: "Shuah Khan (Samsung OSG)" Date: Tue, 12 Jun 2018 16:46:03 -0600 Subject: selftests: pstore: return Kselftest Skip code for skipped tests When pstore_post_reboot test gets skipped because of unmet dependencies and/or unsupported configuration, it returns 0 which is treated as a pass by the Kselftest framework. This leads to false positive result even when the test could not be run. Change it to return kselftest skip code when a test gets skipped to clearly report that the test could not be run. Kselftest framework SKIP code is 4 and the framework prints appropriate messages to indicate that the test is skipped. Signed-off-by: Shuah Khan (Samsung OSG) Reviewed-by: Kees Cook Signed-off-by: Shuah Khan (Samsung OSG) --- tools/testing/selftests/pstore/pstore_post_reboot_tests | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/pstore/pstore_post_reboot_tests b/tools/testing/selftests/pstore/pstore_post_reboot_tests index 6ccb154cb4aa..22f8df1ad7d4 100755 --- a/tools/testing/selftests/pstore/pstore_post_reboot_tests +++ b/tools/testing/selftests/pstore/pstore_post_reboot_tests @@ -7,13 +7,16 @@ # # Released under the terms of the GPL v2. +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + . ./common_tests if [ -e $REBOOT_FLAG ]; then rm $REBOOT_FLAG else prlog "pstore_crash_test has not been executed yet. we skip further tests." - exit 0 + exit $ksft_skip fi prlog -n "Mounting pstore filesystem ... " -- cgit v1.2.3 From 8781578087b8fb8829558bac96c3c24e5ba26f82 Mon Sep 17 00:00:00 2001 From: "Shuah Khan (Samsung OSG)" Date: Tue, 12 Jun 2018 17:40:31 -0600 Subject: selftests: static_keys: return Kselftest Skip code for skipped tests When static_keys test is skipped because of unmet dependencies and/or unsupported configuration, it exits with error which is treated as a fail by the Kselftest framework. This leads to false negative result even when the test could not be run. Change it to return kselftest skip code when a test gets skipped to clearly report that the test could not be run. Added an explicit searches for test_static_key_base and test_static_keys modules and return skip code if they aren't found to differentiate between the failure to load the module condition and module not found condition. Kselftest framework SKIP code is 4 and the framework prints appropriate messages to indicate that the test is skipped. Signed-off-by: Shuah Khan (Samsung OSG) --- tools/testing/selftests/static_keys/test_static_keys.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/testing/selftests/static_keys/test_static_keys.sh b/tools/testing/selftests/static_keys/test_static_keys.sh index 24cff498b31a..fc9f8cde7d42 100755 --- a/tools/testing/selftests/static_keys/test_static_keys.sh +++ b/tools/testing/selftests/static_keys/test_static_keys.sh @@ -2,6 +2,19 @@ # SPDX-License-Identifier: GPL-2.0 # Runs static keys kernel module tests +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +if ! /sbin/modprobe -q -n test_static_key_base; then + echo "static_key: module test_static_key_base is not found [SKIP]" + exit $ksft_skip +fi + +if ! /sbin/modprobe -q -n test_static_keys; then + echo "static_key: module test_static_keys is not found [SKIP]" + exit $ksft_skip +fi + if /sbin/modprobe -q test_static_key_base; then if /sbin/modprobe -q test_static_keys; then echo "static_key: ok" -- cgit v1.2.3 From c7db6ffb831fd36a03485a0d88b1e505378975ad Mon Sep 17 00:00:00 2001 From: "Shuah Khan (Samsung OSG)" Date: Tue, 12 Jun 2018 18:11:37 -0600 Subject: selftests: sysctl: return Kselftest Skip code for skipped tests When sysctl test is skipped because of unmet dependencies and/or unsupported configuration, it exits with error which is treated as a fail by the Kselftest framework. This leads to false negative result even when the test could not be run. Change it to return kselftest skip code when a test gets skipped to clearly report that the test could not be run. Changed return code to kselftest skip code in skip error legs that check requirements and module probe test error leg. Kselftest framework SKIP code is 4 and the framework prints appropriate messages to indicate that the test is skipped. Signed-off-by: Shuah Khan (Samsung OSG) Reviewed-by: Kees Cook Signed-off-by: Shuah Khan (Samsung OSG) --- tools/testing/selftests/sysctl/sysctl.sh | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh index ec232c3cfcaa..584eb8ea780a 100755 --- a/tools/testing/selftests/sysctl/sysctl.sh +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -14,6 +14,9 @@ # This performs a series tests against the proc sysctl interface. +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + TEST_NAME="sysctl" TEST_DRIVER="test_${TEST_NAME}" TEST_DIR=$(dirname $0) @@ -41,7 +44,7 @@ test_modprobe() echo "$0: $DIR not present" >&2 echo "You must have the following enabled in your kernel:" >&2 cat $TEST_DIR/config >&2 - exit 1 + exit $ksft_skip fi } @@ -98,28 +101,30 @@ test_reqs() uid=$(id -u) if [ $uid -ne 0 ]; then echo $msg must be run as root >&2 - exit 0 + exit $ksft_skip fi if ! which perl 2> /dev/null > /dev/null; then echo "$0: You need perl installed" - exit 1 + exit $ksft_skip fi if ! which getconf 2> /dev/null > /dev/null; then echo "$0: You need getconf installed" - exit 1 + exit $ksft_skip fi if ! which diff 2> /dev/null > /dev/null; then echo "$0: You need diff installed" - exit 1 + exit $ksft_skip fi } function load_req_mod() { - trap "test_modprobe" EXIT - if [ ! -d $DIR ]; then + if ! modprobe -q -n $TEST_DRIVER; then + echo "$0: module $TEST_DRIVER not found [SKIP]" + exit $ksft_skip + fi modprobe $TEST_DRIVER if [ $? -ne 0 ]; then exit @@ -765,6 +770,7 @@ function parse_args() test_reqs allow_user_defaults check_production_sysctl_writes_strict +test_modprobe load_req_mod trap "test_finish" EXIT -- cgit v1.2.3 From d7d5311d4aa9611fe1a5a851e6f75733237a668a Mon Sep 17 00:00:00 2001 From: "Shuah Khan (Samsung OSG)" Date: Wed, 13 Jun 2018 21:10:48 -0600 Subject: selftests: user: return Kselftest Skip code for skipped tests When user test is skipped because of unmet dependencies and/or unsupported configuration, it exits with error which is treated as a fail by the Kselftest framework. This leads to false negative result even when the test could not be run. Change it to return kselftest skip code when a test gets skipped to clearly report that the test could not be run. Add an explicit check for module presence and return skip code if module isn't present. Kselftest framework SKIP code is 4 and the framework prints appropriate messages to indicate that the test is skipped. Signed-off-by: Shuah Khan (Samsung OSG) --- tools/testing/selftests/user/test_user_copy.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/testing/selftests/user/test_user_copy.sh b/tools/testing/selftests/user/test_user_copy.sh index d60506fc77f8..f9b31a57439b 100755 --- a/tools/testing/selftests/user/test_user_copy.sh +++ b/tools/testing/selftests/user/test_user_copy.sh @@ -2,6 +2,13 @@ # SPDX-License-Identifier: GPL-2.0 # Runs copy_to/from_user infrastructure using test_user_copy kernel module +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +if ! /sbin/modprobe -q -n test_user_copy; then + echo "user: module test_user_copy is not found [SKIP]" + exit $ksft_skip +fi if /sbin/modprobe -q test_user_copy; then /sbin/modprobe -q -r test_user_copy echo "user_copy: ok" -- cgit v1.2.3 From 685814466bf8398192cf855415a0bb2cefc1930e Mon Sep 17 00:00:00 2001 From: "Shuah Khan (Samsung OSG)" Date: Thu, 14 Jun 2018 16:56:13 -0600 Subject: selftests: zram: return Kselftest Skip code for skipped tests When zram test is skipped because of unmet dependencies and/or unsupported configuration, it exits with error which is treated as a fail by the Kselftest framework. This leads to false negative result even when the test could not be run. Change it to return kselftest skip code when a test gets skipped to clearly report that the test could not be run. Kselftest framework SKIP code is 4 and the framework prints appropriate messages to indicate that the test is skipped. Signed-off-by: Shuah Khan (Samsung OSG) --- tools/testing/selftests/zram/zram.sh | 5 ++++- tools/testing/selftests/zram/zram_lib.sh | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/zram/zram.sh b/tools/testing/selftests/zram/zram.sh index 754de7da426a..232e958ec454 100755 --- a/tools/testing/selftests/zram/zram.sh +++ b/tools/testing/selftests/zram/zram.sh @@ -2,6 +2,9 @@ # SPDX-License-Identifier: GPL-2.0 TCID="zram.sh" +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + . ./zram_lib.sh run_zram () { @@ -24,5 +27,5 @@ elif [ -b /dev/zram0 ]; then else echo "$TCID : No zram.ko module or /dev/zram0 device file not found" echo "$TCID : CONFIG_ZRAM is not set" - exit 1 + exit $ksft_skip fi diff --git a/tools/testing/selftests/zram/zram_lib.sh b/tools/testing/selftests/zram/zram_lib.sh index f6a9c73e7a44..9e73a4fb9b0a 100755 --- a/tools/testing/selftests/zram/zram_lib.sh +++ b/tools/testing/selftests/zram/zram_lib.sh @@ -18,6 +18,9 @@ MODULE=0 dev_makeswap=-1 dev_mounted=-1 +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + trap INT check_prereqs() @@ -27,7 +30,7 @@ check_prereqs() if [ $uid -ne 0 ]; then echo $msg must be run as root >&2 - exit 0 + exit $ksft_skip fi } -- cgit v1.2.3 From a4d7537789724985cafbc9260a31ca4f2b7cf123 Mon Sep 17 00:00:00 2001 From: "Shuah Khan (Samsung OSG)" Date: Wed, 13 Jun 2018 21:31:43 -0600 Subject: selftests: vm: return Kselftest Skip code for skipped tests When vm test is skipped because of unmet dependencies and/or unsupported configuration, it exits with error which is treated as a fail by the Kselftest framework. This leads to false negative result even when the test could not be run. Change it to return kselftest skip code when a test gets skipped to clearly report that the test could not be run. Kselftest framework SKIP code is 4 and the framework prints appropriate messages to indicate that the test is skipped. Signed-off-by: Shuah Khan (Samsung OSG) Acked-by: Mike Rapoport Signed-off-by: Shuah Khan (Samsung OSG) --- tools/testing/selftests/vm/compaction_test.c | 4 +++- tools/testing/selftests/vm/mlock2-tests.c | 12 +++++++----- tools/testing/selftests/vm/run_vmtests | 5 ++++- tools/testing/selftests/vm/userfaultfd.c | 4 +++- 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/vm/compaction_test.c b/tools/testing/selftests/vm/compaction_test.c index 1097f04e4d80..bcec71250873 100644 --- a/tools/testing/selftests/vm/compaction_test.c +++ b/tools/testing/selftests/vm/compaction_test.c @@ -16,6 +16,8 @@ #include #include +#include "../kselftest.h" + #define MAP_SIZE 1048576 struct map_list { @@ -169,7 +171,7 @@ int main(int argc, char **argv) printf("Either the sysctl compact_unevictable_allowed is not\n" "set to 1 or couldn't read the proc file.\n" "Skipping the test\n"); - return 0; + return KSFT_SKIP; } lim.rlim_cur = RLIM_INFINITY; diff --git a/tools/testing/selftests/vm/mlock2-tests.c b/tools/testing/selftests/vm/mlock2-tests.c index 4997b9222cfa..637b6d0ac0d0 100644 --- a/tools/testing/selftests/vm/mlock2-tests.c +++ b/tools/testing/selftests/vm/mlock2-tests.c @@ -9,6 +9,8 @@ #include #include "mlock2.h" +#include "../kselftest.h" + struct vm_boundaries { unsigned long start; unsigned long end; @@ -303,7 +305,7 @@ static int test_mlock_lock() if (mlock2_(map, 2 * page_size, 0)) { if (errno == ENOSYS) { printf("Cannot call new mlock family, skipping test\n"); - _exit(0); + _exit(KSFT_SKIP); } perror("mlock2(0)"); goto unmap; @@ -412,7 +414,7 @@ static int test_mlock_onfault() if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { if (errno == ENOSYS) { printf("Cannot call new mlock family, skipping test\n"); - _exit(0); + _exit(KSFT_SKIP); } perror("mlock2(MLOCK_ONFAULT)"); goto unmap; @@ -425,7 +427,7 @@ static int test_mlock_onfault() if (munlock(map, 2 * page_size)) { if (errno == ENOSYS) { printf("Cannot call new mlock family, skipping test\n"); - _exit(0); + _exit(KSFT_SKIP); } perror("munlock()"); goto unmap; @@ -457,7 +459,7 @@ static int test_lock_onfault_of_present() if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { if (errno == ENOSYS) { printf("Cannot call new mlock family, skipping test\n"); - _exit(0); + _exit(KSFT_SKIP); } perror("mlock2(MLOCK_ONFAULT)"); goto unmap; @@ -583,7 +585,7 @@ static int test_vma_management(bool call_mlock) if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) { if (errno == ENOSYS) { printf("Cannot call new mlock family, skipping test\n"); - _exit(0); + _exit(KSFT_SKIP); } perror("mlock(ONFAULT)\n"); goto out; diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index 22d564673830..88cbe5575f0c 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -2,6 +2,9 @@ # SPDX-License-Identifier: GPL-2.0 #please run as root +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + mnt=./huge exitcode=0 @@ -36,7 +39,7 @@ if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then echo $(( $lackpgs + $nr_hugepgs )) > /proc/sys/vm/nr_hugepages if [ $? -ne 0 ]; then echo "Please run this test as root" - exit 1 + exit $ksft_skip fi while read name size unit; do if [ "$name" = "HugePages_Free:" ]; then diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index de2f9ec8a87f..7b8171e3128a 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -69,6 +69,8 @@ #include #include +#include "../kselftest.h" + #ifdef __NR_userfaultfd static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; @@ -1322,7 +1324,7 @@ int main(int argc, char **argv) int main(void) { printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n"); - return 0; + return KSFT_SKIP; } #endif /* __NR_userfaultfd */ -- cgit v1.2.3 From d6a3e55131fcb1e5ca1753f4b6f297a177b2fc91 Mon Sep 17 00:00:00 2001 From: Fathi Boudra Date: Thu, 14 Jun 2018 11:57:08 +0200 Subject: selftests: sync: add config fragment for testing sync framework Unless the software synchronization objects (CONFIG_SW_SYNC) is enabled, the sync test will be skipped: TAP version 13 1..0 # Skipped: Sync framework not supported by kernel Add a config fragment file to be able to run "make kselftest-merge" to enable relevant configuration required in order to run the sync test. Signed-off-by: Fathi Boudra Link: https://lkml.org/lkml/2017/5/5/14 Signed-off-by: Anders Roxell Signed-off-by: Shuah Khan (Samsung OSG) --- tools/testing/selftests/sync/config | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 tools/testing/selftests/sync/config diff --git a/tools/testing/selftests/sync/config b/tools/testing/selftests/sync/config new file mode 100644 index 000000000000..1ab7e8130db2 --- /dev/null +++ b/tools/testing/selftests/sync/config @@ -0,0 +1,4 @@ +CONFIG_STAGING=y +CONFIG_ANDROID=y +CONFIG_SYNC=y +CONFIG_SW_SYNC=y -- cgit v1.2.3 From 3c62c91a3635d6c4ef23e00b4fc84fa77bbf99cc Mon Sep 17 00:00:00 2001 From: "Shuah Khan (Samsung OSG)" Date: Tue, 12 Jun 2018 16:50:37 -0600 Subject: selftests: sparc64: Fix to do nothing on non-sparc64 sparc64 test fails with the following errors on non-sparc64 systems. Fix the Makefile to do nothing on non-sparc64 systems to suppress the errors: make run_tests adi-test.c: Assembler messages: adi-test.c:302: Error: no such instruction: `rd %tick,%r13' adi-test.c:304: Error: no such instruction: `rd %tick,%rbp' adi-test.c:190: Error: no such instruction: `rd %tick,%rbp' adi-test.c:192: Error: no such instruction: `rd %tick,%rdx' adi-test.c:273: Error: no such instruction: `rd %tick,%rbx' adi-test.c:276: Error: no such instruction: `rd %tick,%rdx' adi-test.c:217: Error: no such instruction: `rd %tick,%rbp' adi-test.c:220: Error: no such instruction: `rd %tick,%rdx' adi-test.c:79: Error: no such instruction: `rd %tick,%rax' adi-test.c:79: Error: no such instruction: `rd %tick,%rax' adi-test.c:79: Error: no such instruction: `rd %tick,%rax' adi-test.c:79: Error: no such instruction: `rd %tick,%rax' adi-test.c:246: Error: no such instruction: `rd %tick,%rbp' adi-test.c:248: Error: no such instruction: `rd %tick,%rdx' adi-test.c:79: Error: no such instruction: `rd %tick,%rax' adi-test.c:79: Error: no such instruction: `rd %tick,%rax' adi-test.c:79: Error: no such instruction: `rd %tick,%rax' : recipe for target 'adi-test' failed make[1]: *** [adi-test] Error 1 adi: [FAIL] ./drivers_test.sh: 24: ./drivers_test.sh: ./adi-test: not found ../lib.mk:73: recipe for target 'run_tests' failed make: *** [run_tests] Error 1 Signed-off-by: Shuah Khan (Samsung OSG) Reviewed-by: Tom Hromatka Acked-by: David S. Miller Signed-off-by: Shuah Khan (Samsung OSG) --- tools/testing/selftests/sparc64/Makefile | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/sparc64/Makefile b/tools/testing/selftests/sparc64/Makefile index 2082eeffd779..442f0ca45441 100644 --- a/tools/testing/selftests/sparc64/Makefile +++ b/tools/testing/selftests/sparc64/Makefile @@ -1,7 +1,17 @@ +uname_M := $(shell uname -m 2>/dev/null || echo not) +ARCH ?= $(shell echo $(uname_M) | sed -e s/x86_64/x86/) + +ifneq ($(ARCH),sparc64) +nothing: +.PHONY: all clean run_tests install +.SILENT: +else + SUBDIRS := drivers TEST_PROGS := run.sh + .PHONY: all clean include ../lib.mk @@ -44,3 +54,4 @@ override define CLEAN make OUTPUT=$$BUILD_TARGET -C $$DIR $@;\ done endef +endif -- cgit v1.2.3 From 953c9d28d2c9bca15a17b11a022fbc657cc5631a Mon Sep 17 00:00:00 2001 From: "Shuah Khan (Samsung OSG)" Date: Tue, 12 Jun 2018 17:12:24 -0600 Subject: selftests: sparc64: delete RUN_TESTS and EMIT_TESTS overrides Delete RUN_TESTS and EMIT_TESTS overrides and use common defines in lib.mk. Common defines work just fine and there is no need to define custom overrides. Signed-off-by: Shuah Khan (Samsung OSG) Reviewed-by: Tom Hromatka Signed-off-by: Shuah Khan (Samsung OSG) --- tools/testing/selftests/sparc64/Makefile | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tools/testing/selftests/sparc64/Makefile b/tools/testing/selftests/sparc64/Makefile index 442f0ca45441..76b2206932c3 100644 --- a/tools/testing/selftests/sparc64/Makefile +++ b/tools/testing/selftests/sparc64/Makefile @@ -28,10 +28,6 @@ all: fi \ done -override define RUN_TESTS - @cd $(OUTPUT); ./run.sh -endef - override define INSTALL_RULE mkdir -p $(INSTALL_PATH) install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) @@ -43,10 +39,6 @@ override define INSTALL_RULE done; endef -override define EMIT_TESTS - echo "./run.sh" -endef - override define CLEAN @for DIR in $(SUBDIRS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ -- cgit v1.2.3 From eb83d5f7d07ed913d62ca4ad1e14fb6ca4937bab Mon Sep 17 00:00:00 2001 From: "Shuah Khan (Samsung OSG)" Date: Wed, 13 Jun 2018 16:20:52 -0600 Subject: selftests: sparc64: Add missing SPDX License Identifiers Add missing SPDX License Identifiers to Makefile(s). Signed-off-by: Shuah Khan (Samsung OSG) Reviewed-by: Tom Hromatka Signed-off-by: Shuah Khan (Samsung OSG) --- tools/testing/selftests/sparc64/Makefile | 1 + tools/testing/selftests/sparc64/drivers/Makefile | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/sparc64/Makefile b/tools/testing/selftests/sparc64/Makefile index 76b2206932c3..a19531dba4dc 100644 --- a/tools/testing/selftests/sparc64/Makefile +++ b/tools/testing/selftests/sparc64/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 uname_M := $(shell uname -m 2>/dev/null || echo not) ARCH ?= $(shell echo $(uname_M) | sed -e s/x86_64/x86/) diff --git a/tools/testing/selftests/sparc64/drivers/Makefile b/tools/testing/selftests/sparc64/drivers/Makefile index 6264f40bbdbc..deb0df415565 100644 --- a/tools/testing/selftests/sparc64/drivers/Makefile +++ b/tools/testing/selftests/sparc64/drivers/Makefile @@ -1,4 +1,4 @@ - +# SPDX-License-Identifier: GPL-2.0 INCLUDEDIR := -I. CFLAGS := $(CFLAGS) $(INCLUDEDIR) -Wall -O2 -g -- cgit v1.2.3 From 7350cdd0257e73a37df57253fb9decd8effacd37 Mon Sep 17 00:00:00 2001 From: Bharat Potnuri Date: Fri, 15 Jun 2018 20:52:33 +0530 Subject: RDMA/core: Save kernel caller name when creating CQ using ib_create_cq() Few kernel applications like SCST-iSER create CQ using ib_create_cq(), where accessing CQ structures using rdma restrack tool leads to below NULL pointer dereference. This patch saves caller kernel module name similar to ib_alloc_cq(). BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] skip_spaces+0x30/0x30 PGD 738bac067 PUD 8533f0067 PMD 0 Oops: 0000 [#1] SMP R10: ffff88017fc03300 R11: 0000000000000246 R12: 0000000000000000 R13: ffff88082fa5a668 R14: ffff88017475a000 R15: 0000000000000000 FS: 00002b32726582c0(0000) GS:ffff88087fc40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000008491a1000 CR4: 00000000003607e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: [] ? fill_res_name_pid+0x7c/0x90 [ib_core] [] fill_res_cq_entry+0xef/0x170 [ib_core] [] res_get_common_dumpit+0x3c4/0x480 [ib_core] [] nldev_res_get_cq_dumpit+0x13/0x20 [ib_core] [] netlink_dump+0x117/0x2e0 [] __netlink_dump_start+0x1ab/0x230 [] ibnl_rcv_msg+0x11d/0x1f0 [ib_core] [] ? nldev_res_get_mr_dumpit+0x20/0x20 [ib_core] [] ? rdma_nl_multicast+0x30/0x30 [ib_core] [] netlink_rcv_skb+0xa9/0xc0 [] ibnl_rcv+0x98/0xb0 [ib_core] [] netlink_unicast+0xf2/0x1b0 [] netlink_sendmsg+0x31f/0x6a0 [] sock_sendmsg+0xb0/0xf0 [] ? _raw_spin_unlock_bh+0x1e/0x20 [] ? release_sock+0x118/0x170 [] SYSC_sendto+0x121/0x1c0 [] ? sock_alloc_file+0xa0/0x140 [] ? __fd_install+0x25/0x60 [] SyS_sendto+0xe/0x10 [] system_call_fastpath+0x16/0x1b RIP [] skip_spaces+0x30/0x30 RSP CR2: 0000000000000000 Cc: Fixes: f66c8ba4c9fa ("RDMA/core: Save kernel caller name when creating PD and CQ objects") Reviewed-by: Steve Wise Signed-off-by: Potnuri Bharat Teja Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 14 ++++++++------ include/rdma/ib_verbs.h | 13 ++++++++----- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 0b56828c1319..9d6beb948535 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1562,11 +1562,12 @@ EXPORT_SYMBOL(ib_destroy_qp); /* Completion queues */ -struct ib_cq *ib_create_cq(struct ib_device *device, - ib_comp_handler comp_handler, - void (*event_handler)(struct ib_event *, void *), - void *cq_context, - const struct ib_cq_init_attr *cq_attr) +struct ib_cq *__ib_create_cq(struct ib_device *device, + ib_comp_handler comp_handler, + void (*event_handler)(struct ib_event *, void *), + void *cq_context, + const struct ib_cq_init_attr *cq_attr, + const char *caller) { struct ib_cq *cq; @@ -1580,12 +1581,13 @@ struct ib_cq *ib_create_cq(struct ib_device *device, cq->cq_context = cq_context; atomic_set(&cq->usecnt, 0); cq->res.type = RDMA_RESTRACK_CQ; + cq->res.kern_name = caller; rdma_restrack_add(&cq->res); } return cq; } -EXPORT_SYMBOL(ib_create_cq); +EXPORT_SYMBOL(__ib_create_cq); int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period) { diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 2043e1a8f851..4f71d6a073ba 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -3394,11 +3394,14 @@ int ib_process_cq_direct(struct ib_cq *cq, int budget); * * Users can examine the cq structure to determine the actual CQ size. */ -struct ib_cq *ib_create_cq(struct ib_device *device, - ib_comp_handler comp_handler, - void (*event_handler)(struct ib_event *, void *), - void *cq_context, - const struct ib_cq_init_attr *cq_attr); +struct ib_cq *__ib_create_cq(struct ib_device *device, + ib_comp_handler comp_handler, + void (*event_handler)(struct ib_event *, void *), + void *cq_context, + const struct ib_cq_init_attr *cq_attr, + const char *caller); +#define ib_create_cq(device, cmp_hndlr, evt_hndlr, cq_ctxt, cq_attr) \ + __ib_create_cq((device), (cmp_hndlr), (evt_hndlr), (cq_ctxt), (cq_attr), KBUILD_MODNAME) /** * ib_resize_cq - Modifies the capacity of the CQ. -- cgit v1.2.3 From 375dc53d032fc11e98036b5f228ad13f7c5933f5 Mon Sep 17 00:00:00 2001 From: Vijay Immanuel Date: Tue, 12 Jun 2018 18:16:05 -0700 Subject: IB/rxe: Fix missing completion for mem_reg work requests Run the completer task to post a work completion after processing a memory registration or invalidate work request. This covers the case where the memory registration or invalidate was the last work request posted to the qp. Signed-off-by: Vijay Immanuel Reviewed-by: Yonatan Cohen Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_req.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 829ecb93661f..8be27238a86e 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -645,6 +645,9 @@ next_wqe: } else { goto exit; } + if ((wqe->wr.send_flags & IB_SEND_SIGNALED) || + qp->sq_sig_type == IB_SIGNAL_ALL_WR) + rxe_run_task(&qp->comp.task, 1); qp->req.wqe_index = next_index(qp->sq.queue, qp->req.wqe_index); goto next_wqe; -- cgit v1.2.3 From 326345f995a83e326fa2e01d54bfa9a6a307bd4d Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sat, 26 May 2018 19:12:51 +0200 Subject: MIPS: pb44: Fix i2c-gpio GPIO descriptor table I used bad names in my clumsiness when rewriting many board files to use GPIO descriptors instead of platform data. A few had the platform_device ID set to -1 which would indeed give the device name "i2c-gpio". But several had it set to >=0 which gives the names "i2c-gpio.0", "i2c-gpio.1" ... Fix the one affected board in the MIPS tree. Sorry. Fixes: b2e63555592f ("i2c: gpio: Convert to use descriptors") Reported-by: Simon Guinot Signed-off-by: Linus Walleij Reviewed-by: Paul Burton Cc: Ralf Baechle Cc: Wolfram Sang Cc: Simon Guinot Cc: linux-mips@linux-mips.org Cc: # 4.15+ Patchwork: https://patchwork.linux-mips.org/patch/19387/ Signed-off-by: James Hogan --- arch/mips/ath79/mach-pb44.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/ath79/mach-pb44.c b/arch/mips/ath79/mach-pb44.c index 6b2c6f3baefa..75fb96ca61db 100644 --- a/arch/mips/ath79/mach-pb44.c +++ b/arch/mips/ath79/mach-pb44.c @@ -34,7 +34,7 @@ #define PB44_KEYS_DEBOUNCE_INTERVAL (3 * PB44_KEYS_POLL_INTERVAL) static struct gpiod_lookup_table pb44_i2c_gpiod_table = { - .dev_id = "i2c-gpio", + .dev_id = "i2c-gpio.0", .table = { GPIO_LOOKUP_IDX("ath79-gpio", PB44_GPIO_I2C_SDA, NULL, 0, GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN), -- cgit v1.2.3 From 2a027b47dba6b77ab8c8e47b589ae9bbc5ac6175 Mon Sep 17 00:00:00 2001 From: Tokunori Ikegami Date: Sun, 3 Jun 2018 23:02:01 +0900 Subject: MIPS: BCM47XX: Enable 74K Core ExternalSync for PCIe erratum MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The erratum and workaround are described by BCM5300X-ES300-RDS.pdf as below. R10: PCIe Transactions Periodically Fail Description: The BCM5300X PCIe does not maintain transaction ordering. This may cause PCIe transaction failure. Fix Comment: Add a dummy PCIe configuration read after a PCIe configuration write to ensure PCIe configuration access ordering. Set ES bit of CP0 configu7 register to enable sync function so that the sync instruction is functional. Resolution: hndpci.c: extpci_write_config() hndmips.c: si_mips_init() mipsinc.h CONF7_ES This is fixed by the CFE MIPS bcmsi chipset driver also for BCM47XX. Also the dummy PCIe configuration read is already implemented in the Linux BCMA driver. Enable ExternalSync in Config7 when CONFIG_BCMA_DRIVER_PCI_HOSTMODE=y too so that the sync instruction is externalised. Signed-off-by: Tokunori Ikegami Reviewed-by: Paul Burton Acked-by: Hauke Mehrtens Cc: Chris Packham Cc: Rafał Miłecki Cc: linux-mips@linux-mips.org Cc: stable@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/19461/ Signed-off-by: James Hogan --- arch/mips/bcm47xx/setup.c | 6 ++++++ arch/mips/include/asm/mipsregs.h | 3 +++ 2 files changed, 9 insertions(+) diff --git a/arch/mips/bcm47xx/setup.c b/arch/mips/bcm47xx/setup.c index 6054d49e608e..8c9cbf13d32a 100644 --- a/arch/mips/bcm47xx/setup.c +++ b/arch/mips/bcm47xx/setup.c @@ -212,6 +212,12 @@ static int __init bcm47xx_cpu_fixes(void) */ if (bcm47xx_bus.bcma.bus.chipinfo.id == BCMA_CHIP_ID_BCM4706) cpu_wait = NULL; + + /* + * BCM47XX Erratum "R10: PCIe Transactions Periodically Fail" + * Enable ExternalSync for sync instruction to take effect + */ + set_c0_config7(MIPS_CONF7_ES); break; #endif } diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h index ae461d91cd1f..0bc270806ec5 100644 --- a/arch/mips/include/asm/mipsregs.h +++ b/arch/mips/include/asm/mipsregs.h @@ -681,6 +681,8 @@ #define MIPS_CONF7_WII (_ULCAST_(1) << 31) #define MIPS_CONF7_RPS (_ULCAST_(1) << 2) +/* ExternalSync */ +#define MIPS_CONF7_ES (_ULCAST_(1) << 8) #define MIPS_CONF7_IAR (_ULCAST_(1) << 10) #define MIPS_CONF7_AR (_ULCAST_(1) << 16) @@ -2765,6 +2767,7 @@ __BUILD_SET_C0(status) __BUILD_SET_C0(cause) __BUILD_SET_C0(config) __BUILD_SET_C0(config5) +__BUILD_SET_C0(config7) __BUILD_SET_C0(intcontrol) __BUILD_SET_C0(intctl) __BUILD_SET_C0(srsmap) -- cgit v1.2.3 From 73c4b15eff163f633a86589c5baf071f41af26b1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 18 Jun 2018 14:41:36 -0700 Subject: MAINTAINERS: Add me as an x86 entry code maintainer And update my email address. Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andy Lutomirski Signed-off-by: Linus Torvalds --- MAINTAINERS | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 9d5eeff51b5f..624c3fd11d04 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15575,6 +15575,13 @@ S: Maintained F: Documentation/x86/ F: arch/x86/ +X86 ENTRY CODE +M: Andy Lutomirski +L: linux-kernel@vger.kernel.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/asm +S: Maintained +F: arch/x86/entry/ + X86 MCE INFRASTRUCTURE M: Tony Luck M: Borislav Petkov @@ -15597,7 +15604,7 @@ F: drivers/platform/x86/ F: drivers/platform/olpc/ X86 VDSO -M: Andy Lutomirski +M: Andy Lutomirski L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/vdso S: Maintained -- cgit v1.2.3 From 6cc22dc08a247b7b4a173e4561e39705a557d300 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 18 Jun 2018 14:15:30 -0700 Subject: revert "mm/memblock: add missing include " The patch fixed a W=1 warning but broke the ia64 build: CC mm/memblock.o mm/memblock.c:1340: error: redefinition of `memblock_virt_alloc_try_nid_raw' ./include/linux/bootmem.h:335: error: previous definition of `memblock_virt_alloc_try_nid_raw' was here Because inlcude/linux/bootmem.h says #if defined(CONFIG_HAVE_MEMBLOCK) && defined(CONFIG_NO_BOOTMEM) whereas mm/Makefile says obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o So revert 26f09e9b3a06 ("mm/memblock: add missing include ") while a full fix can be worked on. Fixes: 26f09e9b3a06 ("mm/memblock: add missing include ") Reported-by: Tony Luck Cc: Mathieu Malaterre Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/memblock.c b/mm/memblock.c index cc16d70b8333..03d48d8835ba 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From 1264f8325e9b8c004f36f1ae7bacd2a46a7ed771 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Mon, 18 Jun 2018 18:06:13 +1000 Subject: drm/nouveau/kms/nv50-: cursors always use core channel vram ctxdma Ctxdmas for cursors from all heads are setup in the core channel, and due to us tracking allocated handles per-window, we were failing with -EEXIST on multiple-head setups trying to allocate duplicate handles. The cursor code is hardcoded to use the core channel vram ctxdma already, so just skip ctxdma allocation for cursor fbs to fix the issue. Fixes: 5bca1621c07 ("drm/nouveau/kms/nv50-: move fb ctxdma tracking into windows") Reported-by: Adam Borowski Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/dispnv50/curs507a.c | 2 +- drivers/gpu/drm/nouveau/dispnv50/wndw.c | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/nouveau/dispnv50/curs507a.c b/drivers/gpu/drm/nouveau/dispnv50/curs507a.c index 291c08117ab6..397143b639c6 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/curs507a.c +++ b/drivers/gpu/drm/nouveau/dispnv50/curs507a.c @@ -132,7 +132,7 @@ curs507a_new_(const struct nv50_wimm_func *func, struct nouveau_drm *drm, nvif_object_map(&wndw->wimm.base.user, NULL, 0); wndw->immd = func; - wndw->ctxdma.parent = &disp->core->chan.base.user; + wndw->ctxdma.parent = NULL; return 0; } diff --git a/drivers/gpu/drm/nouveau/dispnv50/wndw.c b/drivers/gpu/drm/nouveau/dispnv50/wndw.c index 224963b533a6..c5a9bc1af5af 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/wndw.c +++ b/drivers/gpu/drm/nouveau/dispnv50/wndw.c @@ -444,14 +444,17 @@ nv50_wndw_prepare_fb(struct drm_plane *plane, struct drm_plane_state *state) if (ret) return ret; - ctxdma = nv50_wndw_ctxdma_new(wndw, fb); - if (IS_ERR(ctxdma)) { - nouveau_bo_unpin(fb->nvbo); - return PTR_ERR(ctxdma); + if (wndw->ctxdma.parent) { + ctxdma = nv50_wndw_ctxdma_new(wndw, fb); + if (IS_ERR(ctxdma)) { + nouveau_bo_unpin(fb->nvbo); + return PTR_ERR(ctxdma); + } + + asyw->image.handle[0] = ctxdma->object.handle; } asyw->state.fence = reservation_object_get_excl_rcu(fb->nvbo->bo.resv); - asyw->image.handle[0] = ctxdma->object.handle; asyw->image.offset[0] = fb->nvbo->bo.offset; if (wndw->func->prepare) { -- cgit v1.2.3 From 0fe2795516b9e1c59b58b02bdf8658698117ec4e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Jun 2018 16:07:59 +0200 Subject: posix-timers: Fix nanosleep_copyout() for CONFIG_COMPAT_32BIT_TIME Commit b5793b0d92c9 added support for building the nanosleep compat system call on 32-bit architectures, but missed one change in nanosleep_copyout(), which would trigger a BUG() as soon as any architecture is switched over to use it. Use the proper config symbol to enable the code path. Fixes: Commit b5793b0d92c9 ("posix-timers: Make compat syscalls depend on CONFIG_COMPAT_32BIT_TIME") Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: y2038@lists.linaro.org Cc: Anna-Maria Gleixner Cc: Deepa Dinamani Cc: "Rafael J. Wysocki" Link: https://lkml.kernel.org/r/20180618140811.2998503-1-arnd@arndb.de --- kernel/time/hrtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 055a4a728c00..3e93c54bd3a1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1659,7 +1659,7 @@ EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) { switch(restart->nanosleep.type) { -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_32BIT_TIME case TT_COMPAT: if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp)) return -EFAULT; -- cgit v1.2.3 From ee3dbcf963c17775417d2fdf74e60f0c1b563dda Mon Sep 17 00:00:00 2001 From: Ilia Lin Date: Sun, 17 Jun 2018 21:58:42 +0200 Subject: cpufreq: kryo: Fix possible error code dereference In event of error returned by the nvmem_cell_read() non-pointer value may be dereferenced. Fix this with error handling. Additionally free the allocated speedbin buffer, as per the API. Fixes: 9ce36edd1a52 (cpufreq: Add Kryo CPU scaling driver) Signed-off-by: Ilia Lin Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/qcom-cpufreq-kryo.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cpufreq/qcom-cpufreq-kryo.c b/drivers/cpufreq/qcom-cpufreq-kryo.c index d049fe4b80c4..74b9b93d511b 100644 --- a/drivers/cpufreq/qcom-cpufreq-kryo.c +++ b/drivers/cpufreq/qcom-cpufreq-kryo.c @@ -115,6 +115,8 @@ static int qcom_cpufreq_kryo_probe(struct platform_device *pdev) speedbin = nvmem_cell_read(speedbin_nvmem, &len); nvmem_cell_put(speedbin_nvmem); + if (IS_ERR(speedbin)) + return PTR_ERR(speedbin); switch (msm8996_version) { case MSM8996_V3: @@ -127,6 +129,7 @@ static int qcom_cpufreq_kryo_probe(struct platform_device *pdev) BUG(); break; } + kfree(speedbin); for_each_possible_cpu(cpu) { cpu_dev = get_cpu_device(cpu); -- cgit v1.2.3 From 5ad7346b4ae2d59cfee106ec8e40ee2561476d47 Mon Sep 17 00:00:00 2001 From: Ilia Lin Date: Sun, 17 Jun 2018 22:01:46 +0200 Subject: cpufreq: kryo: Add module remove and exit Add device remove and module exit code to make the driver functioning as a loadable module. Fixes: ac28927659be (cpufreq: kryo: allow building as a loadable module) Signed-off-by: Ilia Lin Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/qcom-cpufreq-kryo.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/qcom-cpufreq-kryo.c b/drivers/cpufreq/qcom-cpufreq-kryo.c index 74b9b93d511b..01bddacf5c3b 100644 --- a/drivers/cpufreq/qcom-cpufreq-kryo.c +++ b/drivers/cpufreq/qcom-cpufreq-kryo.c @@ -42,6 +42,8 @@ enum _msm8996_version { NUM_OF_MSM8996_VERSIONS, }; +struct platform_device *cpufreq_dt_pdev, *kryo_cpufreq_pdev; + static enum _msm8996_version __init qcom_cpufreq_kryo_get_msm_id(void) { size_t len; @@ -74,7 +76,6 @@ static enum _msm8996_version __init qcom_cpufreq_kryo_get_msm_id(void) static int qcom_cpufreq_kryo_probe(struct platform_device *pdev) { struct opp_table *opp_tables[NR_CPUS] = {0}; - struct platform_device *cpufreq_dt_pdev; enum _msm8996_version msm8996_version; struct nvmem_cell *speedbin_nvmem; struct device_node *np; @@ -165,8 +166,15 @@ free_opp: return ret; } +static int qcom_cpufreq_kryo_remove(struct platform_device *pdev) +{ + platform_device_unregister(cpufreq_dt_pdev); + return 0; +} + static struct platform_driver qcom_cpufreq_kryo_driver = { .probe = qcom_cpufreq_kryo_probe, + .remove = qcom_cpufreq_kryo_remove, .driver = { .name = "qcom-cpufreq-kryo", }, @@ -201,8 +209,9 @@ static int __init qcom_cpufreq_kryo_init(void) if (unlikely(ret < 0)) return ret; - ret = PTR_ERR_OR_ZERO(platform_device_register_simple( - "qcom-cpufreq-kryo", -1, NULL, 0)); + kryo_cpufreq_pdev = platform_device_register_simple( + "qcom-cpufreq-kryo", -1, NULL, 0); + ret = PTR_ERR_OR_ZERO(kryo_cpufreq_pdev); if (0 == ret) return 0; @@ -211,5 +220,12 @@ static int __init qcom_cpufreq_kryo_init(void) } module_init(qcom_cpufreq_kryo_init); +static void __init qcom_cpufreq_kryo_exit(void) +{ + platform_device_unregister(kryo_cpufreq_pdev); + platform_driver_unregister(&qcom_cpufreq_kryo_driver); +} +module_exit(qcom_cpufreq_kryo_exit); + MODULE_DESCRIPTION("Qualcomm Technologies, Inc. Kryo CPUfreq driver"); MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From ff7c9917143b3a6cf2fa61212a32d67cf259bf9c Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 18 Jun 2018 12:47:45 -0700 Subject: cpufreq: intel_pstate: Fix scaling max/min limits with Turbo 3.0 When scaling max/min settings are changed, internally they are converted to a ratio using the max turbo 1 core turbo frequency. This works fine when 1 core max is same irrespective of the core. But under Turbo 3.0, this will not be the case. For example: Core 0: max turbo pstate: 43 (4.3GHz) Core 1: max turbo pstate: 45 (4.5GHz) In this case 1 core turbo ratio will be maximum of all, so it will be 45 (4.5GHz). Suppose scaling max is set to 4GHz (ratio 40) for all cores ,then on core one it will be = max_state * policy->max / max_freq; = 43 * (4000000/4500000) = 38 (3.8GHz) = 38 which is 200MHz less than the desired. On core2, it will be correctly set to ratio 40 (4GHz). Same holds true for scaling min frequency limit. So this requires usage of correct turbo max frequency for core one, which in this case is 4.3GHz. So we need to adjust per CPU cpu->pstate.turbo_freq using the maximum HWP ratio of that core. This change uses the HWP capability of a core to adjust max turbo frequency. But since Broadwell HWP doesn't use ratios in the HWP capabilities, we have to use legacy max 1 core turbo ratio. This is not a problem as the HWP capabilities don't differ among cores in Broadwell. We need to check for non Broadwell CPU model for applying this change, though. Signed-off-by: Srinivas Pandruvada Cc: 4.6+ # 4.6+ Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 1de5ec8d5ea3..ece120da3353 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -294,6 +294,7 @@ struct pstate_funcs { static struct pstate_funcs pstate_funcs __read_mostly; static int hwp_active __read_mostly; +static int hwp_mode_bdw __read_mostly; static bool per_cpu_limits __read_mostly; static bool hwp_boost __read_mostly; @@ -1413,7 +1414,15 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) cpu->pstate.turbo_pstate = pstate_funcs.get_turbo(); cpu->pstate.scaling = pstate_funcs.get_scaling(); cpu->pstate.max_freq = cpu->pstate.max_pstate * cpu->pstate.scaling; - cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling; + + if (hwp_active && !hwp_mode_bdw) { + unsigned int phy_max, current_max; + + intel_pstate_get_hwp_max(cpu->cpu, &phy_max, ¤t_max); + cpu->pstate.turbo_freq = phy_max * cpu->pstate.scaling; + } else { + cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling; + } if (pstate_funcs.get_aperf_mperf_shift) cpu->aperf_mperf_shift = pstate_funcs.get_aperf_mperf_shift(); @@ -2467,28 +2476,36 @@ static inline bool intel_pstate_has_acpi_ppc(void) { return false; } static inline void intel_pstate_request_control_from_smm(void) {} #endif /* CONFIG_ACPI */ +#define INTEL_PSTATE_HWP_BROADWELL 0x01 + +#define ICPU_HWP(model, hwp_mode) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_HWP, hwp_mode } + static const struct x86_cpu_id hwp_support_ids[] __initconst = { - { X86_VENDOR_INTEL, 6, X86_MODEL_ANY, X86_FEATURE_HWP }, + ICPU_HWP(INTEL_FAM6_BROADWELL_X, INTEL_PSTATE_HWP_BROADWELL), + ICPU_HWP(INTEL_FAM6_BROADWELL_XEON_D, INTEL_PSTATE_HWP_BROADWELL), + ICPU_HWP(X86_MODEL_ANY, 0), {} }; static int __init intel_pstate_init(void) { + const struct x86_cpu_id *id; int rc; if (no_load) return -ENODEV; - if (x86_match_cpu(hwp_support_ids)) { + id = x86_match_cpu(hwp_support_ids); + if (id) { copy_cpu_funcs(&core_funcs); if (!no_hwp) { hwp_active++; + hwp_mode_bdw = id->driver_data; intel_pstate.attr = hwp_cpufreq_attrs; goto hwp_cpu_matched; } } else { - const struct x86_cpu_id *id; - id = x86_match_cpu(intel_pstate_cpu_ids); if (!id) return -ENODEV; -- cgit v1.2.3 From c5c2a97b3ac7d1ec19e7cff9e38caca6afefc3de Mon Sep 17 00:00:00 2001 From: Waldemar Rymarkiewicz Date: Thu, 14 Jun 2018 15:56:08 +0200 Subject: PM / OPP: Update voltage in case freq == old_freq This commit fixes a rare but possible case when the clk rate is updated without update of the regulator voltage. At boot up, CPUfreq checks if the system is running at the right freq. This is a sanity check in case a bootloader set clk rate that is outside of freq table present with cpufreq core. In such cases system can be unstable so better to change it to a freq that is preset in freq-table. The CPUfreq takes next freq that is >= policy->cur and this is our target_freq that needs to be set now. dev_pm_opp_set_rate(dev, target_freq) checks the target_freq and the old_freq (a current rate). If these are equal it returns early. If not, it searches for OPP (old_opp) that fits best to old_freq (not listed in the table) and updates old_freq (!). Here, we can end up with old_freq = old_opp.rate = target_freq, which is not handled in _generic_set_opp_regulator(). It's supposed to update voltage only when freq > old_freq || freq > old_freq. if (freq > old_freq) { ret = _set_opp_voltage(dev, reg, new_supply); [...] if (freq < old_freq) { ret = _set_opp_voltage(dev, reg, new_supply); if (ret) It results in, no voltage update while clk rate is updated. Example: freq-table = { 1000MHz 1.15V 666MHZ 1.10V 333MHz 1.05V } boot-up-freq = 800MHz # not listed in freq-table freq = target_freq = 1GHz old_freq = 800Mhz old_opp = _find_freq_ceil(opp_table, &old_freq); #(old_freq is modified!) old_freq = 1GHz Fixes: 6a0712f6f199 ("PM / OPP: Add dev_pm_opp_set_rate()") Cc: 4.6+ # v4.6+ Signed-off-by: Waldemar Rymarkiewicz Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index ab2f3fead6b1..31ff03dbeb83 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -598,7 +598,7 @@ static int _generic_set_opp_regulator(const struct opp_table *opp_table, } /* Scaling up? Scale voltage before frequency */ - if (freq > old_freq) { + if (freq >= old_freq) { ret = _set_opp_voltage(dev, reg, new_supply); if (ret) goto restore_voltage; -- cgit v1.2.3 From 69a8405999aa1c489de4b8d349468f0c2b83f093 Mon Sep 17 00:00:00 2001 From: Michael Jeanson Date: Thu, 14 Jun 2018 11:27:42 -0400 Subject: powerpc/e500mc: Set assembler machine type to e500mc In binutils 2.26 a new opcode for the "wait" instruction was added for the POWER9 and has precedence over the one specific to the e500mc. Commit ebf714ff3756 ("powerpc/e500mc: Add support for the wait instruction in e500_idle") uses this instruction specifically on the e500mc to work around an erratum. This results in an invalid instruction in idle_e500 when we build for the e500mc on bintutils >= 2.26 with the default assembler machine type. Since multiplatform between e500 and non-e500 is not supported, set the assembler machine type globaly when CONFIG_PPC_E500MC=y. Signed-off-by: Michael Jeanson Reviewed-by: Mathieu Desnoyers CC: Benjamin Herrenschmidt CC: Paul Mackerras CC: Michael Ellerman CC: Kumar Gala CC: Vakul Garg CC: Scott Wood CC: Mathieu Desnoyers CC: linuxppc-dev@lists.ozlabs.org CC: linux-kernel@vger.kernel.org CC: stable@vger.kernel.org Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index bd06a3ccda31..2ea575cb3401 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -244,6 +244,7 @@ cpu-as-$(CONFIG_4xx) += -Wa,-m405 cpu-as-$(CONFIG_ALTIVEC) += $(call as-option,-Wa$(comma)-maltivec) cpu-as-$(CONFIG_E200) += -Wa,-me200 cpu-as-$(CONFIG_PPC_BOOK3S_64) += -Wa,-mpower4 +cpu-as-$(CONFIG_PPC_E500MC) += $(call as-option,-Wa$(comma)-me500mc) KBUILD_AFLAGS += $(cpu-as-y) KBUILD_CFLAGS += $(cpu-as-y) -- cgit v1.2.3 From 02390f66bd2362df114a0a0770d80ec33061f6d1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 15 Jun 2018 11:38:37 +1000 Subject: powerpc/64s/radix: Fix MADV_[FREE|DONTNEED] TLB flush miss problem with THP The patch 99baac21e4 ("mm: fix MADV_[FREE|DONTNEED] TLB flush miss problem") added a force flush mode to the mmu_gather flush, which unconditionally flushes the entire address range being invalidated (even if actual ptes only covered a smaller range), to solve a problem with concurrent threads invalidating the same PTEs causing them to miss TLBs that need flushing. This does not work with powerpc that invalidates mmu_gather batches according to page size. Have powerpc flush all possible page sizes in the range if it encounters this concurrency condition. Patch 4647706ebe ("mm: always flush VMA ranges affected by zap_page_range") does add a TLB flush for all page sizes on powerpc for the zap_page_range case, but that is to be removed and replaced with the mmu_gather flush to avoid redundant flushing. It is also thought to not cover other obscure race conditions: https://lkml.kernel.org/r/BD3A0EBE-ECF4-41D4-87FA-C755EA9AB6BD@gmail.com Hash does not have a problem because it invalidates TLBs inside the page table locks. Reported-by: Aneesh Kumar K.V Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/mm/tlb-radix.c | 96 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 75 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 67a6e86d3e7e..a734e486664d 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -689,22 +689,17 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range); static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; -void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) +static inline void __radix__flush_tlb_range(struct mm_struct *mm, + unsigned long start, unsigned long end, + bool flush_all_sizes) { - struct mm_struct *mm = vma->vm_mm; unsigned long pid; unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift; unsigned long page_size = 1UL << page_shift; unsigned long nr_pages = (end - start) >> page_shift; bool local, full; -#ifdef CONFIG_HUGETLB_PAGE - if (is_vm_hugetlb_page(vma)) - return radix__flush_hugetlb_tlb_range(vma, start, end); -#endif - pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) return; @@ -738,37 +733,64 @@ is_local: _tlbie_pid(pid, RIC_FLUSH_TLB); } } else { - bool hflush = false; + bool hflush = flush_all_sizes; + bool gflush = flush_all_sizes; unsigned long hstart, hend; + unsigned long gstart, gend; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - hstart = (start + HPAGE_PMD_SIZE - 1) >> HPAGE_PMD_SHIFT; - hend = end >> HPAGE_PMD_SHIFT; - if (hstart < hend) { - hstart <<= HPAGE_PMD_SHIFT; - hend <<= HPAGE_PMD_SHIFT; + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) hflush = true; + + if (hflush) { + hstart = (start + PMD_SIZE - 1) & PMD_MASK; + hend = end & PMD_MASK; + if (hstart == hend) + hflush = false; + } + + if (gflush) { + gstart = (start + PUD_SIZE - 1) & PUD_MASK; + gend = end & PUD_MASK; + if (gstart == gend) + gflush = false; } -#endif asm volatile("ptesync": : :"memory"); if (local) { __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize); if (hflush) __tlbiel_va_range(hstart, hend, pid, - HPAGE_PMD_SIZE, MMU_PAGE_2M); + PMD_SIZE, MMU_PAGE_2M); + if (gflush) + __tlbiel_va_range(gstart, gend, pid, + PUD_SIZE, MMU_PAGE_1G); asm volatile("ptesync": : :"memory"); } else { __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); if (hflush) __tlbie_va_range(hstart, hend, pid, - HPAGE_PMD_SIZE, MMU_PAGE_2M); + PMD_SIZE, MMU_PAGE_2M); + if (gflush) + __tlbie_va_range(gstart, gend, pid, + PUD_SIZE, MMU_PAGE_1G); fixup_tlbie(); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } } preempt_enable(); } + +void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) + +{ +#ifdef CONFIG_HUGETLB_PAGE + if (is_vm_hugetlb_page(vma)) + return radix__flush_hugetlb_tlb_range(vma, start, end); +#endif + + __radix__flush_tlb_range(vma->vm_mm, start, end, false); +} EXPORT_SYMBOL(radix__flush_tlb_range); static int radix_get_mmu_psize(int page_size) @@ -837,6 +859,8 @@ void radix__tlb_flush(struct mmu_gather *tlb) int psize = 0; struct mm_struct *mm = tlb->mm; int page_size = tlb->page_size; + unsigned long start = tlb->start; + unsigned long end = tlb->end; /* * if page size is not something we understand, do a full mm flush @@ -847,15 +871,45 @@ void radix__tlb_flush(struct mmu_gather *tlb) */ if (tlb->fullmm) { __flush_all_mm(mm, true); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) + } else if (mm_tlb_flush_nested(mm)) { + /* + * If there is a concurrent invalidation that is clearing ptes, + * then it's possible this invalidation will miss one of those + * cleared ptes and miss flushing the TLB. If this invalidate + * returns before the other one flushes TLBs, that can result + * in it returning while there are still valid TLBs inside the + * range to be invalidated. + * + * See mm/memory.c:tlb_finish_mmu() for more details. + * + * The solution to this is ensure the entire range is always + * flushed here. The problem for powerpc is that the flushes + * are page size specific, so this "forced flush" would not + * do the right thing if there are a mix of page sizes in + * the range to be invalidated. So use __flush_tlb_range + * which invalidates all possible page sizes in the range. + * + * PWC flush probably is not be required because the core code + * shouldn't free page tables in this path, but accounting + * for the possibility makes us a bit more robust. + * + * need_flush_all is an uncommon case because page table + * teardown should be done with exclusive locks held (but + * after locks are dropped another invalidate could come + * in), it could be optimized further if necessary. + */ + if (!tlb->need_flush_all) + __radix__flush_tlb_range(mm, start, end, true); + else + radix__flush_all_mm(mm); +#endif } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { if (!tlb->need_flush_all) radix__flush_tlb_mm(mm); else radix__flush_all_mm(mm); } else { - unsigned long start = tlb->start; - unsigned long end = tlb->end; - if (!tlb->need_flush_all) radix__flush_tlb_range_psize(mm, start, end, psize); else -- cgit v1.2.3 From 749a0278c2177b2d16da5d8b135ba7f940bb4199 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 13 Jun 2018 23:23:56 +1000 Subject: powerpc/64s: Fix DT CPU features Power9 DD2.1 logic In the device tree CPU features quirk code we want to set CPU_FTR_POWER9_DD2_1 on all Power9s that aren't DD2.0 or earlier. But we got the logic wrong and instead set it on all CPUs that aren't Power9 DD2.0 or earlier, ie. including Power8. Fix it by making sure we're on a Power9. This isn't a bug in practice because the only code that checks the feature is Power9 only to begin with. But we'll backport it anyway to avoid confusion. Fixes: 9e9626ed3a4a ("powerpc/64s: Fix POWER9 DD2.2 and above in DT CPU features") Cc: stable@vger.kernel.org # v4.17+ Reported-by: Paul Mackerras Signed-off-by: Michael Ellerman Acked-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/dt_cpu_ftrs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 4be1c0de9406..96dd3d871986 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -711,7 +711,8 @@ static __init void cpufeatures_cpu_quirks(void) cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_HV_ASSIST; cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_XER_SO_BUG; cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; - } else /* DD2.1 and up have DD2_1 */ + } else if ((version & 0xffff0000) == 0x004e0000) + /* DD2.1 and up have DD2_1 */ cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1; if ((version & 0xffff0000) == 0x004e0000) { -- cgit v1.2.3 From 8c1aef6a682f87a059f10ab606cc1e2cdd663d5a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 19 May 2018 14:35:52 +1000 Subject: powerpc/64: hard disable irqs in panic_smp_self_stop Similarly to commit 855bfe0de1 ("powerpc: hard disable irqs in smp_send_stop loop"), irqs should be hard disabled by panic_smp_self_stop. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup_64.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 7a7ce8ad455e..225bc5f91049 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -387,6 +387,14 @@ void early_setup_secondary(void) #endif /* CONFIG_SMP */ +void panic_smp_self_stop(void) +{ + hard_irq_disable(); + spin_begin(); + while (1) + spin_cpu_relax(); +} + #if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE) static bool use_spinloop(void) { -- cgit v1.2.3 From de6e5d38417e6cdb005843db420a2974993d36ff Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 19 May 2018 14:35:53 +1000 Subject: powerpc: smp_send_stop do not offline stopped CPUs Marking CPUs stopped by smp_send_stop as offline can cause warnings due to cross-CPU wakeups. This trace was noticed on a busy system running a sysrq+c crash test, after the injected crash: WARNING: CPU: 51 PID: 1546 at kernel/sched/core.c:1179 set_task_cpu+0x22c/0x240 CPU: 51 PID: 1546 Comm: kworker/u352:1 Tainted: G D Workqueue: mlx5e mlx5e_update_stats_work [mlx5_core] [...] NIP [c00000000017c21c] set_task_cpu+0x22c/0x240 LR [c00000000017d580] try_to_wake_up+0x230/0x720 Call Trace: [c000000001017700] runqueues+0x0/0xb00 (unreliable) [c00000000017d580] try_to_wake_up+0x230/0x720 [c00000000015a214] insert_work+0x104/0x140 [c00000000015adb0] __queue_work+0x230/0x690 [c000003fc5007910] [c00000000015b26c] queue_work_on+0x5c/0x90 [c0080000135fc8f8] mlx5_cmd_exec+0x538/0xcb0 [mlx5_core] [c008000013608fd0] mlx5_core_access_reg+0x140/0x1d0 [mlx5_core] [c00800001362777c] mlx5e_update_pport_counters.constprop.59+0x6c/0x90 [mlx5_core] [c008000013628868] mlx5e_update_ndo_stats+0x28/0x90 [mlx5_core] [c008000013625558] mlx5e_update_stats_work+0x68/0xb0 [mlx5_core] [c00000000015bcec] process_one_work+0x1bc/0x5f0 [c00000000015ecac] worker_thread+0xac/0x6b0 [c000000000168338] kthread+0x168/0x1b0 [c00000000000b628] ret_from_kernel_thread+0x5c/0xb4 This happens because firstly the CPU is not really offline in the usual sense, processes and interrupts have not been migrated away. Secondly smp_send_stop does not happen atomically on all CPUs, so one CPU can have marked itself offline, while another CPU is still running processes or interrupts which can affect the first CPU. Fix this by just not marking the CPU as offline. It's more like frozen in time, so offline does not really reflect its state properly anyway. There should be nothing in the crash/panic path that walks online CPUs and synchronously waits for them, so this change should not introduce new hangs. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/smp.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 5eadfffabe35..4794d6b4f4d2 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -600,9 +600,6 @@ static void nmi_stop_this_cpu(struct pt_regs *regs) nmi_ipi_busy_count--; nmi_ipi_unlock(); - /* Remove this CPU */ - set_cpu_online(smp_processor_id(), false); - spin_begin(); while (1) spin_cpu_relax(); @@ -617,9 +614,6 @@ void smp_send_stop(void) static void stop_this_cpu(void *dummy) { - /* Remove this CPU */ - set_cpu_online(smp_processor_id(), false); - hard_irq_disable(); spin_begin(); while (1) -- cgit v1.2.3 From 855b6232dda2b6941ecd22979893e8a1d25642db Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 19 May 2018 14:35:54 +1000 Subject: powerpc/64: hard disable irqs on the panic()ing CPU Similar to previous patches, hard disable interrupts when a CPU is in panic. This reduces the chance the watchdog has to interfere with the panic, and avoids any other type of masked interrupt being executed when crashing which minimises the length of the crash path. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 62b1a40d8957..40b44bb53a4e 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -700,12 +700,19 @@ EXPORT_SYMBOL(check_legacy_ioport); static int ppc_panic_event(struct notifier_block *this, unsigned long event, void *ptr) { + /* + * panic does a local_irq_disable, but we really + * want interrupts to be hard disabled. + */ + hard_irq_disable(); + /* * If firmware-assisted dump has been registered then trigger * firmware-assisted dump and let firmware handle everything else. */ crash_fadump(NULL, ptr); - ppc_md.panic(ptr); /* May not return */ + if (ppc_md.panic) + ppc_md.panic(ptr); /* May not return */ return NOTIFY_DONE; } @@ -716,7 +723,8 @@ static struct notifier_block ppc_panic_block = { void __init setup_panic(void) { - if (!ppc_md.panic) + /* PPC64 always does a hard irq disable in its panic handler */ + if (!IS_ENABLED(CONFIG_PPC64) && !ppc_md.panic) return; atomic_notifier_chain_register(&panic_notifier_list, &ppc_panic_block); } -- cgit v1.2.3 From 1fe83888a2b776c204cb06629700adfb8e9cc123 Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Fri, 8 Jun 2018 10:40:38 +0200 Subject: xen: share start flags between PV and PVH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use a global variable to store the start flags for both PV and PVH. This allows the xen_initial_domain macro to work properly on PVH. Note that ARM is also switched to use the new variable. Signed-off-by: Boris Ostrovsky Signed-off-by: Roger Pau Monné Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- arch/arm/xen/enlighten.c | 7 ++++--- arch/x86/xen/enlighten.c | 7 +++++++ arch/x86/xen/enlighten_pv.c | 1 + arch/x86/xen/enlighten_pvh.c | 1 + include/xen/xen.h | 6 +++++- 5 files changed, 18 insertions(+), 4 deletions(-) diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 8073625371f5..07060e5b5864 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -59,6 +59,9 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; static __read_mostly unsigned int xen_events_irq; +uint32_t xen_start_flags; +EXPORT_SYMBOL(xen_start_flags); + int xen_remap_domain_gfn_array(struct vm_area_struct *vma, unsigned long addr, xen_pfn_t *gfn, int nr, @@ -293,9 +296,7 @@ void __init xen_early_init(void) xen_setup_features(); if (xen_feature(XENFEAT_dom0)) - xen_start_info->flags |= SIF_INITDOMAIN|SIF_PRIVILEGED; - else - xen_start_info->flags &= ~(SIF_INITDOMAIN|SIF_PRIVILEGED); + xen_start_flags |= SIF_INITDOMAIN|SIF_PRIVILEGED; if (!console_set_on_cmdline && !xen_initial_domain()) add_preferred_console("hvc", 0, NULL); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c9081c6671f0..3b5318505c69 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -64,6 +64,13 @@ struct shared_info xen_dummy_shared_info; __read_mostly int xen_have_vector_callback; EXPORT_SYMBOL_GPL(xen_have_vector_callback); +/* + * NB: needs to live in .data because it's used by xen_prepare_pvh which runs + * before clearing the bss. + */ +uint32_t xen_start_flags __attribute__((section(".data"))) = 0; +EXPORT_SYMBOL(xen_start_flags); + /* * Point at some empty memory to start with. We map the real shared_info * page as soon as fixmap is up and running. diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 357969a3697c..8d4e2e1ae60b 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1203,6 +1203,7 @@ asmlinkage __visible void __init xen_start_kernel(void) return; xen_domain_type = XEN_PV_DOMAIN; + xen_start_flags = xen_start_info->flags; xen_setup_features(); diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index aa1c6a6831a9..c85d1a88f476 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -97,6 +97,7 @@ void __init xen_prepare_pvh(void) } xen_pvh = 1; + xen_start_flags = pvh_start_info.flags; msr = cpuid_ebx(xen_cpuid_base() + 2); pfn = __pa(hypercall_page); diff --git a/include/xen/xen.h b/include/xen/xen.h index 9d4340c907d1..1e1d9bd0bd37 100644 --- a/include/xen/xen.h +++ b/include/xen/xen.h @@ -25,12 +25,16 @@ extern bool xen_pvh; #define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN) #define xen_pvh_domain() (xen_pvh) +#include + +extern uint32_t xen_start_flags; + #ifdef CONFIG_XEN_DOM0 #include #include #define xen_initial_domain() (xen_domain() && \ - xen_start_info && xen_start_info->flags & SIF_INITDOMAIN) + (xen_start_flags & SIF_INITDOMAIN)) #else /* !CONFIG_XEN_DOM0 */ #define xen_initial_domain() (0) #endif /* CONFIG_XEN_DOM0 */ -- cgit v1.2.3 From 84c029a73327cef571eaa61c7d6e67e8031b52ec Mon Sep 17 00:00:00 2001 From: Zhouyang Jia Date: Fri, 15 Jun 2018 07:34:52 +0800 Subject: xen: add error handling for xenbus_printf When xenbus_printf fails, the lack of error-handling code may cause unexpected results. This patch adds error-handling code after calling xenbus_printf. Signed-off-by: Zhouyang Jia Reviewed-by: Boris Ostrovsky Signed-off-by: Juergen Gross --- drivers/xen/manage.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 8835065029d3..c93d8ef8df34 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -289,8 +289,15 @@ static void sysrq_handler(struct xenbus_watch *watch, const char *path, return; } - if (sysrq_key != '\0') - xenbus_printf(xbt, "control", "sysrq", "%c", '\0'); + if (sysrq_key != '\0') { + err = xenbus_printf(xbt, "control", "sysrq", "%c", '\0'); + if (err) { + pr_err("%s: Error %d writing sysrq in control/sysrq\n", + __func__, err); + xenbus_transaction_end(xbt, 1); + return; + } + } err = xenbus_transaction_end(xbt, 0); if (err == -EAGAIN) @@ -342,7 +349,12 @@ static int setup_shutdown_watcher(void) continue; snprintf(node, FEATURE_PATH_SIZE, "feature-%s", shutdown_handlers[idx].command); - xenbus_printf(XBT_NIL, "control", node, "%u", 1); + err = xenbus_printf(XBT_NIL, "control", node, "%u", 1); + if (err) { + pr_err("%s: Error %d writing %s\n", __func__, + err, node); + return err; + } } return 0; -- cgit v1.2.3 From 6e3cc2a6e259f49f5817d1561109832eff90f8e4 Mon Sep 17 00:00:00 2001 From: Oleksandr Andrushchenko Date: Fri, 1 Jun 2018 14:41:24 +0300 Subject: xen/grant-table: Export gnttab_{alloc|free}_pages as GPL Only gnttab_{alloc|free}_pages are exported as EXPORT_SYMBOL while all the rest are exported as EXPORT_SYMBOL_GPL, thus effectively making it not possible for non-GPL driver modules to use grant table module. Export gnttab_{alloc|free}_pages as EXPORT_SYMBOL_GPL so all the exports are aligned. Signed-off-by: Oleksandr Andrushchenko Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/xen/grant-table.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 27be107d6480..ba36ff3e4903 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -799,7 +799,7 @@ int gnttab_alloc_pages(int nr_pages, struct page **pages) return 0; } -EXPORT_SYMBOL(gnttab_alloc_pages); +EXPORT_SYMBOL_GPL(gnttab_alloc_pages); /** * gnttab_free_pages - free pages allocated by gnttab_alloc_pages() @@ -820,7 +820,7 @@ void gnttab_free_pages(int nr_pages, struct page **pages) } free_xenballooned_pages(nr_pages, pages); } -EXPORT_SYMBOL(gnttab_free_pages); +EXPORT_SYMBOL_GPL(gnttab_free_pages); /* Handling of paged out grant targets (GNTST_eagain) */ #define MAX_DELAY 256 -- cgit v1.2.3 From 7ba33e1c9d1e03f442b161c701d1f811ea13c75e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sun, 10 Jun 2018 20:43:09 +0100 Subject: drm/i915: Apply batch location restrictions before pinning We special case the position of the batch within the GTT to prevent negative self-relocation deltas from underflowing. However, that restriction is being applied after a trial pin of the batch in its current position. Thus we are not rejecting an invalid location if the batch has been used before, leading to an assertion if we happen to need to rearrange the entire payload. In the worst case, this may cause a GPU hang on gen7 or perhaps missing state. References: https://bugs.freedesktop.org/show_bug.cgi?id=105720 Fixes: 2889caa92321 ("drm/i915: Eliminate lots of iterations over the execobjects array") Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: Martin Peres Link: https://patchwork.freedesktop.org/patch/msgid/20180610194325.13467-2-chris@chris-wilson.co.uk Reviewed-by: Joonas Lahtinen (cherry picked from commit 746c8f143afad7aaa66c484485fc39888d437a3f) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 49 ++++++++++++++++-------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index f627a8c47c58..22df17c8ca9b 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -489,7 +489,9 @@ eb_validate_vma(struct i915_execbuffer *eb, } static int -eb_add_vma(struct i915_execbuffer *eb, unsigned int i, struct i915_vma *vma) +eb_add_vma(struct i915_execbuffer *eb, + unsigned int i, unsigned batch_idx, + struct i915_vma *vma) { struct drm_i915_gem_exec_object2 *entry = &eb->exec[i]; int err; @@ -522,6 +524,24 @@ eb_add_vma(struct i915_execbuffer *eb, unsigned int i, struct i915_vma *vma) eb->flags[i] = entry->flags; vma->exec_flags = &eb->flags[i]; + /* + * SNA is doing fancy tricks with compressing batch buffers, which leads + * to negative relocation deltas. Usually that works out ok since the + * relocate address is still positive, except when the batch is placed + * very low in the GTT. Ensure this doesn't happen. + * + * Note that actual hangs have only been observed on gen7, but for + * paranoia do it everywhere. + */ + if (i == batch_idx) { + if (!(eb->flags[i] & EXEC_OBJECT_PINNED)) + eb->flags[i] |= __EXEC_OBJECT_NEEDS_BIAS; + if (eb->reloc_cache.has_fence) + eb->flags[i] |= EXEC_OBJECT_NEEDS_FENCE; + + eb->batch = vma; + } + err = 0; if (eb_pin_vma(eb, entry, vma)) { if (entry->offset != vma->node.start) { @@ -716,7 +736,7 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb) { struct radix_tree_root *handles_vma = &eb->ctx->handles_vma; struct drm_i915_gem_object *obj; - unsigned int i; + unsigned int i, batch; int err; if (unlikely(i915_gem_context_is_closed(eb->ctx))) @@ -728,6 +748,8 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb) INIT_LIST_HEAD(&eb->relocs); INIT_LIST_HEAD(&eb->unbound); + batch = eb_batch_index(eb); + for (i = 0; i < eb->buffer_count; i++) { u32 handle = eb->exec[i].handle; struct i915_lut_handle *lut; @@ -770,33 +792,16 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb) lut->handle = handle; add_vma: - err = eb_add_vma(eb, i, vma); + err = eb_add_vma(eb, i, batch, vma); if (unlikely(err)) goto err_vma; GEM_BUG_ON(vma != eb->vma[i]); GEM_BUG_ON(vma->exec_flags != &eb->flags[i]); + GEM_BUG_ON(drm_mm_node_allocated(&vma->node) && + eb_vma_misplaced(&eb->exec[i], vma, eb->flags[i])); } - /* take note of the batch buffer before we might reorder the lists */ - i = eb_batch_index(eb); - eb->batch = eb->vma[i]; - GEM_BUG_ON(eb->batch->exec_flags != &eb->flags[i]); - - /* - * SNA is doing fancy tricks with compressing batch buffers, which leads - * to negative relocation deltas. Usually that works out ok since the - * relocate address is still positive, except when the batch is placed - * very low in the GTT. Ensure this doesn't happen. - * - * Note that actual hangs have only been observed on gen7, but for - * paranoia do it everywhere. - */ - if (!(eb->flags[i] & EXEC_OBJECT_PINNED)) - eb->flags[i] |= __EXEC_OBJECT_NEEDS_BIAS; - if (eb->reloc_cache.has_fence) - eb->flags[i] |= EXEC_OBJECT_NEEDS_FENCE; - eb->args->flags |= __EXEC_VALIDATED; return eb_reserve(eb); -- cgit v1.2.3 From a5bfcdf0e16b33c1690ded31f863466136480ddc Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 11 Jun 2018 16:33:32 +0100 Subject: drm/i915/execlists: Avoid putting the error pointer On allocation error, do not jump to the unwind handler that tries to free the error pointer. Reported-by: Lionel Landwerlin Fixes: a89d1f921c15 ("drm/i915: Split i915_gem_timeline into individual timelines") Signed-off-by: Chris Wilson Cc: Lionel Landwerlin Cc: Tvrtko Ursulin Reviewed-by: Lionel Landwerlin Link: https://patchwork.freedesktop.org/patch/msgid/20180611153332.14824-1-chris@chris-wilson.co.uk (cherry picked from commit 467d35789e5a4f47428b65ef711b30fdabbb0fd4) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_lrc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 15434cad5430..f3968580e5e2 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -2641,10 +2641,8 @@ static int execlists_context_deferred_alloc(struct i915_gem_context *ctx, context_size += LRC_HEADER_PAGES * PAGE_SIZE; ctx_obj = i915_gem_object_create(ctx->i915, context_size); - if (IS_ERR(ctx_obj)) { - ret = PTR_ERR(ctx_obj); - goto error_deref_obj; - } + if (IS_ERR(ctx_obj)) + return PTR_ERR(ctx_obj); vma = i915_vma_instance(ctx_obj, &ctx->i915->ggtt.base, NULL); if (IS_ERR(vma)) { -- cgit v1.2.3 From 541ab84d2b6ea79021d5df0b54d81600334fa2a4 Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Thu, 24 May 2018 15:54:03 +0300 Subject: drm/i915: Allow DBLSCAN user modes with eDP/LVDS/DSI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When encountering a connector with the scaling mode property both intel and modesetting ddxs sometimes add tons of DBLSCAN modes to the output's mode list. The idea presumably being that since the output will be going through the panel fitter anyway we can pretend to use any kind of mode. Sadly that means we can't reject user modes with the DBLSCAN flag until we know whether we're going to be using the panel's native mode or the user mode directly. Doing otherwise means X clients using xf86vidmode/xrandr will get a protocol error (and often self terminate as a result) when the kernel refuses to use the requested mode with the DBLSCAN flag. To undo the regression we'll move the DBLSCAN checks into the connector->mode_valid() and encoder->compute_config() hooks. Cc: stable@vger.kernel.org Cc: Vito Caputo Reported-by: Vito Caputo Fixes: e995ca0b8139 ("drm/i915: Provide a device level .mode_valid() hook") References: https://lkml.org/lkml/2018/5/21/715 Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20180524125403.23445-1-ville.syrjala@linux.intel.com Reviewed-by: Maarten Lankhorst Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=106804 Tested-by: Arkadiusz Miskiewicz (cherry picked from commit e4dd27aadd205417a2e9ea9902b698a0252ec3a0) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_crt.c | 20 ++++++++++++++++++++ drivers/gpu/drm/i915/intel_display.c | 16 +++++++++++++--- drivers/gpu/drm/i915/intel_dp.c | 6 ++++++ drivers/gpu/drm/i915/intel_dp_mst.c | 6 ++++++ drivers/gpu/drm/i915/intel_dsi.c | 6 ++++++ drivers/gpu/drm/i915/intel_dvo.c | 6 ++++++ drivers/gpu/drm/i915/intel_hdmi.c | 6 ++++++ drivers/gpu/drm/i915/intel_lvds.c | 5 +++++ drivers/gpu/drm/i915/intel_sdvo.c | 6 ++++++ drivers/gpu/drm/i915/intel_tv.c | 12 ++++++++++-- 10 files changed, 84 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_crt.c b/drivers/gpu/drm/i915/intel_crt.c index de0e22322c76..072b326d5ee0 100644 --- a/drivers/gpu/drm/i915/intel_crt.c +++ b/drivers/gpu/drm/i915/intel_crt.c @@ -304,6 +304,9 @@ intel_crt_mode_valid(struct drm_connector *connector, int max_dotclk = dev_priv->max_dotclk_freq; int max_clock; + if (mode->flags & DRM_MODE_FLAG_DBLSCAN) + return MODE_NO_DBLESCAN; + if (mode->clock < 25000) return MODE_CLOCK_LOW; @@ -337,6 +340,12 @@ static bool intel_crt_compute_config(struct intel_encoder *encoder, struct intel_crtc_state *pipe_config, struct drm_connector_state *conn_state) { + struct drm_display_mode *adjusted_mode = + &pipe_config->base.adjusted_mode; + + if (adjusted_mode->flags & DRM_MODE_FLAG_DBLSCAN) + return false; + return true; } @@ -344,6 +353,12 @@ static bool pch_crt_compute_config(struct intel_encoder *encoder, struct intel_crtc_state *pipe_config, struct drm_connector_state *conn_state) { + struct drm_display_mode *adjusted_mode = + &pipe_config->base.adjusted_mode; + + if (adjusted_mode->flags & DRM_MODE_FLAG_DBLSCAN) + return false; + pipe_config->has_pch_encoder = true; return true; @@ -354,6 +369,11 @@ static bool hsw_crt_compute_config(struct intel_encoder *encoder, struct drm_connector_state *conn_state) { struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); + struct drm_display_mode *adjusted_mode = + &pipe_config->base.adjusted_mode; + + if (adjusted_mode->flags & DRM_MODE_FLAG_DBLSCAN) + return false; pipe_config->has_pch_encoder = true; diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index dee3a8e659f1..2cc6faa1daa8 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -14469,12 +14469,22 @@ static enum drm_mode_status intel_mode_valid(struct drm_device *dev, const struct drm_display_mode *mode) { + /* + * Can't reject DBLSCAN here because Xorg ddxen can add piles + * of DBLSCAN modes to the output's mode list when they detect + * the scaling mode property on the connector. And they don't + * ask the kernel to validate those modes in any way until + * modeset time at which point the client gets a protocol error. + * So in order to not upset those clients we silently ignore the + * DBLSCAN flag on such connectors. For other connectors we will + * reject modes with the DBLSCAN flag in encoder->compute_config(). + * And we always reject DBLSCAN modes in connector->mode_valid() + * as we never want such modes on the connector's mode list. + */ + if (mode->vscan > 1) return MODE_NO_VSCAN; - if (mode->flags & DRM_MODE_FLAG_DBLSCAN) - return MODE_NO_DBLESCAN; - if (mode->flags & DRM_MODE_FLAG_HSKEW) return MODE_H_ILLEGAL; diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c index 8320f0e8e3be..6cce43a6eaad 100644 --- a/drivers/gpu/drm/i915/intel_dp.c +++ b/drivers/gpu/drm/i915/intel_dp.c @@ -420,6 +420,9 @@ intel_dp_mode_valid(struct drm_connector *connector, int max_rate, mode_rate, max_lanes, max_link_clock; int max_dotclk; + if (mode->flags & DRM_MODE_FLAG_DBLSCAN) + return MODE_NO_DBLESCAN; + max_dotclk = intel_dp_downstream_max_dotclock(intel_dp); if (intel_dp_is_edp(intel_dp) && fixed_mode) { @@ -1862,6 +1865,9 @@ intel_dp_compute_config(struct intel_encoder *encoder, conn_state->scaling_mode); } + if (adjusted_mode->flags & DRM_MODE_FLAG_DBLSCAN) + return false; + if ((IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv)) && adjusted_mode->flags & DRM_MODE_FLAG_INTERLACE) return false; diff --git a/drivers/gpu/drm/i915/intel_dp_mst.c b/drivers/gpu/drm/i915/intel_dp_mst.c index 9e6956c08688..5890500a3a8b 100644 --- a/drivers/gpu/drm/i915/intel_dp_mst.c +++ b/drivers/gpu/drm/i915/intel_dp_mst.c @@ -48,6 +48,9 @@ static bool intel_dp_mst_compute_config(struct intel_encoder *encoder, bool reduce_m_n = drm_dp_has_quirk(&intel_dp->desc, DP_DPCD_QUIRK_LIMITED_M_N); + if (adjusted_mode->flags & DRM_MODE_FLAG_DBLSCAN) + return false; + pipe_config->has_pch_encoder = false; bpp = 24; if (intel_dp->compliance.test_data.bpc) { @@ -366,6 +369,9 @@ intel_dp_mst_mode_valid(struct drm_connector *connector, if (!intel_dp) return MODE_ERROR; + if (mode->flags & DRM_MODE_FLAG_DBLSCAN) + return MODE_NO_DBLESCAN; + max_link_clock = intel_dp_max_link_rate(intel_dp); max_lanes = intel_dp_max_lane_count(intel_dp); diff --git a/drivers/gpu/drm/i915/intel_dsi.c b/drivers/gpu/drm/i915/intel_dsi.c index cf39ca90d887..f349b3920199 100644 --- a/drivers/gpu/drm/i915/intel_dsi.c +++ b/drivers/gpu/drm/i915/intel_dsi.c @@ -326,6 +326,9 @@ static bool intel_dsi_compute_config(struct intel_encoder *encoder, conn_state->scaling_mode); } + if (adjusted_mode->flags & DRM_MODE_FLAG_DBLSCAN) + return false; + /* DSI uses short packets for sync events, so clear mode flags for DSI */ adjusted_mode->flags = 0; @@ -1266,6 +1269,9 @@ intel_dsi_mode_valid(struct drm_connector *connector, DRM_DEBUG_KMS("\n"); + if (mode->flags & DRM_MODE_FLAG_DBLSCAN) + return MODE_NO_DBLESCAN; + if (fixed_mode) { if (mode->hdisplay > fixed_mode->hdisplay) return MODE_PANEL; diff --git a/drivers/gpu/drm/i915/intel_dvo.c b/drivers/gpu/drm/i915/intel_dvo.c index a70d767313aa..61d908e0df0e 100644 --- a/drivers/gpu/drm/i915/intel_dvo.c +++ b/drivers/gpu/drm/i915/intel_dvo.c @@ -219,6 +219,9 @@ intel_dvo_mode_valid(struct drm_connector *connector, int max_dotclk = to_i915(connector->dev)->max_dotclk_freq; int target_clock = mode->clock; + if (mode->flags & DRM_MODE_FLAG_DBLSCAN) + return MODE_NO_DBLESCAN; + /* XXX: Validate clock range */ if (fixed_mode) { @@ -254,6 +257,9 @@ static bool intel_dvo_compute_config(struct intel_encoder *encoder, if (fixed_mode) intel_fixed_panel_mode(fixed_mode, adjusted_mode); + if (adjusted_mode->flags & DRM_MODE_FLAG_DBLSCAN) + return false; + return true; } diff --git a/drivers/gpu/drm/i915/intel_hdmi.c b/drivers/gpu/drm/i915/intel_hdmi.c index ee929f31f7db..d8cb53ef4351 100644 --- a/drivers/gpu/drm/i915/intel_hdmi.c +++ b/drivers/gpu/drm/i915/intel_hdmi.c @@ -1557,6 +1557,9 @@ intel_hdmi_mode_valid(struct drm_connector *connector, bool force_dvi = READ_ONCE(to_intel_digital_connector_state(connector->state)->force_audio) == HDMI_AUDIO_OFF_DVI; + if (mode->flags & DRM_MODE_FLAG_DBLSCAN) + return MODE_NO_DBLESCAN; + clock = mode->clock; if ((mode->flags & DRM_MODE_FLAG_3D_MASK) == DRM_MODE_FLAG_3D_FRAME_PACKING) @@ -1677,6 +1680,9 @@ bool intel_hdmi_compute_config(struct intel_encoder *encoder, int desired_bpp; bool force_dvi = intel_conn_state->force_audio == HDMI_AUDIO_OFF_DVI; + if (adjusted_mode->flags & DRM_MODE_FLAG_DBLSCAN) + return false; + pipe_config->has_hdmi_sink = !force_dvi && intel_hdmi->has_hdmi_sink; if (pipe_config->has_hdmi_sink) diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c index d278f24ba6ae..48f618dc9abb 100644 --- a/drivers/gpu/drm/i915/intel_lvds.c +++ b/drivers/gpu/drm/i915/intel_lvds.c @@ -380,6 +380,8 @@ intel_lvds_mode_valid(struct drm_connector *connector, struct drm_display_mode *fixed_mode = intel_connector->panel.fixed_mode; int max_pixclk = to_i915(connector->dev)->max_dotclk_freq; + if (mode->flags & DRM_MODE_FLAG_DBLSCAN) + return MODE_NO_DBLESCAN; if (mode->hdisplay > fixed_mode->hdisplay) return MODE_PANEL; if (mode->vdisplay > fixed_mode->vdisplay) @@ -429,6 +431,9 @@ static bool intel_lvds_compute_config(struct intel_encoder *intel_encoder, intel_fixed_panel_mode(intel_connector->panel.fixed_mode, adjusted_mode); + if (adjusted_mode->flags & DRM_MODE_FLAG_DBLSCAN) + return false; + if (HAS_PCH_SPLIT(dev_priv)) { pipe_config->has_pch_encoder = true; diff --git a/drivers/gpu/drm/i915/intel_sdvo.c b/drivers/gpu/drm/i915/intel_sdvo.c index 25005023c243..26975df4e593 100644 --- a/drivers/gpu/drm/i915/intel_sdvo.c +++ b/drivers/gpu/drm/i915/intel_sdvo.c @@ -1160,6 +1160,9 @@ static bool intel_sdvo_compute_config(struct intel_encoder *encoder, adjusted_mode); } + if (adjusted_mode->flags & DRM_MODE_FLAG_DBLSCAN) + return false; + /* * Make the CRTC code factor in the SDVO pixel multiplier. The * SDVO device will factor out the multiplier during mode_set. @@ -1621,6 +1624,9 @@ intel_sdvo_mode_valid(struct drm_connector *connector, struct intel_sdvo *intel_sdvo = intel_attached_sdvo(connector); int max_dotclk = to_i915(connector->dev)->max_dotclk_freq; + if (mode->flags & DRM_MODE_FLAG_DBLSCAN) + return MODE_NO_DBLESCAN; + if (intel_sdvo->pixel_clock_min > mode->clock) return MODE_CLOCK_LOW; diff --git a/drivers/gpu/drm/i915/intel_tv.c b/drivers/gpu/drm/i915/intel_tv.c index 885fc3809f7f..b55b5c157e38 100644 --- a/drivers/gpu/drm/i915/intel_tv.c +++ b/drivers/gpu/drm/i915/intel_tv.c @@ -850,6 +850,9 @@ intel_tv_mode_valid(struct drm_connector *connector, const struct tv_mode *tv_mode = intel_tv_mode_find(connector->state); int max_dotclk = to_i915(connector->dev)->max_dotclk_freq; + if (mode->flags & DRM_MODE_FLAG_DBLSCAN) + return MODE_NO_DBLESCAN; + if (mode->clock > max_dotclk) return MODE_CLOCK_HIGH; @@ -877,16 +880,21 @@ intel_tv_compute_config(struct intel_encoder *encoder, struct drm_connector_state *conn_state) { const struct tv_mode *tv_mode = intel_tv_mode_find(conn_state); + struct drm_display_mode *adjusted_mode = + &pipe_config->base.adjusted_mode; if (!tv_mode) return false; - pipe_config->base.adjusted_mode.crtc_clock = tv_mode->clock; + if (adjusted_mode->flags & DRM_MODE_FLAG_DBLSCAN) + return false; + + adjusted_mode->crtc_clock = tv_mode->clock; DRM_DEBUG_KMS("forcing bpc to 8 for TV\n"); pipe_config->pipe_bpp = 8*3; /* TV has it's own notion of sync and other mode flags, so clear them. */ - pipe_config->base.adjusted_mode.flags = 0; + adjusted_mode->flags = 0; /* * FIXME: We don't check whether the input mode is actually what we want -- cgit v1.2.3 From 4dc055c9cc8b3dac966b54d3cd5cf463a988299b Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Mon, 11 Jun 2018 23:02:55 +0300 Subject: drm/i915: Fix PIPESTAT irq ack on i965/g4x MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On i965/g4x IIR is edge triggered. So in order for IIR to notice that there is still a pending interrupt we have to force and edge in ISR. For the ISR/IIR pipe event bits we can do that by temporarily clearing all the PIPESTAT enable bits when we ack the status bits. This will force the ISR pipe event bit low, and it can then go back high when we restore the PIPESTAT enable bits. This avoids the following race: 1. stat = read(PIPESTAT) 2. an enabled PIPESTAT status bit goes high 3. write(PIPESTAT, enable|stat); 4. write(IIR, PIPE_EVENT) The end result is IIR==0 and ISR!=0. This can lead to nasty vblank wait/flip_done timeouts if another interrupt source doesn't trick us into looking at the PIPESTAT status bits despite the IIR PIPE_EVENT bit being low. Before i965 IIR was level triggered so this problem can't actually happen there. And curiously VLV/CHV went back to the level triggered scheme as well. But for simplicity we'll use the same i965/g4x compatible code for all platforms. Cc: stable@vger.kernel.org Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=106033 Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=105225 Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=106030 Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20180611200258.27121-1-ville.syrjala@linux.intel.com Reviewed-by: Chris Wilson (cherry picked from commit 132c27c97cb958f637dc05adc35a61b47779bcd8) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_irq.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index f9bc3aaa90d0..4a02747ac658 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -1893,9 +1893,17 @@ static void i9xx_pipestat_irq_ack(struct drm_i915_private *dev_priv, /* * Clear the PIPE*STAT regs before the IIR + * + * Toggle the enable bits to make sure we get an + * edge in the ISR pipe event bit if we don't clear + * all the enabled status bits. Otherwise the edge + * triggered IIR on i965/g4x wouldn't notice that + * an interrupt is still pending. */ - if (pipe_stats[pipe]) - I915_WRITE(reg, enable_mask | pipe_stats[pipe]); + if (pipe_stats[pipe]) { + I915_WRITE(reg, pipe_stats[pipe]); + I915_WRITE(reg, enable_mask); + } } spin_unlock(&dev_priv->irq_lock); } -- cgit v1.2.3 From 1e34f1d36804be1a446212a33ca5397bf0e5acdd Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Wed, 13 Jun 2018 19:05:52 +0300 Subject: drm/i915: Disallow interlaced modes on g4x DP outputs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Looks like interlaced DP output doesn't work on g4x either. Not all that surprising considering we already established that interlaced DP output is busted on VLV/CHV. Cc: stable@vger.kernel.org Signed-off-by: Ville Syrjälä Reviewed-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20180613160553.11664-1-ville.syrjala@linux.intel.com (cherry picked from commit 929168c5f3df5d9ea0ef426c33e971157d045eab) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_dp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c index 6cce43a6eaad..2bdfe4b2d6dc 100644 --- a/drivers/gpu/drm/i915/intel_dp.c +++ b/drivers/gpu/drm/i915/intel_dp.c @@ -1868,7 +1868,7 @@ intel_dp_compute_config(struct intel_encoder *encoder, if (adjusted_mode->flags & DRM_MODE_FLAG_DBLSCAN) return false; - if ((IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv)) && + if (HAS_GMCH_DISPLAY(dev_priv) && adjusted_mode->flags & DRM_MODE_FLAG_INTERLACE) return false; @@ -6343,7 +6343,7 @@ intel_dp_init_connector(struct intel_digital_port *intel_dig_port, drm_connector_init(dev, connector, &intel_dp_connector_funcs, type); drm_connector_helper_add(connector, &intel_dp_connector_helper_funcs); - if (!IS_VALLEYVIEW(dev_priv) && !IS_CHERRYVIEW(dev_priv)) + if (!HAS_GMCH_DISPLAY(dev_priv)) connector->interlace_allowed = true; connector->doublescan_allowed = 0; -- cgit v1.2.3 From 4dccc4d517481282e84335c7acbfd7a1481004b8 Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Wed, 13 Jun 2018 19:05:53 +0300 Subject: drm/i915: Turn off g4x DP port in .post_disable() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While Bspec doesn't list a specific sequence for turning off the DP port on g4x we are getting an underrun if the port is disabled in the .disable() hook. Looks like the pipe stops when the port stops, and by that time the plane disable may not have completed yet. Also the plane(s) seem to end up in some wonky state when this happens as they also signal another underrun immediately after we turn them back on during the next enable sequence. We could add a vblank wait in .disable() to avoid wedging the planes, but I assume we're still tripping up the pipe in some way. So it seems better to me to just follow the ILK+ sequence and turn off the DP port in .post_disable() instead. This sequence doesn't seem to suffer from this problem. Could be it was always the intended sequence for DP and the gen4 bspec was just never updated to include it. Originally we used the bad sequence even on ilk+, but I changed that in commit 08aff3fe26ae ("drm/i915: Move DP port disable to post_disable for pch platforms") as it was causing issues on those platforms as well. I left out g4x then only because I didn't have the hardware to test it. Now that I do it's fairly clear that the ilk+ sequence is also the right choice for g4x. v2: Fix whitespace fail (Jani) Mention the ilk+ commit (Jani) Cc: stable@vger.kernel.org Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20180613160553.11664-2-ville.syrjala@linux.intel.com Reviewed-by: Jani Nikula (cherry picked from commit 51a9f6dfc00d35f927ecfaf6f0ae8ebaba39b3fe) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_dp.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c index 2bdfe4b2d6dc..16faea30114a 100644 --- a/drivers/gpu/drm/i915/intel_dp.c +++ b/drivers/gpu/drm/i915/intel_dp.c @@ -2788,16 +2788,6 @@ static void intel_disable_dp(struct intel_encoder *encoder, static void g4x_disable_dp(struct intel_encoder *encoder, const struct intel_crtc_state *old_crtc_state, const struct drm_connector_state *old_conn_state) -{ - intel_disable_dp(encoder, old_crtc_state, old_conn_state); - - /* disable the port before the pipe on g4x */ - intel_dp_link_down(encoder, old_crtc_state); -} - -static void ilk_disable_dp(struct intel_encoder *encoder, - const struct intel_crtc_state *old_crtc_state, - const struct drm_connector_state *old_conn_state) { intel_disable_dp(encoder, old_crtc_state, old_conn_state); } @@ -2813,13 +2803,19 @@ static void vlv_disable_dp(struct intel_encoder *encoder, intel_disable_dp(encoder, old_crtc_state, old_conn_state); } -static void ilk_post_disable_dp(struct intel_encoder *encoder, +static void g4x_post_disable_dp(struct intel_encoder *encoder, const struct intel_crtc_state *old_crtc_state, const struct drm_connector_state *old_conn_state) { struct intel_dp *intel_dp = enc_to_intel_dp(&encoder->base); enum port port = encoder->port; + /* + * Bspec does not list a specific disable sequence for g4x DP. + * Follow the ilk+ sequence (disable pipe before the port) for + * g4x DP as it does not suffer from underruns like the normal + * g4x modeset sequence (disable pipe after the port). + */ intel_dp_link_down(encoder, old_crtc_state); /* Only ilk+ has port A */ @@ -6442,15 +6438,11 @@ bool intel_dp_init(struct drm_i915_private *dev_priv, intel_encoder->enable = vlv_enable_dp; intel_encoder->disable = vlv_disable_dp; intel_encoder->post_disable = vlv_post_disable_dp; - } else if (INTEL_GEN(dev_priv) >= 5) { - intel_encoder->pre_enable = g4x_pre_enable_dp; - intel_encoder->enable = g4x_enable_dp; - intel_encoder->disable = ilk_disable_dp; - intel_encoder->post_disable = ilk_post_disable_dp; } else { intel_encoder->pre_enable = g4x_pre_enable_dp; intel_encoder->enable = g4x_enable_dp; intel_encoder->disable = g4x_disable_dp; + intel_encoder->post_disable = g4x_post_disable_dp; } intel_dig_port->dp.output_reg = output_reg; -- cgit v1.2.3 From bc64e05408cafe3668e7460834935ea3f1764f31 Mon Sep 17 00:00:00 2001 From: Mika Kuoppala Date: Fri, 15 Jun 2018 13:44:29 +0300 Subject: drm/i915: Fix context ban and hang accounting for client If client is smart or lucky enough to create a new context after each hang, our context banning mechanism will never catch up, and as a result of that it will be saved from client banning. This can result in a never ending streak of gpu hangs caused by bad or malicious client, preventing access from other legit gpu clients. Fix this by always incrementing per client ban score if it hangs in short successions regardless of context ban scoring. The exception are non bannable contexts. They remain detached from client ban scoring mechanism. v2: xchg timestamp, tidyup (Chris) v3: comment, bannable & banned together (Chris) Fixes: b083a0870c79 ("drm/i915: Add per client max context ban limit") Cc: Chris Wilson Signed-off-by: Mika Kuoppala Reviewed-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20180615104429.31477-1-mika.kuoppala@linux.intel.com (cherry picked from commit 14921f3cef85b0167a9145e5f29b9dfc3b2a84dc) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_drv.h | 21 ++++++++---- drivers/gpu/drm/i915/i915_gem.c | 57 +++++++++++++++++++++++---------- drivers/gpu/drm/i915/i915_gem_context.c | 2 +- 3 files changed, 55 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 34c125e2d90c..7014a96546f4 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -340,14 +340,21 @@ struct drm_i915_file_private { unsigned int bsd_engine; -/* Client can have a maximum of 3 contexts banned before - * it is denied of creating new contexts. As one context - * ban needs 4 consecutive hangs, and more if there is - * progress in between, this is a last resort stop gap measure - * to limit the badly behaving clients access to gpu. +/* + * Every context ban increments per client ban score. Also + * hangs in short succession increments ban score. If ban threshold + * is reached, client is considered banned and submitting more work + * will fail. This is a stop gap measure to limit the badly behaving + * clients access to gpu. Note that unbannable contexts never increment + * the client ban score. */ -#define I915_MAX_CLIENT_CONTEXT_BANS 3 - atomic_t context_bans; +#define I915_CLIENT_SCORE_HANG_FAST 1 +#define I915_CLIENT_FAST_HANG_JIFFIES (60 * HZ) +#define I915_CLIENT_SCORE_CONTEXT_BAN 3 +#define I915_CLIENT_SCORE_BANNED 9 + /** ban_score: Accumulated score of all ctx bans and fast hangs. */ + atomic_t ban_score; + unsigned long hang_timestamp; }; /* Interface history: diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 3704f4c0c2c9..d44ad7bc1e94 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2933,32 +2933,54 @@ i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj, return 0; } +static void i915_gem_client_mark_guilty(struct drm_i915_file_private *file_priv, + const struct i915_gem_context *ctx) +{ + unsigned int score; + unsigned long prev_hang; + + if (i915_gem_context_is_banned(ctx)) + score = I915_CLIENT_SCORE_CONTEXT_BAN; + else + score = 0; + + prev_hang = xchg(&file_priv->hang_timestamp, jiffies); + if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES)) + score += I915_CLIENT_SCORE_HANG_FAST; + + if (score) { + atomic_add(score, &file_priv->ban_score); + + DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n", + ctx->name, score, + atomic_read(&file_priv->ban_score)); + } +} + static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx) { - bool banned; + unsigned int score; + bool banned, bannable; atomic_inc(&ctx->guilty_count); - banned = false; - if (i915_gem_context_is_bannable(ctx)) { - unsigned int score; + bannable = i915_gem_context_is_bannable(ctx); + score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score); + banned = score >= CONTEXT_SCORE_BAN_THRESHOLD; - score = atomic_add_return(CONTEXT_SCORE_GUILTY, - &ctx->ban_score); - banned = score >= CONTEXT_SCORE_BAN_THRESHOLD; + DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, ban %s\n", + ctx->name, atomic_read(&ctx->guilty_count), + score, yesno(banned && bannable)); - DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n", - ctx->name, score, yesno(banned)); - } - if (!banned) + /* Cool contexts don't accumulate client ban score */ + if (!bannable) return; - i915_gem_context_set_banned(ctx); - if (!IS_ERR_OR_NULL(ctx->file_priv)) { - atomic_inc(&ctx->file_priv->context_bans); - DRM_DEBUG_DRIVER("client %s has had %d context banned\n", - ctx->name, atomic_read(&ctx->file_priv->context_bans)); - } + if (banned) + i915_gem_context_set_banned(ctx); + + if (!IS_ERR_OR_NULL(ctx->file_priv)) + i915_gem_client_mark_guilty(ctx->file_priv, ctx); } static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx) @@ -5736,6 +5758,7 @@ int i915_gem_open(struct drm_i915_private *i915, struct drm_file *file) INIT_LIST_HEAD(&file_priv->mm.request_list); file_priv->bsd_engine = -1; + file_priv->hang_timestamp = jiffies; ret = i915_gem_context_open(i915, file); if (ret) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 33f8a4b3c981..060335d3d9e0 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -652,7 +652,7 @@ int i915_gem_switch_to_kernel_context(struct drm_i915_private *dev_priv) static bool client_is_banned(struct drm_i915_file_private *file_priv) { - return atomic_read(&file_priv->context_bans) > I915_MAX_CLIENT_CONTEXT_BANS; + return atomic_read(&file_priv->ban_score) >= I915_CLIENT_SCORE_BANNED; } int i915_gem_context_create_ioctl(struct drm_device *dev, void *data, -- cgit v1.2.3 From 7a3727f385dc64773db1c144f6b15c1e9d4735bb Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Fri, 15 Jun 2018 20:06:05 +0100 Subject: drm/i915: Enable provoking vertex fix on Gen9 systems. The SF and clipper units mishandle the provoking vertex in some cases, which can cause misrendering with shaders that use flat shaded inputs. There are chicken bits in 3D_CHICKEN3 (for SF) and FF_SLICE_CHICKEN (for the clipper) that work around the issue. These registers are unfortunately not part of the logical context (even the power context), and so we must reload them every time we start executing in a context. Bugzilla: https://bugs.freedesktop.org/103047 Signed-off-by: Kenneth Graunke Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20180615190605.16238-1-chris@chris-wilson.co.uk Reviewed-by: Joonas Lahtinen Cc: stable@vger.kernel.org (cherry picked from commit b77422f80337d363eed60c8c48db9cb6e33085c9) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_reg.h | 5 +++++ drivers/gpu/drm/i915/intel_lrc.c | 12 +++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index f11bb213ec07..7720569f2024 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -2425,12 +2425,17 @@ enum i915_power_well_id { #define _3D_CHICKEN _MMIO(0x2084) #define _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB (1 << 10) #define _3D_CHICKEN2 _MMIO(0x208c) + +#define FF_SLICE_CHICKEN _MMIO(0x2088) +#define FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX (1 << 1) + /* Disables pipelining of read flushes past the SF-WIZ interface. * Required on all Ironlake steppings according to the B-Spec, but the * particular danger of not doing so is not specified. */ # define _3D_CHICKEN2_WM_READ_PIPELINED (1 << 14) #define _3D_CHICKEN3 _MMIO(0x2090) +#define _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX (1 << 12) #define _3D_CHICKEN_SF_DISABLE_OBJEND_CULL (1 << 10) #define _3D_CHICKEN3_AA_LINE_QUALITY_FIX_ENABLE (1 << 5) #define _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL (1 << 5) diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index f3968580e5e2..7c4c8fb1dae4 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -1545,11 +1545,21 @@ static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ batch = gen8_emit_flush_coherentl3_wa(engine, batch); + *batch++ = MI_LOAD_REGISTER_IMM(3); + /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ - *batch++ = MI_LOAD_REGISTER_IMM(1); *batch++ = i915_mmio_reg_offset(COMMON_SLICE_CHICKEN2); *batch++ = _MASKED_BIT_DISABLE( GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE); + + /* BSpec: 11391 */ + *batch++ = i915_mmio_reg_offset(FF_SLICE_CHICKEN); + *batch++ = _MASKED_BIT_ENABLE(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX); + + /* BSpec: 11299 */ + *batch++ = i915_mmio_reg_offset(_3D_CHICKEN3); + *batch++ = _MASKED_BIT_ENABLE(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX); + *batch++ = MI_NOOP; /* WaClearSlmSpaceAtContextSwitch:kbl */ -- cgit v1.2.3 From 2dbf8dffbf35fd8f611083b9d9fe74fdccf912a3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 15 Jun 2018 15:58:45 -0400 Subject: pNFS: Always free the session slot on error in nfs4_layoutget_handle_exception Right now, we can call nfs_commit_inode() while holding the session slot, which could lead to NFSv4 deadlocks. Ensure we only keep the slot if the server returned a layout that we have to process. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ed45090e4df6..2c8c2696415e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8650,6 +8650,8 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); + nfs4_sequence_free_slot(&lgp->res.seq_res); + switch (nfs4err) { case 0: goto out; @@ -8714,7 +8716,6 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, goto out; } - nfs4_sequence_free_slot(&lgp->res.seq_res); err = nfs4_handle_exception(server, nfs4err, exception); if (!status) { if (exception->retry) @@ -8786,20 +8787,22 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) if (IS_ERR(task)) return ERR_CAST(task); status = rpc_wait_for_completion_task(task); - if (status == 0) { + if (status != 0) + goto out; + + /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ + if (task->tk_status < 0 || lgp->res.layoutp->len == 0) { status = nfs4_layoutget_handle_exception(task, lgp, &exception); *timeout = exception.timeout; - } - + } else + lseg = pnfs_layout_process(lgp); +out: trace_nfs4_layoutget(lgp->args.ctx, &lgp->args.range, &lgp->res.range, &lgp->res.stateid, status); - /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ - if (status == 0 && lgp->res.layoutp->len) - lseg = pnfs_layout_process(lgp); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); if (status) -- cgit v1.2.3 From c8bf70735382401a161d9c818e8ea89500812d0c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 15 Jun 2018 16:31:02 -0400 Subject: pNFS: Don't send layoutreturn if the layout is already invalid If the layout was invalidated due to a reboot, then don't try to send a layoutreturn for it. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 16 ++++++++++++++++ fs/nfs/pnfs.h | 5 +++++ 2 files changed, 21 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2c8c2696415e..6dd146885da9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3294,6 +3294,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) struct nfs4_closedata *calldata = data; struct nfs4_state *state = calldata->state; struct inode *inode = calldata->inode; + struct pnfs_layout_hdr *lo; bool is_rdonly, is_wronly, is_rdwr; int call_close = 0; @@ -3337,6 +3338,12 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) goto out_wait; } + lo = calldata->arg.lr_args ? calldata->arg.lr_args->layout : NULL; + if (lo && !pnfs_layout_is_valid(lo)) { + calldata->arg.lr_args = NULL; + calldata->res.lr_res = NULL; + } + if (calldata->arg.fmode == 0) task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; @@ -5972,12 +5979,19 @@ static void nfs4_delegreturn_release(void *calldata) static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) { struct nfs4_delegreturndata *d_data; + struct pnfs_layout_hdr *lo; d_data = (struct nfs4_delegreturndata *)data; if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) return; + lo = d_data->args.lr_args ? d_data->args.lr_args->layout : NULL; + if (lo && !pnfs_layout_is_valid(lo)) { + d_data->args.lr_args = NULL; + d_data->res.lr_res = NULL; + } + nfs4_setup_sequence(d_data->res.server->nfs_client, &d_data->args.seq_args, &d_data->res.seq_res, @@ -8820,6 +8834,8 @@ nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata) &lrp->args.seq_args, &lrp->res.seq_res, task); + if (!pnfs_layout_is_valid(lrp->args.layout)) + rpc_exit(task, 0); } static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index a8f5e6b16749..3fe81424337d 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -801,6 +801,11 @@ static inline void nfs4_lgopen_release(struct nfs4_layoutget *lgp) { } +static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo) +{ + return false; +} + #endif /* CONFIG_NFS_V4_1 */ #if IS_ENABLED(CONFIG_NFS_V4_2) -- cgit v1.2.3 From 0dae72d581dfe795aedaf5523c1faeb18958b1a7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 18 Jun 2018 15:55:43 -0400 Subject: sunrpc: Prevent duplicate XID allocation Krzysztof Kozlowski reports that a heavy NFSv4 WRITE workload against a slow NFS server causes his Raspberry Pi clients to stall. Krzysztof bisected it to commit 37ac86c3a76c ("SUNRPC: Initialize rpc_rqst outside of xprt->reserve_lock") . I was able to reproduce similar behavior and it appears that rarely the RPC client layer is re-allocating an XID for an RPC that it has already partially sent. This results in the client ignoring the subsequent reply, which carries the original XID. For various reasons, checking !req->rq_xmit_bytes_sent in xprt_prepare_transmit is not a 100% reliable mechanism for determining when a fresh XID is needed. Trond's preference is to allocate the XID at the time each rpc_rqst slot is initialized. This patch should also address a gcc 4.1.2 complaint reported by Geert Uytterhoeven . Reported-by: Krzysztof Kozlowski Fixes: 37ac86c3a76c ("SUNRPC: Initialize rpc_rqst outside of ... ") Signed-off-by: Chuck Lever Tested-by: Krzysztof Kozlowski Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 3c85af058227..3fabf9f6a0f9 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -987,8 +987,6 @@ bool xprt_prepare_transmit(struct rpc_task *task) task->tk_status = -EAGAIN; goto out_unlock; } - if (!bc_prealloc(req) && !req->rq_xmit_bytes_sent) - req->rq_xid = xprt_alloc_xid(xprt); ret = true; out_unlock: spin_unlock_bh(&xprt->transport_lock); @@ -1298,7 +1296,12 @@ void xprt_retry_reserve(struct rpc_task *task) static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt) { - return (__force __be32)xprt->xid++; + __be32 xid; + + spin_lock(&xprt->reserve_lock); + xid = (__force __be32)xprt->xid++; + spin_unlock(&xprt->reserve_lock); + return xid; } static inline void xprt_init_xid(struct rpc_xprt *xprt) @@ -1316,6 +1319,7 @@ void xprt_request_init(struct rpc_task *task) req->rq_task = task; req->rq_xprt = xprt; req->rq_buffer = NULL; + req->rq_xid = xprt_alloc_xid(xprt); req->rq_connect_cookie = xprt->connect_cookie - 1; req->rq_bytes_sent = 0; req->rq_snd_buf.len = 0; -- cgit v1.2.3 From 93efbd39870474cc536b9caf4a6efeb03b0bc56f Mon Sep 17 00:00:00 2001 From: Zhouyang Jia Date: Sat, 16 Jun 2018 01:05:01 +0800 Subject: scsi: xen-scsifront: add error handling for xenbus_printf When xenbus_printf fails, the lack of error-handling code may cause unexpected results. This patch adds error-handling code after calling xenbus_printf. Signed-off-by: Zhouyang Jia Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/scsi/xen-scsifront.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/xen-scsifront.c b/drivers/scsi/xen-scsifront.c index 36f59a1be7e9..61389bdc7926 100644 --- a/drivers/scsi/xen-scsifront.c +++ b/drivers/scsi/xen-scsifront.c @@ -654,10 +654,17 @@ static int scsifront_dev_reset_handler(struct scsi_cmnd *sc) static int scsifront_sdev_configure(struct scsi_device *sdev) { struct vscsifrnt_info *info = shost_priv(sdev->host); + int err; - if (info && current == info->curr) - xenbus_printf(XBT_NIL, info->dev->nodename, + if (info && current == info->curr) { + err = xenbus_printf(XBT_NIL, info->dev->nodename, info->dev_state_path, "%d", XenbusStateConnected); + if (err) { + xenbus_dev_error(info->dev, err, + "%s: writing dev_state_path", __func__); + return err; + } + } return 0; } @@ -665,10 +672,15 @@ static int scsifront_sdev_configure(struct scsi_device *sdev) static void scsifront_sdev_destroy(struct scsi_device *sdev) { struct vscsifrnt_info *info = shost_priv(sdev->host); + int err; - if (info && current == info->curr) - xenbus_printf(XBT_NIL, info->dev->nodename, + if (info && current == info->curr) { + err = xenbus_printf(XBT_NIL, info->dev->nodename, info->dev_state_path, "%d", XenbusStateClosed); + if (err) + xenbus_dev_error(info->dev, err, + "%s: writing dev_state_path", __func__); + } } static struct scsi_host_template scsifront_sht = { @@ -1003,9 +1015,12 @@ static void scsifront_do_lun_hotplug(struct vscsifrnt_info *info, int op) if (scsi_add_device(info->host, chn, tgt, lun)) { dev_err(&dev->dev, "scsi_add_device\n"); - xenbus_printf(XBT_NIL, dev->nodename, + err = xenbus_printf(XBT_NIL, dev->nodename, info->dev_state_path, "%d", XenbusStateClosed); + if (err) + xenbus_dev_error(dev, err, + "%s: writing dev_state_path", __func__); } break; case VSCSIFRONT_OP_DEL_LUN: @@ -1019,10 +1034,14 @@ static void scsifront_do_lun_hotplug(struct vscsifrnt_info *info, int op) } break; case VSCSIFRONT_OP_READD_LUN: - if (device_state == XenbusStateConnected) - xenbus_printf(XBT_NIL, dev->nodename, + if (device_state == XenbusStateConnected) { + err = xenbus_printf(XBT_NIL, dev->nodename, info->dev_state_path, "%d", XenbusStateConnected); + if (err) + xenbus_dev_error(dev, err, + "%s: writing dev_state_path", __func__); + } break; default: break; -- cgit v1.2.3 From 7c63ca24c878e0051c91904b72174029320ef4bd Mon Sep 17 00:00:00 2001 From: Zhouyang Jia Date: Sat, 16 Jun 2018 08:14:37 +0800 Subject: xen/scsiback: add error handling for xenbus_printf When xenbus_printf fails, the lack of error-handling code may cause unexpected results. This patch adds error-handling code after calling xenbus_printf. Signed-off-by: Zhouyang Jia Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/xen/xen-scsiback.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c index 7bc88fd43cfc..e2f3e8b0fba9 100644 --- a/drivers/xen/xen-scsiback.c +++ b/drivers/xen/xen-scsiback.c @@ -1012,6 +1012,7 @@ static void scsiback_do_add_lun(struct vscsibk_info *info, const char *state, { struct v2p_entry *entry; unsigned long flags; + int err; if (try) { spin_lock_irqsave(&info->v2p_lock, flags); @@ -1027,8 +1028,11 @@ static void scsiback_do_add_lun(struct vscsibk_info *info, const char *state, scsiback_del_translation_entry(info, vir); } } else if (!try) { - xenbus_printf(XBT_NIL, info->dev->nodename, state, + err = xenbus_printf(XBT_NIL, info->dev->nodename, state, "%d", XenbusStateClosed); + if (err) + xenbus_dev_error(info->dev, err, + "%s: writing %s", __func__, state); } } @@ -1067,8 +1071,11 @@ static void scsiback_do_1lun_hotplug(struct vscsibk_info *info, int op, snprintf(str, sizeof(str), "vscsi-devs/%s/p-dev", ent); val = xenbus_read(XBT_NIL, dev->nodename, str, NULL); if (IS_ERR(val)) { - xenbus_printf(XBT_NIL, dev->nodename, state, + err = xenbus_printf(XBT_NIL, dev->nodename, state, "%d", XenbusStateClosed); + if (err) + xenbus_dev_error(info->dev, err, + "%s: writing %s", __func__, state); return; } strlcpy(phy, val, VSCSI_NAMELEN); @@ -1079,8 +1086,11 @@ static void scsiback_do_1lun_hotplug(struct vscsibk_info *info, int op, err = xenbus_scanf(XBT_NIL, dev->nodename, str, "%u:%u:%u:%u", &vir.hst, &vir.chn, &vir.tgt, &vir.lun); if (XENBUS_EXIST_ERR(err)) { - xenbus_printf(XBT_NIL, dev->nodename, state, + err = xenbus_printf(XBT_NIL, dev->nodename, state, "%d", XenbusStateClosed); + if (err) + xenbus_dev_error(info->dev, err, + "%s: writing %s", __func__, state); return; } -- cgit v1.2.3 From e08ecba17b72aeb01859601bc242a5bc48620109 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 19 Jun 2018 21:51:55 +1000 Subject: powerpc/64s: Fix build failures with CONFIG_NMI_IPI=n I broke the build when CONFIG_NMI_IPI=n with my recent commit to add arch_trigger_cpumask_backtrace(), eg: stacktrace.c:(.text+0x1b0): undefined reference to `.smp_send_safe_nmi_ipi' We should rework the CONFIG symbols here in future to avoid these double barrelled ifdefs but for now they fix the build. Fixes: 5cc05910f26e ("powerpc/64s: Wire up arch_trigger_cpumask_backtrace()") Reported-by: Christophe LEROY Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/nmi.h | 2 +- arch/powerpc/kernel/stacktrace.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h index 0f571e0ebca1..bd9ba8defd72 100644 --- a/arch/powerpc/include/asm/nmi.h +++ b/arch/powerpc/include/asm/nmi.h @@ -8,7 +8,7 @@ extern void arch_touch_nmi_watchdog(void); static inline void arch_touch_nmi_watchdog(void) {} #endif -#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_STACKTRACE) +#if defined(CONFIG_NMI_IPI) && defined(CONFIG_STACKTRACE) extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self); #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 07e97f289c52..e2c50b55138f 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -196,7 +196,7 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk, EXPORT_SYMBOL_GPL(save_stack_trace_tsk_reliable); #endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */ -#ifdef CONFIG_PPC_BOOK3S_64 +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_NMI_IPI) static void handle_backtrace_ipi(struct pt_regs *regs) { nmi_cpu_backtrace(regs); @@ -242,4 +242,4 @@ void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self) { nmi_trigger_cpumask_backtrace(mask, exclude_self, raise_backtrace_ipi); } -#endif /* CONFIG_PPC64 */ +#endif /* defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_NMI_IPI) */ -- cgit v1.2.3 From dd65a941f6ba473a5cb9d013d57fa43b48450a04 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Tue, 12 Jun 2018 13:08:40 +0200 Subject: arm64: dma-mapping: clear buffers allocated with FORCE_CONTIGUOUS flag dma_alloc_*() buffers might be exposed to userspace via mmap() call, so they should be cleared on allocation. In case of IOMMU-based dma-mapping implementation such buffer clearing was missing in the code path for DMA_ATTR_FORCE_CONTIGUOUS flag handling, because dma_alloc_from_contiguous() doesn't honor __GFP_ZERO flag. This patch fixes this issue. For more information on clearing buffers allocated by dma_alloc_* functions, see commit 6829e274a623 ("arm64: dma-mapping: always clear allocated buffers"). Fixes: 44176bb38fa4 ("arm64: Add support for DMA_ATTR_FORCE_CONTIGUOUS to IOMMU") Signed-off-by: Marek Szyprowski Signed-off-by: Catalin Marinas --- arch/arm64/mm/dma-mapping.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 49e217ac7e1e..61e93f0b5482 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -583,13 +583,14 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size, size >> PAGE_SHIFT); return NULL; } - if (!coherent) - __dma_flush_area(page_to_virt(page), iosize); - addr = dma_common_contiguous_remap(page, size, VM_USERMAP, prot, __builtin_return_address(0)); - if (!addr) { + if (addr) { + memset(addr, 0, size); + if (!coherent) + __dma_flush_area(page_to_virt(page), iosize); + } else { iommu_dma_unmap_page(dev, *handle, iosize, 0, attrs); dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT); -- cgit v1.2.3 From b154886f7892499d0d3054026e19dfb9a731df61 Mon Sep 17 00:00:00 2001 From: Zhizhou Zhang Date: Tue, 12 Jun 2018 17:07:37 +0800 Subject: arm64: make secondary_start_kernel() notrace We can't call function trace hook before setup percpu offset. When entering secondary_start_kernel(), percpu offset has not been initialized. So this lead hotplug malfunction. Here is the flow to reproduce this bug: echo 0 > /sys/devices/system/cpu/cpu1/online echo function > /sys/kernel/debug/tracing/current_tracer echo 1 > /sys/kernel/debug/tracing/tracing_on echo 1 > /sys/devices/system/cpu/cpu1/online Acked-by: Mark Rutland Tested-by: Suzuki K Poulose Signed-off-by: Zhizhou Zhang Signed-off-by: Catalin Marinas --- arch/arm64/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index f3e2e3aec0b0..2faa9863d2e5 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -179,7 +179,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) * This is the secondary CPU boot entry. We're using this CPUs * idle thread stack, but a set of temporary page tables. */ -asmlinkage void secondary_start_kernel(void) +asmlinkage notrace void secondary_start_kernel(void) { u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK; struct mm_struct *mm = &init_mm; -- cgit v1.2.3 From 42f86b44a4d356edba626171dfe0be061fc695af Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 18 Jun 2018 19:07:24 -0400 Subject: pNFS/flexfiles: Don't tie up all the rpciod threads in resends We do not want to have rpciod threads perform recursive calls into the RPC layer since that can deadlock. In particular, having to wait for a layoutget can be nasty... We want rather to defer scheduling those retries until we're in the rpc_release() callback, since that is called from the nfsiod workqueue. Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 11 ++++++++--- include/linux/nfs_xdr.h | 2 ++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 3ae038d9c292..336b4d560e2c 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1243,17 +1243,18 @@ static int ff_layout_read_done_cb(struct rpc_task *task, hdr->ds_clp, hdr->lseg, hdr->pgio_mirror_idx); + clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); + clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags); switch (err) { case -NFS4ERR_RESET_TO_PNFS: if (ff_layout_choose_best_ds_for_read(hdr->lseg, hdr->pgio_mirror_idx + 1, &hdr->pgio_mirror_idx)) goto out_eagain; - ff_layout_read_record_layoutstats_done(task, hdr); - pnfs_read_resend_pnfs(hdr); + set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: - ff_layout_reset_read(hdr); + set_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags); return task->tk_status; case -EAGAIN: goto out_eagain; @@ -1403,6 +1404,10 @@ static void ff_layout_read_release(void *data) struct nfs_pgio_header *hdr = data; ff_layout_read_record_layoutstats_done(&hdr->task, hdr); + if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) + pnfs_read_resend_pnfs(hdr); + else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) + ff_layout_reset_read(hdr); pnfs_generic_rw_release(data); } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 9dee3c23895d..712eed156d09 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1438,6 +1438,8 @@ enum { NFS_IOHDR_EOF, NFS_IOHDR_REDO, NFS_IOHDR_STAT, + NFS_IOHDR_RESEND_PNFS, + NFS_IOHDR_RESEND_MDS, }; struct nfs_io_completion; -- cgit v1.2.3 From 7b0df92ac12148098391bf53f3494af17812f264 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 18 Jun 2018 19:23:50 -0400 Subject: pNFS/flexfiles: Process writeback resends from nfsiod context as well Although the writeback resends are more robust than the reads, since they are not immediately rescheduled by the same thread, we are better off processing them in the same place as the reads. Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 336b4d560e2c..1386b774ec95 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1428,12 +1428,14 @@ static int ff_layout_write_done_cb(struct rpc_task *task, hdr->ds_clp, hdr->lseg, hdr->pgio_mirror_idx); + clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); + clear_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags); switch (err) { case -NFS4ERR_RESET_TO_PNFS: - ff_layout_reset_write(hdr, true); + set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: - ff_layout_reset_write(hdr, false); + set_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags); return task->tk_status; case -EAGAIN: return -EAGAIN; @@ -1580,6 +1582,10 @@ static void ff_layout_write_release(void *data) struct nfs_pgio_header *hdr = data; ff_layout_write_record_layoutstats_done(&hdr->task, hdr); + if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) + ff_layout_reset_write(hdr, true); + else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) + ff_layout_reset_write(hdr, false); pnfs_generic_rw_release(data); } -- cgit v1.2.3 From 0cc61e64e21cfc24fa0d938fd148aba4a595163b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Jun 2018 18:40:14 +0200 Subject: block: fix timeout changes for legacy request drivers blk_mq_complete_request can only be called for blk-mq drivers, but when removing the BLK_EH_HANDLED return value, two legacy request timeout methods incorrectly got switched to call blk_mq_complete_request. Call __blk_complete_request instead to reinstance the previous behavior. For that __blk_complete_request needs to be exported. Fixes: 1fc2b62e ("scsi_transport_fc: complete requests from ->timeout") Fixes: 0df0bb08 ("null_blk: complete requests from ->timeout") Reported-by: Jianchao Wang Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-softirq.c | 1 + drivers/block/null_blk.c | 2 +- drivers/scsi/scsi_transport_fc.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 01e2b353a2b9..15c1f5e12eb8 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -144,6 +144,7 @@ do_local: local_irq_restore(flags); } +EXPORT_SYMBOL(__blk_complete_request); /** * blk_complete_request - end I/O on a request diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 2bdadd7f1454..3d8bdbe9bd35 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -1365,7 +1365,7 @@ static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio) static enum blk_eh_timer_return null_rq_timed_out_fn(struct request *rq) { pr_info("null: rq %p timed out\n", rq); - blk_mq_complete_request(rq); + __blk_complete_request(rq); return BLK_EH_DONE; } diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c index 1da3d71e9f61..13948102ca29 100644 --- a/drivers/scsi/scsi_transport_fc.c +++ b/drivers/scsi/scsi_transport_fc.c @@ -3592,7 +3592,7 @@ fc_bsg_job_timeout(struct request *req) /* the blk_end_sync_io() doesn't check the error */ if (inflight) - blk_mq_complete_request(req); + __blk_complete_request(req); return BLK_EH_DONE; } -- cgit v1.2.3 From 91c822c33066b7c4f8cc47d7532f47e3bb89979b Mon Sep 17 00:00:00 2001 From: Rajan Vaja Date: Mon, 18 Jun 2018 13:01:02 +0530 Subject: drm/amd/pp: Fix uninitialized variable Initialize variable to 0 before performing logical OR operation. Reviewed-by: Rex Zhu Signed-off-by: Rajan Vaja Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/powerplay/hwmgr/vega10_powertune.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_powertune.c b/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_powertune.c index dbe4b1f66784..22364875a943 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_powertune.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega10_powertune.c @@ -1090,7 +1090,7 @@ static int vega10_disable_se_edc_config(struct pp_hwmgr *hwmgr) static int vega10_enable_psm_gc_edc_config(struct pp_hwmgr *hwmgr) { struct amdgpu_device *adev = hwmgr->adev; - int result; + int result = 0; uint32_t num_se = 0; uint32_t count, data; -- cgit v1.2.3 From 6fa39bc1e01dab8b4f54b23e95a181a2ed5a2d38 Mon Sep 17 00:00:00 2001 From: Michel Dänzer Date: Fri, 8 Jun 2018 12:58:15 +0200 Subject: drm/amdgpu: Use kvmalloc_array for allocating VRAM manager nodes array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It can be quite big, and there's no need for it to be physically contiguous. This is less likely to fail under memory pressure (has actually happened while running piglit). Cc: stable@vger.kernel.org Signed-off-by: Michel Dänzer Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 9aca653bec07..9c47e860e5e6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -135,7 +135,8 @@ static int amdgpu_vram_mgr_new(struct ttm_mem_type_manager *man, num_nodes = DIV_ROUND_UP(mem->num_pages, pages_per_node); } - nodes = kcalloc(num_nodes, sizeof(*nodes), GFP_KERNEL); + nodes = kvmalloc_array(num_nodes, sizeof(*nodes), + GFP_KERNEL | __GFP_ZERO); if (!nodes) return -ENOMEM; @@ -190,7 +191,7 @@ error: drm_mm_remove_node(&nodes[i]); spin_unlock(&mgr->lock); - kfree(nodes); + kvfree(nodes); return r == -ENOSPC ? 0 : r; } @@ -229,7 +230,7 @@ static void amdgpu_vram_mgr_del(struct ttm_mem_type_manager *man, atomic64_sub(usage, &mgr->usage); atomic64_sub(vis_usage, &mgr->vis_usage); - kfree(mem->mm_node); + kvfree(mem->mm_node); mem->mm_node = NULL; } -- cgit v1.2.3 From d9fda248046ac035f18a6e663f2f9245b4bf9470 Mon Sep 17 00:00:00 2001 From: Harry Wentland Date: Tue, 8 May 2018 11:33:42 -0400 Subject: drm/amdgpu: Don't default to DC support for Kaveri and older We've had a number of users report failures to detect and light up display with DC with LVDS and VGA. These connector types are not currently supported with DC. I'd like to add support but unfortunately don't have a system with LVDS or VGA available. In order not to cause regressions we should probably fallback to the non-DC driver for ASICs that support VGA and LVDS. These ASICs are: * Bonaire * Kabini * Kaveri * Mullins ASIC support can always be force enabled with amdgpu.dc=1 v2: Keep Hawaii on DC v3: Added Mullins to the list Cc: stable@vger.kernel.org Signed-off-by: Harry Wentland Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 3317d1536f4f..6e5284e6c028 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2158,10 +2158,18 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) switch (asic_type) { #if defined(CONFIG_DRM_AMD_DC) case CHIP_BONAIRE: - case CHIP_HAWAII: case CHIP_KAVERI: case CHIP_KABINI: case CHIP_MULLINS: + /* + * We have systems in the wild with these ASICs that require + * LVDS and VGA support which is not supported with DC. + * + * Fallback to the non-DC driver here by default so as not to + * cause regressions. + */ + return amdgpu_dc > 0; + case CHIP_HAWAII: case CHIP_CARRIZO: case CHIP_STONEY: case CHIP_POLARIS10: -- cgit v1.2.3 From 9c24c10a2c1e1bb478b6bb70612d9e885aee044f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 19 Jun 2018 10:26:40 -0700 Subject: Revert "block: Add warning for bi_next not NULL in bio_endio()" Commit 0ba99ca4838b ("block: Add warning for bi_next not NULL in bio_endio()") breaks the dm driver. end_clone_bio() detects whether or not a bio is the last bio associated with a request by checking the .bi_next field. Commit 0ba99ca4838b clears that field before end_clone_bio() has had a chance to inspect that field. Hence revert commit 0ba99ca4838b. This patch avoids that KASAN reports the following complaint when running the srp-test software (srp-test/run_tests -c -d -r 10 -t 02-mq): ================================================================== BUG: KASAN: use-after-free in bio_advance+0x11b/0x1d0 Read of size 4 at addr ffff8801300e06d0 by task ksoftirqd/0/9 CPU: 0 PID: 9 Comm: ksoftirqd/0 Not tainted 4.18.0-rc1-dbg+ #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014 Call Trace: dump_stack+0xa4/0xf5 print_address_description+0x6f/0x270 kasan_report+0x241/0x360 __asan_load4+0x78/0x80 bio_advance+0x11b/0x1d0 blk_update_request+0xa7/0x5b0 scsi_end_request+0x56/0x320 [scsi_mod] scsi_io_completion+0x7d6/0xb20 [scsi_mod] scsi_finish_command+0x1c0/0x280 [scsi_mod] scsi_softirq_done+0x19a/0x230 [scsi_mod] blk_mq_complete_request+0x160/0x240 scsi_mq_done+0x50/0x1a0 [scsi_mod] srp_recv_done+0x515/0x1330 [ib_srp] __ib_process_cq+0xa0/0xf0 [ib_core] ib_poll_handler+0x38/0xa0 [ib_core] irq_poll_softirq+0xe8/0x1f0 __do_softirq+0x128/0x60d run_ksoftirqd+0x3f/0x60 smpboot_thread_fn+0x352/0x460 kthread+0x1c1/0x1e0 ret_from_fork+0x24/0x30 Allocated by task 1918: save_stack+0x43/0xd0 kasan_kmalloc+0xad/0xe0 kasan_slab_alloc+0x11/0x20 kmem_cache_alloc+0xfe/0x350 mempool_alloc_slab+0x15/0x20 mempool_alloc+0xfb/0x270 bio_alloc_bioset+0x244/0x350 submit_bh_wbc+0x9c/0x2f0 __block_write_full_page+0x299/0x5a0 block_write_full_page+0x16b/0x180 blkdev_writepage+0x18/0x20 __writepage+0x42/0x80 write_cache_pages+0x376/0x8a0 generic_writepages+0xbe/0x110 blkdev_writepages+0xe/0x10 do_writepages+0x9b/0x180 __filemap_fdatawrite_range+0x178/0x1c0 file_write_and_wait_range+0x59/0xc0 blkdev_fsync+0x46/0x80 vfs_fsync_range+0x66/0x100 do_fsync+0x3d/0x70 __x64_sys_fsync+0x21/0x30 do_syscall_64+0x77/0x230 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 9: save_stack+0x43/0xd0 __kasan_slab_free+0x137/0x190 kasan_slab_free+0xe/0x10 kmem_cache_free+0xd3/0x380 mempool_free_slab+0x17/0x20 mempool_free+0x63/0x160 bio_free+0x81/0xa0 bio_put+0x59/0x60 end_bio_bh_io_sync+0x5d/0x70 bio_endio+0x1a7/0x360 blk_update_request+0xd0/0x5b0 end_clone_bio+0xa3/0xd0 [dm_mod] bio_endio+0x1a7/0x360 blk_update_request+0xd0/0x5b0 scsi_end_request+0x56/0x320 [scsi_mod] scsi_io_completion+0x7d6/0xb20 [scsi_mod] scsi_finish_command+0x1c0/0x280 [scsi_mod] scsi_softirq_done+0x19a/0x230 [scsi_mod] blk_mq_complete_request+0x160/0x240 scsi_mq_done+0x50/0x1a0 [scsi_mod] srp_recv_done+0x515/0x1330 [ib_srp] __ib_process_cq+0xa0/0xf0 [ib_core] ib_poll_handler+0x38/0xa0 [ib_core] irq_poll_softirq+0xe8/0x1f0 __do_softirq+0x128/0x60d The buggy address belongs to the object at ffff8801300e0640 which belongs to the cache bio-0 of size 200 The buggy address is located 144 bytes inside of 200-byte region [ffff8801300e0640, ffff8801300e0708) The buggy address belongs to the page: page:ffffea0004c03800 count:1 mapcount:0 mapping:ffff88015a563a00 index:0x0 compound_mapcount: 0 flags: 0x8000000000008100(slab|head) raw: 8000000000008100 dead000000000100 dead000000000200 ffff88015a563a00 raw: 0000000000000000 0000000000330033 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8801300e0580: fb fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc ffff8801300e0600: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb >ffff8801300e0680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8801300e0700: fb fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff8801300e0780: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ================================================================== Cc: Kent Overstreet Fixes: 0ba99ca4838b ("block: Add warning for bi_next not NULL in bio_endio()") Acked-by: Mike Snitzer Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/bio.c | 3 --- block/blk-core.c | 8 +------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/block/bio.c b/block/bio.c index db9a40e9a136..f7e3d88bd0b6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1807,9 +1807,6 @@ again: if (!bio_integrity_endio(bio)) return; - if (WARN_ONCE(bio->bi_next, "driver left bi_next not NULL")) - bio->bi_next = NULL; - /* * Need to have a real endio function for chained bios, otherwise * various corner cases will break (like stacking block devices that diff --git a/block/blk-core.c b/block/blk-core.c index cf0ee764b908..afd2596ea3d3 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -273,10 +273,6 @@ static void req_bio_endio(struct request *rq, struct bio *bio, bio_advance(bio, nbytes); /* don't actually finish bio if it's part of flush sequence */ - /* - * XXX this code looks suspicious - it's not consistent with advancing - * req->bio in caller - */ if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) bio_endio(bio); } @@ -3081,10 +3077,8 @@ bool blk_update_request(struct request *req, blk_status_t error, struct bio *bio = req->bio; unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); - if (bio_bytes == bio->bi_iter.bi_size) { + if (bio_bytes == bio->bi_iter.bi_size) req->bio = bio->bi_next; - bio->bi_next = NULL; - } /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); -- cgit v1.2.3 From 5c53d19b76dccbaf340b11acb837d40c0789049d Mon Sep 17 00:00:00 2001 From: James Zhu Date: Mon, 18 Jun 2018 13:46:16 -0400 Subject: drm/amdgpu:All UVD instances share one idle_work handle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All UVD instanses have only one dpm control, so it is better to share one idle_work handle. Signed-off-by: James Zhu Reviewed-by: Alex Deucher Reviewed-by: Christian König Tested-by: Stefan Agner Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 14 +++++++------- drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c index bcf68f80bbf0..3ff08e326838 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c @@ -130,7 +130,7 @@ int amdgpu_uvd_sw_init(struct amdgpu_device *adev) unsigned version_major, version_minor, family_id; int i, j, r; - INIT_DELAYED_WORK(&adev->uvd.inst->idle_work, amdgpu_uvd_idle_work_handler); + INIT_DELAYED_WORK(&adev->uvd.idle_work, amdgpu_uvd_idle_work_handler); switch (adev->asic_type) { #ifdef CONFIG_DRM_AMDGPU_CIK @@ -314,12 +314,12 @@ int amdgpu_uvd_suspend(struct amdgpu_device *adev) void *ptr; int i, j; + cancel_delayed_work_sync(&adev->uvd.idle_work); + for (j = 0; j < adev->uvd.num_uvd_inst; ++j) { if (adev->uvd.inst[j].vcpu_bo == NULL) continue; - cancel_delayed_work_sync(&adev->uvd.inst[j].idle_work); - /* only valid for physical mode */ if (adev->asic_type < CHIP_POLARIS10) { for (i = 0; i < adev->uvd.max_handles; ++i) @@ -1145,7 +1145,7 @@ int amdgpu_uvd_get_destroy_msg(struct amdgpu_ring *ring, uint32_t handle, static void amdgpu_uvd_idle_work_handler(struct work_struct *work) { struct amdgpu_device *adev = - container_of(work, struct amdgpu_device, uvd.inst->idle_work.work); + container_of(work, struct amdgpu_device, uvd.idle_work.work); unsigned fences = 0, i, j; for (i = 0; i < adev->uvd.num_uvd_inst; ++i) { @@ -1167,7 +1167,7 @@ static void amdgpu_uvd_idle_work_handler(struct work_struct *work) AMD_CG_STATE_GATE); } } else { - schedule_delayed_work(&adev->uvd.inst->idle_work, UVD_IDLE_TIMEOUT); + schedule_delayed_work(&adev->uvd.idle_work, UVD_IDLE_TIMEOUT); } } @@ -1179,7 +1179,7 @@ void amdgpu_uvd_ring_begin_use(struct amdgpu_ring *ring) if (amdgpu_sriov_vf(adev)) return; - set_clocks = !cancel_delayed_work_sync(&adev->uvd.inst->idle_work); + set_clocks = !cancel_delayed_work_sync(&adev->uvd.idle_work); if (set_clocks) { if (adev->pm.dpm_enabled) { amdgpu_dpm_enable_uvd(adev, true); @@ -1196,7 +1196,7 @@ void amdgpu_uvd_ring_begin_use(struct amdgpu_ring *ring) void amdgpu_uvd_ring_end_use(struct amdgpu_ring *ring) { if (!amdgpu_sriov_vf(ring->adev)) - schedule_delayed_work(&ring->adev->uvd.inst->idle_work, UVD_IDLE_TIMEOUT); + schedule_delayed_work(&ring->adev->uvd.idle_work, UVD_IDLE_TIMEOUT); } /** diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h index b1579fba134c..8b23a1b00c76 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.h @@ -44,7 +44,6 @@ struct amdgpu_uvd_inst { void *saved_bo; atomic_t handles[AMDGPU_MAX_UVD_HANDLES]; struct drm_file *filp[AMDGPU_MAX_UVD_HANDLES]; - struct delayed_work idle_work; struct amdgpu_ring ring; struct amdgpu_ring ring_enc[AMDGPU_MAX_UVD_ENC_RINGS]; struct amdgpu_irq_src irq; @@ -62,6 +61,7 @@ struct amdgpu_uvd { bool address_64_bit; bool use_ctx_buf; struct amdgpu_uvd_inst inst[AMDGPU_MAX_UVD_INSTANCES]; + struct delayed_work idle_work; }; int amdgpu_uvd_sw_init(struct amdgpu_device *adev); -- cgit v1.2.3 From 34d6d59986abb1d2cb5415a49b6c50f51ba1d2e4 Mon Sep 17 00:00:00 2001 From: Michel Dänzer Date: Fri, 15 Jun 2018 11:06:56 +0200 Subject: drm/amdgpu: Update pin_size values before unpinning BO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At least in theory, ttm_bo_validate may move the BO, in which case the pin_size accounting would be inconsistent with when the BO was pinned. Cc: stable@vger.kernel.org Signed-off-by: Michel Dänzer Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 5e4e1bd90383..026140f08ee9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -790,15 +790,6 @@ int amdgpu_bo_unpin(struct amdgpu_bo *bo) bo->pin_count--; if (bo->pin_count) return 0; - for (i = 0; i < bo->placement.num_placement; i++) { - bo->placements[i].lpfn = 0; - bo->placements[i].flags &= ~TTM_PL_FLAG_NO_EVICT; - } - r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); - if (unlikely(r)) { - dev_err(adev->dev, "%p validate failed for unpin\n", bo); - goto error; - } if (bo->tbo.mem.mem_type == TTM_PL_VRAM) { adev->vram_pin_size -= amdgpu_bo_size(bo); @@ -808,7 +799,14 @@ int amdgpu_bo_unpin(struct amdgpu_bo *bo) adev->gart_pin_size -= amdgpu_bo_size(bo); } -error: + for (i = 0; i < bo->placement.num_placement; i++) { + bo->placements[i].lpfn = 0; + bo->placements[i].flags &= ~TTM_PL_FLAG_NO_EVICT; + } + r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); + if (unlikely(r)) + dev_err(adev->dev, "%p validate failed for unpin\n", bo); + return r; } -- cgit v1.2.3 From 5e9244ff585239630f15f8ad8e676bc91a94ca9e Mon Sep 17 00:00:00 2001 From: Michel Dänzer Date: Tue, 12 Jun 2018 12:07:33 +0200 Subject: drm/amdgpu: Refactor amdgpu_vram_mgr_bo_invisible_size helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Preparation for the following fix, no functional change intended. Cc: stable@vger.kernel.org Signed-off-by: Michel Dänzer Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 6 ++---- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 16 ++++++++++++++++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 026140f08ee9..3526efa8960e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -762,8 +762,7 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain, domain = amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type); if (domain == AMDGPU_GEM_DOMAIN_VRAM) { adev->vram_pin_size += amdgpu_bo_size(bo); - if (bo->flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS) - adev->invisible_pin_size += amdgpu_bo_size(bo); + adev->invisible_pin_size += amdgpu_vram_mgr_bo_invisible_size(bo); } else if (domain == AMDGPU_GEM_DOMAIN_GTT) { adev->gart_pin_size += amdgpu_bo_size(bo); } @@ -793,8 +792,7 @@ int amdgpu_bo_unpin(struct amdgpu_bo *bo) if (bo->tbo.mem.mem_type == TTM_PL_VRAM) { adev->vram_pin_size -= amdgpu_bo_size(bo); - if (bo->flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS) - adev->invisible_pin_size -= amdgpu_bo_size(bo); + adev->invisible_pin_size -= amdgpu_vram_mgr_bo_invisible_size(bo); } else if (bo->tbo.mem.mem_type == TTM_PL_TT) { adev->gart_pin_size -= amdgpu_bo_size(bo); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h index e969c879d87e..e5da4654b630 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h @@ -73,6 +73,7 @@ bool amdgpu_gtt_mgr_has_gart_addr(struct ttm_mem_reg *mem); uint64_t amdgpu_gtt_mgr_usage(struct ttm_mem_type_manager *man); int amdgpu_gtt_mgr_recover(struct ttm_mem_type_manager *man); +u64 amdgpu_vram_mgr_bo_invisible_size(struct amdgpu_bo *bo); uint64_t amdgpu_vram_mgr_usage(struct ttm_mem_type_manager *man); uint64_t amdgpu_vram_mgr_vis_usage(struct ttm_mem_type_manager *man); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 9c47e860e5e6..ae0049c6c52c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -96,6 +96,22 @@ static u64 amdgpu_vram_mgr_vis_size(struct amdgpu_device *adev, adev->gmc.visible_vram_size : end) - start; } +/** + * amdgpu_vram_mgr_bo_invisible_size - CPU invisible BO size + * + * @bo: &amdgpu_bo buffer object (must be in VRAM) + * + * Returns: + * How much of the given &amdgpu_bo buffer object lies in CPU invisible VRAM. + */ +u64 amdgpu_vram_mgr_bo_invisible_size(struct amdgpu_bo *bo) +{ + if (bo->flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS) + return amdgpu_bo_size(bo); + + return 0; +} + /** * amdgpu_vram_mgr_new - allocate new ranges * -- cgit v1.2.3 From 7303b39e46b2f523334591f05fd9566cf929eb26 Mon Sep 17 00:00:00 2001 From: Michel Dänzer Date: Thu, 14 Jun 2018 13:02:07 +0200 Subject: drm/amdgpu: Make amdgpu_vram_mgr_bo_invisible_size always accurate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even BOs with AMDGPU_GEM_CREATE_NO_CPU_ACCESS may end up at least partially in CPU visible VRAM, in particular when all VRAM is visible. v2: * Don't take VRAM mgr spinlock, not needed (Christian König) * Make loop logic simpler and clearer. Cc: stable@vger.kernel.org Signed-off-by: Michel Dänzer Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index ae0049c6c52c..b6333f92ba45 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -106,10 +106,26 @@ static u64 amdgpu_vram_mgr_vis_size(struct amdgpu_device *adev, */ u64 amdgpu_vram_mgr_bo_invisible_size(struct amdgpu_bo *bo) { - if (bo->flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS) + struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); + struct ttm_mem_reg *mem = &bo->tbo.mem; + struct drm_mm_node *nodes = mem->mm_node; + unsigned pages = mem->num_pages; + u64 usage = 0; + + if (adev->gmc.visible_vram_size == adev->gmc.real_vram_size) + return 0; + + if (mem->start >= adev->gmc.visible_vram_size >> PAGE_SHIFT) return amdgpu_bo_size(bo); - return 0; + while (nodes && pages) { + usage += nodes->size << PAGE_SHIFT; + usage -= amdgpu_vram_mgr_vis_size(adev, nodes); + pages -= nodes->size; + ++nodes; + } + + return usage; } /** -- cgit v1.2.3 From 6fb8656646f996d1eef42e6d56203c4915cb9e08 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Sat, 24 Mar 2018 17:57:49 +0100 Subject: mips: ftrace: fix static function graph tracing ftrace_graph_caller was never run after calling ftrace_trace_function, breaking the function graph tracer. Fix this, bringing it in line with the x86 implementation. While we're at it, also streamline the control flow of _mcount a bit to reduce the number of branches. This issue was reported before: https://www.linux-mips.org/archives/linux-mips/2014-11/msg00295.html Signed-off-by: Matthias Schiffer Tested-by: Matt Redfearn Patchwork: https://patchwork.linux-mips.org/patch/18929/ Signed-off-by: Paul Burton Cc: stable@vger.kernel.org # v3.17+ --- arch/mips/kernel/mcount.S | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/arch/mips/kernel/mcount.S b/arch/mips/kernel/mcount.S index f2ee7e1e3342..cff52b283e03 100644 --- a/arch/mips/kernel/mcount.S +++ b/arch/mips/kernel/mcount.S @@ -119,10 +119,20 @@ NESTED(_mcount, PT_SIZE, ra) EXPORT_SYMBOL(_mcount) PTR_LA t1, ftrace_stub PTR_L t2, ftrace_trace_function /* Prepare t2 for (1) */ - bne t1, t2, static_trace + beq t1, t2, fgraph_trace nop + MCOUNT_SAVE_REGS + + move a0, ra /* arg1: self return address */ + jalr t2 /* (1) call *ftrace_trace_function */ + move a1, AT /* arg2: parent's return address */ + + MCOUNT_RESTORE_REGS + +fgraph_trace: #ifdef CONFIG_FUNCTION_GRAPH_TRACER + PTR_LA t1, ftrace_stub PTR_L t3, ftrace_graph_return bne t1, t3, ftrace_graph_caller nop @@ -131,24 +141,11 @@ EXPORT_SYMBOL(_mcount) bne t1, t3, ftrace_graph_caller nop #endif - b ftrace_stub -#ifdef CONFIG_32BIT - addiu sp, sp, 8 -#else - nop -#endif -static_trace: - MCOUNT_SAVE_REGS - - move a0, ra /* arg1: self return address */ - jalr t2 /* (1) call *ftrace_trace_function */ - move a1, AT /* arg2: parent's return address */ - - MCOUNT_RESTORE_REGS #ifdef CONFIG_32BIT addiu sp, sp, 8 #endif + .globl ftrace_stub ftrace_stub: RETURN_BACK -- cgit v1.2.3 From 4f9de4df901fb84709fe3a864dfa4eaf35700f68 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Mon, 18 Jun 2018 21:58:00 -0700 Subject: qed: Fix possible memory leak in Rx error path handling. Memory for packet buffers need to be freed in the error paths as there is no consumer (e.g., upper layer) for such packets and that memory will never get freed. The issue was uncovered when port was attacked with flood of isatap packets, these are multicast packets hence were directed at all the PFs. For foce PF, this meant they were routed to the ll2 module which in turn drops such packets. Fixes: 0a7fb11c ("qed: Add Light L2 support") Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Ariel Elior Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_ll2.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index c97ebd681c47..012973d75ad0 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -201,8 +201,9 @@ void qed_ll2b_complete_rx_packet(void *cxt, struct qed_ll2_comp_rx_data *data) skb = build_skb(buffer->data, 0); if (!skb) { - rc = -ENOMEM; - goto out_post; + DP_INFO(cdev, "Failed to build SKB\n"); + kfree(buffer->data); + goto out_post1; } data->u.placement_offset += NET_SKB_PAD; @@ -224,8 +225,14 @@ void qed_ll2b_complete_rx_packet(void *cxt, struct qed_ll2_comp_rx_data *data) cdev->ll2->cbs->rx_cb(cdev->ll2->cb_cookie, skb, data->opaque_data_0, data->opaque_data_1); + } else { + DP_VERBOSE(p_hwfn, (NETIF_MSG_RX_STATUS | NETIF_MSG_PKTDATA | + QED_MSG_LL2 | QED_MSG_STORAGE), + "Dropping the packet\n"); + kfree(buffer->data); } +out_post1: /* Update Buffer information and update FW producer */ buffer->data = new_data; buffer->phys_addr = new_phys_addr; -- cgit v1.2.3 From 3935a70968820c3994db4de7e6e1c7e814bff875 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Mon, 18 Jun 2018 21:58:01 -0700 Subject: qed: Add sanity check for SIMD fastpath handler. Avoid calling a SIMD fastpath handler if it is NULL. The check is needed to handle an unlikely scenario where unsolicited interrupt is destined to a PF in INTa mode. Fixes: fe56b9e6a ("qed: Add module with basic common support") Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Ariel Elior Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_main.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index b04d57ca5176..5c10fd7210c3 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -567,8 +567,16 @@ static irqreturn_t qed_single_int(int irq, void *dev_instance) /* Fastpath interrupts */ for (j = 0; j < 64; j++) { if ((0x2ULL << j) & status) { - hwfn->simd_proto_handler[j].func( - hwfn->simd_proto_handler[j].token); + struct qed_simd_fp_handler *p_handler = + &hwfn->simd_proto_handler[j]; + + if (p_handler->func) + p_handler->func(p_handler->token); + else + DP_NOTICE(hwfn, + "Not calling fastpath handler as it is NULL [handler #%d, status 0x%llx]\n", + j, status); + status &= ~(0x2ULL << j); rc = IRQ_HANDLED; } -- cgit v1.2.3 From ff54d5cd9ec15546abc870452dd0b66eef4b4606 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Mon, 18 Jun 2018 21:58:02 -0700 Subject: qed: Do not advertise DCBX_LLD_MANAGED capability. Do not advertise DCBX_LLD_MANAGED capability i.e., do not allow external agent to manage the dcbx/lldp negotiation. MFW acts as lldp agent for qed* devices, and no other lldp agent is allowed to coexist with mfw. Also updated a debug print, to not to display the redundant info. Fixes: a1d8d8a51 ("qed: Add dcbnl support.") Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Ariel Elior Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_dcbx.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c index 8f31406ec894..f0b01385d5cb 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c @@ -255,9 +255,8 @@ qed_dcbx_get_app_protocol_type(struct qed_hwfn *p_hwfn, *type = DCBX_PROTOCOL_ROCE_V2; } else { *type = DCBX_MAX_PROTOCOL_TYPE; - DP_ERR(p_hwfn, - "No action required, App TLV id = 0x%x app_prio_bitmap = 0x%x\n", - id, app_prio_bitmap); + DP_ERR(p_hwfn, "No action required, App TLV entry = 0x%x\n", + app_prio_bitmap); return false; } @@ -1479,8 +1478,8 @@ static u8 qed_dcbnl_getcap(struct qed_dev *cdev, int capid, u8 *cap) *cap = 0x80; break; case DCB_CAP_ATTR_DCBX: - *cap = (DCB_CAP_DCBX_LLD_MANAGED | DCB_CAP_DCBX_VER_CEE | - DCB_CAP_DCBX_VER_IEEE | DCB_CAP_DCBX_STATIC); + *cap = (DCB_CAP_DCBX_VER_CEE | DCB_CAP_DCBX_VER_IEEE | + DCB_CAP_DCBX_STATIC); break; default: *cap = false; @@ -1548,8 +1547,6 @@ static u8 qed_dcbnl_getdcbx(struct qed_dev *cdev) if (!dcbx_info) return 0; - if (dcbx_info->operational.enabled) - mode |= DCB_CAP_DCBX_LLD_MANAGED; if (dcbx_info->operational.ieee) mode |= DCB_CAP_DCBX_VER_IEEE; if (dcbx_info->operational.cee) -- cgit v1.2.3 From c51818d5b793302b0923ade9856574ac28b9333b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 19 Jun 2018 14:33:54 -0700 Subject: bpf, xdp, i40e: fix i40e_build_skb skb reserve and truesize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using skb_reserve(skb, I40E_SKB_PAD + (xdp->data - xdp->data_hard_start)) is clearly wrong since I40E_SKB_PAD already points to the offset where the original xdp->data was sitting since xdp->data_hard_start is defined as xdp->data - i40e_rx_offset(rx_ring) where latter offsets to I40E_SKB_PAD when build skb is used. However, also before cc5b114dcf98 ("bpf, i40e: add meta data support") this seems broken since bpf_xdp_adjust_head() helper could have been used to alter headroom and enlarge / shrink the frame and with that the assumption that the xdp->data remains unchanged does not hold and would push a bogus packet to upper stack. ixgbe got this right in 924708081629 ("ixgbe: add XDP support for pass and drop actions"). In any case, fix it by removing the I40E_SKB_PAD from both skb_reserve() and truesize calculation. Fixes: cc5b114dcf98 ("bpf, i40e: add meta data support") Fixes: 0c8493d90b6b ("i40e: add XDP support for pass and drop actions") Reported-by: Keith Busch Reported-by: Toshiaki Makita Signed-off-by: Daniel Borkmann Cc: Björn Töpel Cc: John Fastabend Tested-by: Keith Busch Acked-by: John Fastabend Acked-by: Alexander Duyck Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 8ffb7454e67c..ed6dbcfd4e96 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2103,9 +2103,8 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, unsigned int truesize = i40e_rx_pg_size(rx_ring) / 2; #else unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + - SKB_DATA_ALIGN(I40E_SKB_PAD + - (xdp->data_end - - xdp->data_hard_start)); + SKB_DATA_ALIGN(xdp->data_end - + xdp->data_hard_start); #endif struct sk_buff *skb; @@ -2124,7 +2123,7 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, return NULL; /* update pointers within the skb to store the data */ - skb_reserve(skb, I40E_SKB_PAD + (xdp->data - xdp->data_hard_start)); + skb_reserve(skb, xdp->data - xdp->data_hard_start); __skb_put(skb, xdp->data_end - xdp->data); if (metasize) skb_metadata_set(skb, metasize); -- cgit v1.2.3 From 87975a0117815b9b63527e8b8d9a9dffa6913132 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 19 Jun 2018 15:08:31 +0930 Subject: net/ncsi: Silence debug messages In normal operation we see this series of messages as the host drives the network device: ftgmac100 1e660000.ethernet eth0: NCSI: LSC AEN - channel 0 state down ftgmac100 1e660000.ethernet eth0: NCSI: suspending channel 0 ftgmac100 1e660000.ethernet eth0: NCSI: configuring channel 0 ftgmac100 1e660000.ethernet eth0: NCSI: channel 0 link down after config ftgmac100 1e660000.ethernet eth0: NCSI interface down ftgmac100 1e660000.ethernet eth0: NCSI: LSC AEN - channel 0 state up ftgmac100 1e660000.ethernet eth0: NCSI: configuring channel 0 ftgmac100 1e660000.ethernet eth0: NCSI interface up ftgmac100 1e660000.ethernet eth0: NCSI: LSC AEN - channel 0 state down ftgmac100 1e660000.ethernet eth0: NCSI: suspending channel 0 ftgmac100 1e660000.ethernet eth0: NCSI: configuring channel 0 ftgmac100 1e660000.ethernet eth0: NCSI: channel 0 link down after config ftgmac100 1e660000.ethernet eth0: NCSI interface down ftgmac100 1e660000.ethernet eth0: NCSI: LSC AEN - channel 0 state up ftgmac100 1e660000.ethernet eth0: NCSI: configuring channel 0 ftgmac100 1e660000.ethernet eth0: NCSI interface up This makes all of these messages netdev_dbg. They are still useful to debug eg. misbehaving network device firmware, but we do not need them filling up the kernel logs in normal operation. Acked-by: Samuel Mendoza-Jonas Signed-off-by: Joel Stanley Signed-off-by: David S. Miller --- drivers/net/ethernet/faraday/ftgmac100.c | 4 ++-- net/ncsi/ncsi-aen.c | 4 ++-- net/ncsi/ncsi-manage.c | 14 +++++++------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c index 78db8e62a83f..ed6c76d20b45 100644 --- a/drivers/net/ethernet/faraday/ftgmac100.c +++ b/drivers/net/ethernet/faraday/ftgmac100.c @@ -1735,8 +1735,8 @@ static void ftgmac100_ncsi_handler(struct ncsi_dev *nd) if (unlikely(nd->state != ncsi_dev_state_functional)) return; - netdev_info(nd->dev, "NCSI interface %s\n", - nd->link_up ? "up" : "down"); + netdev_dbg(nd->dev, "NCSI interface %s\n", + nd->link_up ? "up" : "down"); } static void ftgmac100_setup_clk(struct ftgmac100 *priv) diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c index e7b05de1e6d1..f899ed61bb57 100644 --- a/net/ncsi/ncsi-aen.c +++ b/net/ncsi/ncsi-aen.c @@ -73,8 +73,8 @@ static int ncsi_aen_handler_lsc(struct ncsi_dev_priv *ndp, ncm->data[2] = data; ncm->data[4] = ntohl(lsc->oem_status); - netdev_info(ndp->ndev.dev, "NCSI: LSC AEN - channel %u state %s\n", - nc->id, data & 0x1 ? "up" : "down"); + netdev_dbg(ndp->ndev.dev, "NCSI: LSC AEN - channel %u state %s\n", + nc->id, data & 0x1 ? "up" : "down"); chained = !list_empty(&nc->link); state = nc->state; diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 5561e221b71f..616441c2b54f 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -816,9 +816,9 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) } else { hot_nc = NULL; nc->state = NCSI_CHANNEL_INACTIVE; - netdev_warn(ndp->ndev.dev, - "NCSI: channel %u link down after config\n", - nc->id); + netdev_dbg(ndp->ndev.dev, + "NCSI: channel %u link down after config\n", + nc->id); } spin_unlock_irqrestore(&nc->lock, flags); @@ -1199,14 +1199,14 @@ int ncsi_process_next_channel(struct ncsi_dev_priv *ndp) switch (old_state) { case NCSI_CHANNEL_INACTIVE: ndp->ndev.state = ncsi_dev_state_config; - netdev_info(ndp->ndev.dev, "NCSI: configuring channel %u\n", - nc->id); + netdev_dbg(ndp->ndev.dev, "NCSI: configuring channel %u\n", + nc->id); ncsi_configure_channel(ndp); break; case NCSI_CHANNEL_ACTIVE: ndp->ndev.state = ncsi_dev_state_suspend; - netdev_info(ndp->ndev.dev, "NCSI: suspending channel %u\n", - nc->id); + netdev_dbg(ndp->ndev.dev, "NCSI: suspending channel %u\n", + nc->id); ncsi_suspend_channel(ndp); break; default: -- cgit v1.2.3 From 5d3b146736d5f47d1c806e3043ebc8b627c6277e Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 19 Jun 2018 15:08:32 +0930 Subject: net/ncsi: Drop no more channels message This does not provide useful information. As the ncsi maintainer said: > either we get a channel or broadcom has gone out to lunch Acked-by: Samuel Mendoza-Jonas Signed-off-by: Joel Stanley Signed-off-by: David S. Miller --- net/ncsi/ncsi-manage.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 616441c2b54f..716493a61ba6 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -1226,8 +1226,6 @@ out: return ncsi_choose_active_channel(ndp); } - netdev_printk(KERN_DEBUG, ndp->ndev.dev, - "NCSI: No more channels to process\n"); ncsi_report_link(ndp, false); return -ENODEV; } -- cgit v1.2.3 From 6e42a3f5cdb60e2641472a8d668cce13736e0443 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 19 Jun 2018 15:08:33 +0930 Subject: net/ncsi: Use netdev_dbg for debug messages This moves all of the netdev_printk(KERN_DEBUG, ...) messages over to netdev_dbg. As Joe explains: > netdev_dbg is not included in object code unless > DEBUG is defined or CONFIG_DYNAMIC_DEBUG is set. > And then, it is not emitted into the log unless > DEBUG is set or this specific netdev_dbg is enabled > via the dynamic debug control file. Which is what we're after in this case. Acked-by: Samuel Mendoza-Jonas Signed-off-by: Joel Stanley Signed-off-by: David S. Miller --- net/ncsi/ncsi-aen.c | 6 +++--- net/ncsi/ncsi-manage.c | 33 +++++++++++++++------------------ 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/net/ncsi/ncsi-aen.c b/net/ncsi/ncsi-aen.c index f899ed61bb57..25e483e8278b 100644 --- a/net/ncsi/ncsi-aen.c +++ b/net/ncsi/ncsi-aen.c @@ -148,9 +148,9 @@ static int ncsi_aen_handler_hncdsc(struct ncsi_dev_priv *ndp, hncdsc = (struct ncsi_aen_hncdsc_pkt *)h; ncm->data[3] = ntohl(hncdsc->status); spin_unlock_irqrestore(&nc->lock, flags); - netdev_printk(KERN_DEBUG, ndp->ndev.dev, - "NCSI: host driver %srunning on channel %u\n", - ncm->data[3] & 0x1 ? "" : "not ", nc->id); + netdev_dbg(ndp->ndev.dev, + "NCSI: host driver %srunning on channel %u\n", + ncm->data[3] & 0x1 ? "" : "not ", nc->id); return 0; } diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 716493a61ba6..091284760d21 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -788,8 +788,8 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) } break; case ncsi_dev_state_config_done: - netdev_printk(KERN_DEBUG, ndp->ndev.dev, - "NCSI: channel %u config done\n", nc->id); + netdev_dbg(ndp->ndev.dev, "NCSI: channel %u config done\n", + nc->id); spin_lock_irqsave(&nc->lock, flags); if (nc->reconfigure_needed) { /* This channel's configuration has been updated @@ -804,8 +804,7 @@ static void ncsi_configure_channel(struct ncsi_dev_priv *ndp) list_add_tail_rcu(&nc->link, &ndp->channel_queue); spin_unlock_irqrestore(&ndp->lock, flags); - netdev_printk(KERN_DEBUG, dev, - "Dirty NCSI channel state reset\n"); + netdev_dbg(dev, "Dirty NCSI channel state reset\n"); ncsi_process_next_channel(ndp); break; } @@ -908,9 +907,9 @@ static int ncsi_choose_active_channel(struct ncsi_dev_priv *ndp) } ncm = &found->modes[NCSI_MODE_LINK]; - netdev_printk(KERN_DEBUG, ndp->ndev.dev, - "NCSI: Channel %u added to queue (link %s)\n", - found->id, ncm->data[2] & 0x1 ? "up" : "down"); + netdev_dbg(ndp->ndev.dev, + "NCSI: Channel %u added to queue (link %s)\n", + found->id, ncm->data[2] & 0x1 ? "up" : "down"); out: spin_lock_irqsave(&ndp->lock, flags); @@ -1316,9 +1315,9 @@ static int ncsi_kick_channels(struct ncsi_dev_priv *ndp) if ((ndp->ndev.state & 0xff00) == ncsi_dev_state_config || !list_empty(&nc->link)) { - netdev_printk(KERN_DEBUG, nd->dev, - "NCSI: channel %p marked dirty\n", - nc); + netdev_dbg(nd->dev, + "NCSI: channel %p marked dirty\n", + nc); nc->reconfigure_needed = true; } spin_unlock_irqrestore(&nc->lock, flags); @@ -1336,8 +1335,7 @@ static int ncsi_kick_channels(struct ncsi_dev_priv *ndp) list_add_tail_rcu(&nc->link, &ndp->channel_queue); spin_unlock_irqrestore(&ndp->lock, flags); - netdev_printk(KERN_DEBUG, nd->dev, - "NCSI: kicked channel %p\n", nc); + netdev_dbg(nd->dev, "NCSI: kicked channel %p\n", nc); n++; } } @@ -1368,8 +1366,8 @@ int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) list_for_each_entry_rcu(vlan, &ndp->vlan_vids, list) { n_vids++; if (vlan->vid == vid) { - netdev_printk(KERN_DEBUG, dev, - "NCSI: vid %u already registered\n", vid); + netdev_dbg(dev, "NCSI: vid %u already registered\n", + vid); return 0; } } @@ -1388,7 +1386,7 @@ int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) vlan->vid = vid; list_add_rcu(&vlan->list, &ndp->vlan_vids); - netdev_printk(KERN_DEBUG, dev, "NCSI: Added new vid %u\n", vid); + netdev_dbg(dev, "NCSI: Added new vid %u\n", vid); found = ncsi_kick_channels(ndp) != 0; @@ -1417,8 +1415,7 @@ int ncsi_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) /* Remove the VLAN id from our internal list */ list_for_each_entry_safe(vlan, tmp, &ndp->vlan_vids, list) if (vlan->vid == vid) { - netdev_printk(KERN_DEBUG, dev, - "NCSI: vid %u found, removing\n", vid); + netdev_dbg(dev, "NCSI: vid %u found, removing\n", vid); list_del_rcu(&vlan->list); found = true; kfree(vlan); @@ -1545,7 +1542,7 @@ void ncsi_stop_dev(struct ncsi_dev *nd) } } - netdev_printk(KERN_DEBUG, ndp->ndev.dev, "NCSI: Stopping device\n"); + netdev_dbg(ndp->ndev.dev, "NCSI: Stopping device\n"); ncsi_report_link(ndp, true); } EXPORT_SYMBOL_GPL(ncsi_stop_dev); -- cgit v1.2.3 From 01a21986f8ed52911eafdc728f595d2252b71451 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 19 Jun 2018 15:08:34 +0930 Subject: MAINTAINERS: Add Sam as the maintainer for NCSI Sam has been handing the maintenance of NCSI for a number release cycles now. Acked-by: Samuel Mendoza-Jonas Signed-off-by: Joel Stanley Signed-off-by: David S. Miller --- MAINTAINERS | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index a5f04264ad10..ebb3168fd9e0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9756,6 +9756,11 @@ L: linux-scsi@vger.kernel.org S: Maintained F: drivers/scsi/NCR_D700.* +NCSI LIBRARY: +M: Samuel Mendoza-Jonas +S: Maintained +F: net/ncsi/ + NCT6775 HARDWARE MONITOR DRIVER M: Guenter Roeck L: linux-hwmon@vger.kernel.org -- cgit v1.2.3 From 3256d29fc7aecdf99feb1cb9475ed2252769a8a7 Mon Sep 17 00:00:00 2001 From: Govindarajulu Varadarajan Date: Tue, 19 Jun 2018 08:15:24 -0700 Subject: enic: initialize enic->rfs_h.lock in enic_probe lockdep spotted that we are using rfs_h.lock in enic_get_rxnfc() without initializing. rfs_h.lock is initialized in enic_open(). But ethtool_ops can be called when interface is down. Move enic_rfs_flw_tbl_init to enic_probe. INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. CPU: 18 PID: 1189 Comm: ethtool Not tainted 4.17.0-rc7-devel+ #27 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-20171110_100015-anatol 04/01/2014 Call Trace: dump_stack+0x85/0xc0 register_lock_class+0x550/0x560 ? __handle_mm_fault+0xa8b/0x1100 __lock_acquire+0x81/0x670 lock_acquire+0xb9/0x1e0 ? enic_get_rxnfc+0x139/0x2b0 [enic] _raw_spin_lock_bh+0x38/0x80 ? enic_get_rxnfc+0x139/0x2b0 [enic] enic_get_rxnfc+0x139/0x2b0 [enic] ethtool_get_rxnfc+0x8d/0x1c0 dev_ethtool+0x16c8/0x2400 ? __mutex_lock+0x64d/0xa00 ? dev_load+0x6a/0x150 dev_ioctl+0x253/0x4b0 sock_do_ioctl+0x9a/0x130 sock_ioctl+0x1af/0x350 do_vfs_ioctl+0x8e/0x670 ? syscall_trace_enter+0x1e2/0x380 ksys_ioctl+0x60/0x90 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5a/0x170 entry_SYSCALL_64_after_hwframe+0x49/0xbe Signed-off-by: Govindarajulu Varadarajan Signed-off-by: David S. Miller --- drivers/net/ethernet/cisco/enic/enic_clsf.c | 3 +-- drivers/net/ethernet/cisco/enic/enic_main.c | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c index 973c1fb70d09..99038dfc7fbe 100644 --- a/drivers/net/ethernet/cisco/enic/enic_clsf.c +++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c @@ -79,7 +79,6 @@ void enic_rfs_flw_tbl_init(struct enic *enic) enic->rfs_h.max = enic->config.num_arfs; enic->rfs_h.free = enic->rfs_h.max; enic->rfs_h.toclean = 0; - enic_rfs_timer_start(enic); } void enic_rfs_flw_tbl_free(struct enic *enic) @@ -88,7 +87,6 @@ void enic_rfs_flw_tbl_free(struct enic *enic) enic_rfs_timer_stop(enic); spin_lock_bh(&enic->rfs_h.lock); - enic->rfs_h.free = 0; for (i = 0; i < (1 << ENIC_RFS_FLW_BITSHIFT); i++) { struct hlist_head *hhead; struct hlist_node *tmp; @@ -99,6 +97,7 @@ void enic_rfs_flw_tbl_free(struct enic *enic) enic_delfltr(enic, n->fltr_id); hlist_del(&n->node); kfree(n); + enic->rfs_h.free++; } } spin_unlock_bh(&enic->rfs_h.lock); diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index 30d2eaa18c04..e6ad581eadd8 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -1971,7 +1971,7 @@ static int enic_open(struct net_device *netdev) vnic_intr_unmask(&enic->intr[i]); enic_notify_timer_start(enic); - enic_rfs_flw_tbl_init(enic); + enic_rfs_timer_start(enic); return 0; @@ -2904,6 +2904,7 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) timer_setup(&enic->notify_timer, enic_notify_timer, 0); + enic_rfs_flw_tbl_init(enic); enic_set_rx_coal_setting(enic); INIT_WORK(&enic->reset, enic_reset); INIT_WORK(&enic->tx_hang_reset, enic_tx_hang_reset); -- cgit v1.2.3 From 4e8439aa34802deab11cee68b0ecb18f887fb153 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Sun, 17 Jun 2018 23:40:53 +0200 Subject: net: hamradio: use eth_broadcast_addr The array bpq_eth_addr is only used to get the size of an address, whereas the bcast_addr is used to set the broadcast address. This leads to a warning when using clang: drivers/net/hamradio/bpqether.c:94:13: warning: variable 'bpq_eth_addr' is not needed and will not be emitted [-Wunneeded-internal-declaration] static char bpq_eth_addr[6]; ^ Remove both variables and use the common eth_broadcast_addr to set the broadcast address. Signed-off-by: Stefan Agner Signed-off-by: David S. Miller --- drivers/net/hamradio/bpqether.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c index f347fd9c5b28..777fa59f5e0c 100644 --- a/drivers/net/hamradio/bpqether.c +++ b/drivers/net/hamradio/bpqether.c @@ -89,10 +89,6 @@ static const char banner[] __initconst = KERN_INFO \ "AX.25: bpqether driver version 004\n"; -static char bcast_addr[6]={0xFF,0xFF,0xFF,0xFF,0xFF,0xFF}; - -static char bpq_eth_addr[6]; - static int bpq_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); static int bpq_device_event(struct notifier_block *, unsigned long, void *); @@ -501,8 +497,8 @@ static int bpq_new_device(struct net_device *edev) bpq->ethdev = edev; bpq->axdev = ndev; - memcpy(bpq->dest_addr, bcast_addr, sizeof(bpq_eth_addr)); - memcpy(bpq->acpt_addr, bcast_addr, sizeof(bpq_eth_addr)); + eth_broadcast_addr(bpq->dest_addr); + eth_broadcast_addr(bpq->acpt_addr); err = register_netdevice(ndev); if (err) -- cgit v1.2.3 From 548feb33c598dfaf9f8e066b842441ac49b84a8a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 18 Jun 2018 16:15:57 +0800 Subject: ipvlan: use ETH_MAX_MTU as max mtu Similar to the fixes on team and bonding, this restores the ability to set an ipvlan device's mtu to anything higher than 1500. Fixes: 91572088e3fd ("net: use core MTU range checking in core net infra") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- drivers/net/ipvlan/ipvlan_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 4377c26f714d..d02f0a7c534e 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -693,6 +693,7 @@ void ipvlan_link_setup(struct net_device *dev) { ether_setup(dev); + dev->max_mtu = ETH_MAX_MTU; dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); dev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE; dev->netdev_ops = &ipvlan_netdev_ops; -- cgit v1.2.3 From e5223438280d76ef782592cf643e09441140d14c Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Mon, 18 Jun 2018 15:04:05 +0300 Subject: net: net_failover: fix typo in net_failover_slave_register() Sync both unicast and multicast lists instead of unicast twice. Fixes: cfc80d9a116 ("net: Introduce net_failover driver") Reviewed-by: Joao Martins Signed-off-by: Liran Alon Signed-off-by: David S. Miller --- drivers/net/net_failover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c index 83f7420ddea5..4f390fa557e4 100644 --- a/drivers/net/net_failover.c +++ b/drivers/net/net_failover.c @@ -527,7 +527,7 @@ static int net_failover_slave_register(struct net_device *slave_dev, netif_addr_lock_bh(failover_dev); dev_uc_sync_multiple(slave_dev, failover_dev); - dev_uc_sync_multiple(slave_dev, failover_dev); + dev_mc_sync_multiple(slave_dev, failover_dev); netif_addr_unlock_bh(failover_dev); err = vlan_vids_add_by_dev(slave_dev, failover_dev); -- cgit v1.2.3 From 9b0a8da8c4c6e91012ab03a801acc5d8011c7c2f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 18 Jun 2018 05:24:31 -0700 Subject: net/ipv6: respect rcu grace period before freeing fib6_info syzbot reported use after free that is caused by fib6_info being freed without a proper RCU grace period. CPU: 0 PID: 1407 Comm: udevd Not tainted 4.17.0+ #39 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1b9/0x294 lib/dump_stack.c:113 print_address_description+0x6c/0x20b mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433 __read_once_size include/linux/compiler.h:188 [inline] find_rr_leaf net/ipv6/route.c:705 [inline] rt6_select net/ipv6/route.c:761 [inline] fib6_table_lookup+0x12b7/0x14d0 net/ipv6/route.c:1823 ip6_pol_route+0x1c2/0x1020 net/ipv6/route.c:1856 ip6_pol_route_output+0x54/0x70 net/ipv6/route.c:2082 fib6_rule_lookup+0x211/0x6d0 net/ipv6/fib6_rules.c:122 ip6_route_output_flags+0x2c5/0x350 net/ipv6/route.c:2110 ip6_route_output include/net/ip6_route.h:82 [inline] icmpv6_xrlim_allow net/ipv6/icmp.c:211 [inline] icmp6_send+0x147c/0x2da0 net/ipv6/icmp.c:535 icmpv6_send+0x17a/0x300 net/ipv6/ip6_icmp.c:43 ip6_link_failure+0xa5/0x790 net/ipv6/route.c:2244 dst_link_failure include/net/dst.h:427 [inline] ndisc_error_report+0xd1/0x1c0 net/ipv6/ndisc.c:695 neigh_invalidate+0x246/0x550 net/core/neighbour.c:892 neigh_timer_handler+0xaf9/0xde0 net/core/neighbour.c:978 call_timer_fn+0x230/0x940 kernel/time/timer.c:1326 expire_timers kernel/time/timer.c:1363 [inline] __run_timers+0x79e/0xc50 kernel/time/timer.c:1666 run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692 __do_softirq+0x2e0/0xaf5 kernel/softirq.c:284 invoke_softirq kernel/softirq.c:364 [inline] irq_exit+0x1d1/0x200 kernel/softirq.c:404 exiting_irq arch/x86/include/asm/apic.h:527 [inline] smp_apic_timer_interrupt+0x17e/0x710 arch/x86/kernel/apic/apic.c:1052 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:863 RIP: 0010:strlen+0x5e/0xa0 lib/string.c:482 Code: 24 00 74 3b 48 bb 00 00 00 00 00 fc ff df 4c 89 e0 48 83 c0 01 48 89 c2 48 89 c1 48 c1 ea 03 83 e1 07 0f b6 14 1a 38 ca 7f 04 <84> d2 75 23 80 38 00 75 de 48 83 c4 08 4c 29 e0 5b 41 5c 5d c3 48 RSP: 0018:ffff8801af117850 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13 RAX: ffff880197f53bd0 RBX: dffffc0000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff81c5b06c RDI: ffff880197f53bc0 RBP: ffff8801af117868 R08: ffff88019a976540 R09: 0000000000000000 R10: ffff88019a976540 R11: 0000000000000000 R12: ffff880197f53bc0 R13: ffff880197f53bc0 R14: ffffffff899e4e90 R15: ffff8801d91c6a00 strlen include/linux/string.h:267 [inline] getname_kernel+0x24/0x370 fs/namei.c:218 open_exec+0x17/0x70 fs/exec.c:882 load_elf_binary+0x968/0x5610 fs/binfmt_elf.c:780 search_binary_handler+0x17d/0x570 fs/exec.c:1653 exec_binprm fs/exec.c:1695 [inline] __do_execve_file.isra.35+0x16fe/0x2710 fs/exec.c:1819 do_execveat_common fs/exec.c:1866 [inline] do_execve fs/exec.c:1883 [inline] __do_sys_execve fs/exec.c:1964 [inline] __se_sys_execve fs/exec.c:1959 [inline] __x64_sys_execve+0x8f/0xc0 fs/exec.c:1959 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7f1576a46207 Code: 77 19 f4 48 89 d7 44 89 c0 0f 05 48 3d 00 f0 ff ff 76 e0 f7 d8 64 41 89 01 eb d8 f7 d8 64 41 89 01 eb df b8 3b 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 02 f3 c3 48 8b 15 00 8c 2d 00 f7 d8 64 89 02 RSP: 002b:00007ffff2784568 EFLAGS: 00000202 ORIG_RAX: 000000000000003b RAX: ffffffffffffffda RBX: 00000000ffffffff RCX: 00007f1576a46207 RDX: 0000000001215b10 RSI: 00007ffff2784660 RDI: 00007ffff2785670 RBP: 0000000000625500 R08: 000000000000589c R09: 000000000000589c R10: 0000000000000000 R11: 0000000000000202 R12: 0000000001215b10 R13: 0000000000000007 R14: 0000000001204250 R15: 0000000000000005 Allocated by task 12188: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553 kmem_cache_alloc_trace+0x152/0x780 mm/slab.c:3620 kmalloc include/linux/slab.h:513 [inline] kzalloc include/linux/slab.h:706 [inline] fib6_info_alloc+0xbb/0x280 net/ipv6/ip6_fib.c:152 ip6_route_info_create+0x782/0x2b50 net/ipv6/route.c:3013 ip6_route_add+0x23/0xb0 net/ipv6/route.c:3154 ipv6_route_ioctl+0x5a5/0x760 net/ipv6/route.c:3660 inet6_ioctl+0x100/0x1f0 net/ipv6/af_inet6.c:546 sock_do_ioctl+0xe4/0x3e0 net/socket.c:973 sock_ioctl+0x30d/0x680 net/socket.c:1097 vfs_ioctl fs/ioctl.c:46 [inline] file_ioctl fs/ioctl.c:500 [inline] do_vfs_ioctl+0x1cf/0x16f0 fs/ioctl.c:684 ksys_ioctl+0xa9/0xd0 fs/ioctl.c:701 __do_sys_ioctl fs/ioctl.c:708 [inline] __se_sys_ioctl fs/ioctl.c:706 [inline] __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:706 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 1402: save_stack+0x43/0xd0 mm/kasan/kasan.c:448 set_track mm/kasan/kasan.c:460 [inline] __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528 __cache_free mm/slab.c:3498 [inline] kfree+0xd9/0x260 mm/slab.c:3813 fib6_info_destroy+0x29b/0x350 net/ipv6/ip6_fib.c:207 fib6_info_release include/net/ip6_fib.h:286 [inline] __ip6_del_rt_siblings net/ipv6/route.c:3235 [inline] ip6_route_del+0x11c4/0x13b0 net/ipv6/route.c:3316 ipv6_route_ioctl+0x616/0x760 net/ipv6/route.c:3663 inet6_ioctl+0x100/0x1f0 net/ipv6/af_inet6.c:546 sock_do_ioctl+0xe4/0x3e0 net/socket.c:973 sock_ioctl+0x30d/0x680 net/socket.c:1097 vfs_ioctl fs/ioctl.c:46 [inline] file_ioctl fs/ioctl.c:500 [inline] do_vfs_ioctl+0x1cf/0x16f0 fs/ioctl.c:684 ksys_ioctl+0xa9/0xd0 fs/ioctl.c:701 __do_sys_ioctl fs/ioctl.c:708 [inline] __se_sys_ioctl fs/ioctl.c:706 [inline] __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:706 do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe The buggy address belongs to the object at ffff8801b5df2580 which belongs to the cache kmalloc-256 of size 256 The buggy address is located 8 bytes inside of 256-byte region [ffff8801b5df2580, ffff8801b5df2680) The buggy address belongs to the page: page:ffffea0006d77c80 count:1 mapcount:0 mapping:ffff8801da8007c0 index:0xffff8801b5df2e40 flags: 0x2fffc0000000100(slab) raw: 02fffc0000000100 ffffea0006c5cc48 ffffea0007363308 ffff8801da8007c0 raw: ffff8801b5df2e40 ffff8801b5df2080 0000000100000006 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8801b5df2480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8801b5df2500: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc > ffff8801b5df2580: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8801b5df2600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8801b5df2680: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb Fixes: a64efe142f5e ("net/ipv6: introduce fib6_info struct and helpers") Signed-off-by: Eric Dumazet Cc: David Ahern Reported-by: syzbot+9e6d75e3edef427ee888@syzkaller.appspotmail.com Acked-by: David Ahern Tested-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 5 +++-- net/ipv6/ip6_fib.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 5cba71d2dc44..71b9043aa0e7 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -170,6 +170,7 @@ struct fib6_info { unused:3; struct fib6_nh fib6_nh; + struct rcu_head rcu; }; struct rt6_info { @@ -273,7 +274,7 @@ static inline void ip6_rt_put(struct rt6_info *rt) } struct fib6_info *fib6_info_alloc(gfp_t gfp_flags); -void fib6_info_destroy(struct fib6_info *f6i); +void fib6_info_destroy_rcu(struct rcu_head *head); static inline void fib6_info_hold(struct fib6_info *f6i) { @@ -283,7 +284,7 @@ static inline void fib6_info_hold(struct fib6_info *f6i) static inline void fib6_info_release(struct fib6_info *f6i) { if (f6i && atomic_dec_and_test(&f6i->fib6_ref)) - fib6_info_destroy(f6i); + call_rcu(&f6i->rcu, fib6_info_destroy_rcu); } enum fib6_walk_state { diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 39d1d487eca2..1fb2f3118d60 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -167,8 +167,9 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags) return f6i; } -void fib6_info_destroy(struct fib6_info *f6i) +void fib6_info_destroy_rcu(struct rcu_head *head) { + struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); struct rt6_exception_bucket *bucket; struct dst_metrics *m; @@ -206,7 +207,7 @@ void fib6_info_destroy(struct fib6_info *f6i) kfree(f6i); } -EXPORT_SYMBOL_GPL(fib6_info_destroy); +EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); static struct fib6_node *node_alloc(struct net *net) { -- cgit v1.2.3 From f696a21c229ac3e85bc239efc52f4530b43002c5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Jun 2018 16:20:39 +0200 Subject: ptp: replace getnstimeofday64() with ktime_get_real_ts64() getnstimeofday64() is deprecated and getting replaced throughout the kernel with ktime_get_*() based helpers for a more consistent interface. The two functions do the exact same thing, so this is just a cosmetic change. Signed-off-by: Arnd Bergmann Acked-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/ptp/ptp_chardev.c | 4 ++-- drivers/ptp/ptp_qoriq.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c index 767c485af59b..547dbdac9d54 100644 --- a/drivers/ptp/ptp_chardev.c +++ b/drivers/ptp/ptp_chardev.c @@ -221,7 +221,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) } pct = &sysoff->ts[0]; for (i = 0; i < sysoff->n_samples; i++) { - getnstimeofday64(&ts); + ktime_get_real_ts64(&ts); pct->sec = ts.tv_sec; pct->nsec = ts.tv_nsec; pct++; @@ -230,7 +230,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg) pct->nsec = ts.tv_nsec; pct++; } - getnstimeofday64(&ts); + ktime_get_real_ts64(&ts); pct->sec = ts.tv_sec; pct->nsec = ts.tv_nsec; if (copy_to_user((void __user *)arg, sysoff, sizeof(*sysoff))) diff --git a/drivers/ptp/ptp_qoriq.c b/drivers/ptp/ptp_qoriq.c index 1468a1642b49..e8652c148c52 100644 --- a/drivers/ptp/ptp_qoriq.c +++ b/drivers/ptp/ptp_qoriq.c @@ -374,7 +374,7 @@ static int qoriq_ptp_probe(struct platform_device *dev) pr_err("ioremap ptp registers failed\n"); goto no_ioremap; } - getnstimeofday64(&now); + ktime_get_real_ts64(&now); ptp_qoriq_settime(&qoriq_ptp->caps, &now); tmr_ctrl = -- cgit v1.2.3 From 8c43bd1706885ba1acfa88da02bc60a2ec16f68c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 18 Jun 2018 12:30:37 -0700 Subject: net/tcp: Fix socket lookups with SO_BINDTODEVICE Similar to 69678bcd4d2d ("udp: fix SO_BINDTODEVICE"), TCP socket lookups need to fail if dev_match is not true. Currently, a packet to a given port can match a socket bound to device when it should not. In the VRF case, this causes the lookup to hit a VRF socket and not a global socket resulting in a response trying to go through the VRF when it should not. Fixes: 3fa6f616a7a4d ("net: ipv4: add second dif to inet socket lookups") Fixes: 4297a0ef08572 ("net: ipv6: add second dif to inet6 socket lookups") Reported-by: Lou Berger Diagnosed-by: Renato Westphal Tested-by: Renato Westphal Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 4 ++-- net/ipv6/inet6_hashtables.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 31ff46daae97..3647167c8fa3 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -243,9 +243,9 @@ static inline int compute_score(struct sock *sk, struct net *net, bool dev_match = (sk->sk_bound_dev_if == dif || sk->sk_bound_dev_if == sdif); - if (exact_dif && !dev_match) + if (!dev_match) return -1; - if (sk->sk_bound_dev_if && dev_match) + if (sk->sk_bound_dev_if) score += 4; } if (sk->sk_incoming_cpu == raw_smp_processor_id()) diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 2febe26de6a1..595ad408dba0 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -113,9 +113,9 @@ static inline int compute_score(struct sock *sk, struct net *net, bool dev_match = (sk->sk_bound_dev_if == dif || sk->sk_bound_dev_if == sdif); - if (exact_dif && !dev_match) + if (!dev_match) return -1; - if (sk->sk_bound_dev_if && dev_match) + if (sk->sk_bound_dev_if) score++; } if (sk->sk_incoming_cpu == raw_smp_processor_id()) -- cgit v1.2.3 From 56f772279a762984f6e9ebbf24a7c829faba5712 Mon Sep 17 00:00:00 2001 From: Govindarajulu Varadarajan Date: Mon, 18 Jun 2018 10:01:05 -0700 Subject: enic: do not overwrite error code In failure path, we overwrite err to what vnic_rq_disable() returns. In case it returns 0, enic_open() returns success in case of error. Reported-by: Ben Hutchings Fixes: e8588e268509 ("enic: enable rq before updating rq descriptors") Signed-off-by: Govindarajulu Varadarajan Signed-off-by: David S. Miller --- drivers/net/ethernet/cisco/enic/enic_main.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index e6ad581eadd8..90c645b8538e 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -1920,7 +1920,7 @@ static int enic_open(struct net_device *netdev) { struct enic *enic = netdev_priv(netdev); unsigned int i; - int err; + int err, ret; err = enic_request_intr(enic); if (err) { @@ -1977,10 +1977,9 @@ static int enic_open(struct net_device *netdev) err_out_free_rq: for (i = 0; i < enic->rq_count; i++) { - err = vnic_rq_disable(&enic->rq[i]); - if (err) - return err; - vnic_rq_clean(&enic->rq[i], enic_free_rq_buf); + ret = vnic_rq_disable(&enic->rq[i]); + if (!ret) + vnic_rq_clean(&enic->rq[i], enic_free_rq_buf); } enic_dev_notify_unset(enic); err_out_free_intr: -- cgit v1.2.3 From 7892bd081045222b9e4027fec279a28d6fe7aa66 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 19 Jun 2018 17:23:17 +0800 Subject: net: propagate dev_get_valid_name return code if dev_get_valid_name failed, propagate its return code and remove the setting err to ENODEV, it will be set to 0 again before dev_change_net_namespace exits. Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 57b7bab5f70b..a5aa1c7444e6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8643,7 +8643,8 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* We get here if we can't use the current device name */ if (!pat) goto out; - if (dev_get_valid_name(net, dev, pat) < 0) + err = dev_get_valid_name(net, dev, pat); + if (err < 0) goto out; } @@ -8655,7 +8656,6 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char dev_close(dev); /* And unlink it from device chain */ - err = -ENODEV; unlist_netdevice(dev); synchronize_net(); -- cgit v1.2.3 From 758380b8155f69b4e2f77f27562f8a7a466749d6 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 12 Jun 2018 19:38:08 +1000 Subject: powerpc/64s/radix: Fix radix_kvm_prefetch_workaround paca access of not possible CPU If possible CPUs are limited (e.g., by kexec), then the kvm prefetch workaround function can access the paca pointer for a !possible CPU. Fixes: d2e60075a3d44 ("powerpc/64: Use array of paca pointers and allocate pacas individually") Cc: stable@kernel.org Reported-by: Pridhiviraj Paidipeddi Tested-by: Pridhiviraj Paidipeddi Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/mm/tlb-radix.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index a734e486664d..1135b43a597c 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -1097,6 +1097,8 @@ extern void radix_kvm_prefetch_workaround(struct mm_struct *mm) for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) { if (sib == cpu) continue; + if (!cpu_possible(sib)) + continue; if (paca_ptrs[sib]->kvm_hstate.kvm_vcpu) flush = true; } -- cgit v1.2.3 From fadd03c615922d8521a2e76d4ba2335891cb2790 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 14 Jun 2018 16:01:52 +0530 Subject: powerpc/mm/hash/4k: Free hugetlb page table caches correctly. With 4k page size for hugetlb we allocate hugepage directories from its on slab cache. With patch 0c4d26802 ("powerpc/book3s64/mm: Simplify the rcu callback for page table free") we missed to free these allocated hugepd tables. Update pgtable_free to handle hugetlb hugepd directory table. Fixes: 0c4d268029bf ("powerpc/book3s64/mm: Simplify the rcu callback for page table free") Signed-off-by: Aneesh Kumar K.V [mpe: Add CONFIG_HUGETLB_PAGE guard to fix build break] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/32/pgalloc.h | 1 + arch/powerpc/include/asm/book3s/64/pgtable-4k.h | 21 +++++++++++++++++++++ arch/powerpc/include/asm/book3s/64/pgtable-64k.h | 9 +++++++++ arch/powerpc/include/asm/book3s/64/pgtable.h | 5 +++++ arch/powerpc/include/asm/nohash/32/pgalloc.h | 1 + arch/powerpc/include/asm/nohash/64/pgalloc.h | 1 + arch/powerpc/mm/hugetlbpage.c | 3 ++- arch/powerpc/mm/pgtable-book3s64.c | 12 ++++++++++++ 8 files changed, 52 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h index 6a6673907e45..e4633803fe43 100644 --- a/arch/powerpc/include/asm/book3s/32/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h @@ -108,6 +108,7 @@ static inline void pgtable_free(void *table, unsigned index_size) } #define check_pgt_cache() do { } while (0) +#define get_hugepd_cache_index(x) (x) #ifdef CONFIG_SMP static inline void pgtable_free_tlb(struct mmu_gather *tlb, diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h index af5f2baac80f..a069dfcac9a9 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h @@ -49,6 +49,27 @@ static inline int hugepd_ok(hugepd_t hpd) } #define is_hugepd(hpd) (hugepd_ok(hpd)) +/* + * 16M and 16G huge page directory tables are allocated from slab cache + * + */ +#define H_16M_CACHE_INDEX (PAGE_SHIFT + H_PTE_INDEX_SIZE + H_PMD_INDEX_SIZE - 24) +#define H_16G_CACHE_INDEX \ + (PAGE_SHIFT + H_PTE_INDEX_SIZE + H_PMD_INDEX_SIZE + H_PUD_INDEX_SIZE - 34) + +static inline int get_hugepd_cache_index(int index) +{ + switch (index) { + case H_16M_CACHE_INDEX: + return HTLB_16M_INDEX; + case H_16G_CACHE_INDEX: + return HTLB_16G_INDEX; + default: + BUG(); + } + /* should not reach */ +} + #else /* !CONFIG_HUGETLB_PAGE */ static inline int pmd_huge(pmd_t pmd) { return 0; } static inline int pud_huge(pud_t pud) { return 0; } diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h index fb4b3ba52339..d7ee249d6890 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h @@ -45,8 +45,17 @@ static inline int hugepd_ok(hugepd_t hpd) { return 0; } + #define is_hugepd(pdep) 0 +/* + * This should never get called + */ +static inline int get_hugepd_cache_index(int index) +{ + BUG(); +} + #else /* !CONFIG_HUGETLB_PAGE */ static inline int pmd_huge(pmd_t pmd) { return 0; } static inline int pud_huge(pud_t pud) { return 0; } diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 63cee159022b..42aafba7a308 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -287,6 +287,11 @@ enum pgtable_index { PMD_INDEX, PUD_INDEX, PGD_INDEX, + /* + * Below are used with 4k page size and hugetlb + */ + HTLB_16M_INDEX, + HTLB_16G_INDEX, }; extern unsigned long __vmalloc_start; diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h index 1707781d2f20..9de40eb614da 100644 --- a/arch/powerpc/include/asm/nohash/32/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h @@ -109,6 +109,7 @@ static inline void pgtable_free(void *table, unsigned index_size) } #define check_pgt_cache() do { } while (0) +#define get_hugepd_cache_index(x) (x) #ifdef CONFIG_SMP static inline void pgtable_free_tlb(struct mmu_gather *tlb, diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index 0e693f322cb2..e2d62d033708 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -141,6 +141,7 @@ static inline void pgtable_free(void *table, int shift) } } +#define get_hugepd_cache_index(x) (x) #ifdef CONFIG_SMP static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) { diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 7c5f479c5c00..8a9a49c13865 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -337,7 +337,8 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif if (shift >= pdshift) hugepd_free(tlb, hugepte); else - pgtable_free_tlb(tlb, hugepte, pdshift - shift); + pgtable_free_tlb(tlb, hugepte, + get_hugepd_cache_index(pdshift - shift)); } static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index c1f4ca45c93a..4afbfbb64bfd 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -409,6 +409,18 @@ static inline void pgtable_free(void *table, int index) case PUD_INDEX: kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table); break; +#if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE) + /* 16M hugepd directory at pud level */ + case HTLB_16M_INDEX: + BUILD_BUG_ON(H_16M_CACHE_INDEX <= 0); + kmem_cache_free(PGT_CACHE(H_16M_CACHE_INDEX), table); + break; + /* 16G hugepd directory at the pgd level */ + case HTLB_16G_INDEX: + BUILD_BUG_ON(H_16G_CACHE_INDEX <= 0); + kmem_cache_free(PGT_CACHE(H_16G_CACHE_INDEX), table); + break; +#endif /* We don't free pgd table via RCU callback */ default: BUG(); -- cgit v1.2.3 From dc45519eb181b5687ac8382361a8aa085acd1fe1 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 19 Jun 2018 14:44:00 +0200 Subject: net: ethernet: fix suspend/resume in davinci_emac This patch reverts commit 3243ff2a05ec ("net: ethernet: davinci_emac: Deduplicate bus_find_device() by name matching") and adds a comment which should stop anyone from reintroducing the same "fix" in the future. We can't use bus_find_device_by_name() here because the device name is not guaranteed to be 'davinci_mdio'. On some systems it can be 'davinci_mdio.0' so we need to use strncmp() against the first part of the string to correctly match it. Fixes: 3243ff2a05ec ("net: ethernet: davinci_emac: Deduplicate bus_find_device() by name matching") Cc: stable@vger.kernel.org Signed-off-by: Bartosz Golaszewski Acked-by: Lukas Wunner Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/davinci_emac.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c index 06d7c9e4dcda..a1a6445b5a7e 100644 --- a/drivers/net/ethernet/ti/davinci_emac.c +++ b/drivers/net/ethernet/ti/davinci_emac.c @@ -1385,6 +1385,11 @@ static int emac_devioctl(struct net_device *ndev, struct ifreq *ifrq, int cmd) return -EOPNOTSUPP; } +static int match_first_device(struct device *dev, void *data) +{ + return !strncmp(dev_name(dev), "davinci_mdio", 12); +} + /** * emac_dev_open - EMAC device open * @ndev: The DaVinci EMAC network adapter @@ -1484,8 +1489,14 @@ static int emac_dev_open(struct net_device *ndev) /* use the first phy on the bus if pdata did not give us a phy id */ if (!phydev && !priv->phy_id) { - phy = bus_find_device_by_name(&mdio_bus_type, NULL, - "davinci_mdio"); + /* NOTE: we can't use bus_find_device_by_name() here because + * the device name is not guaranteed to be 'davinci_mdio'. On + * some systems it can be 'davinci_mdio.0' so we need to use + * strncmp() against the first part of the string to correctly + * match it. + */ + phy = bus_find_device(&mdio_bus_type, NULL, NULL, + match_first_device); if (phy) { priv->phy_id = dev_name(phy); if (!priv->phy_id || !*priv->phy_id) -- cgit v1.2.3 From 0a889b9404c084c6fd145020c939a8f688b3e058 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 19 Jun 2018 15:39:46 +0200 Subject: net/sched: act_ife: fix recursive lock and idr leak a recursive lock warning [1] can be observed with the following script, # $TC actions add action ife encode allow prio pass index 42 IFE type 0xED3E # $TC actions replace action ife encode allow tcindex pass index 42 in case the kernel was unable to run the last command (e.g. because of the impossibility to load 'act_meta_skbtcindex'). For a similar reason, the kernel can leak idr in the error path of tcf_ife_init(), because tcf_idr_release() is not called after successful idr reservation: # $TC actions add action ife encode allow tcindex index 47 IFE type 0xED3E RTNETLINK answers: No such file or directory We have an error talking to the kernel # $TC actions add action ife encode allow tcindex index 47 IFE type 0xED3E RTNETLINK answers: No space left on device We have an error talking to the kernel # $TC actions add action ife encode use mark 7 type 0xfefe pass index 47 IFE type 0xFEFE RTNETLINK answers: No space left on device We have an error talking to the kernel Since tcfa_lock is already taken when the action is being edited, a call to tcf_idr_release() wrongly makes tcf_idr_cleanup() take the same lock again. On the other hand, tcf_idr_release() needs to be called in the error path of tcf_ife_init(), to undo the last tcf_idr_create() invocation. Fix both problems in tcf_ife_init(). Since the cleanup() routine can now be called when ife->params is NULL, also add a NULL pointer check to avoid calling kfree_rcu(NULL, rcu). [1] ============================================ WARNING: possible recursive locking detected 4.17.0-rc4.kasan+ #417 Tainted: G E -------------------------------------------- tc/3932 is trying to acquire lock: 000000005097c9a6 (&(&p->tcfa_lock)->rlock){+...}, at: tcf_ife_cleanup+0x19/0x80 [act_ife] but task is already holding lock: 000000005097c9a6 (&(&p->tcfa_lock)->rlock){+...}, at: tcf_ife_init+0xf6d/0x13c0 [act_ife] other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&(&p->tcfa_lock)->rlock); lock(&(&p->tcfa_lock)->rlock); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by tc/3932: #0: 000000007ca8e990 (rtnl_mutex){+.+.}, at: tcf_ife_init+0xf61/0x13c0 [act_ife] #1: 000000005097c9a6 (&(&p->tcfa_lock)->rlock){+...}, at: tcf_ife_init+0xf6d/0x13c0 [act_ife] stack backtrace: CPU: 3 PID: 3932 Comm: tc Tainted: G E 4.17.0-rc4.kasan+ #417 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 Call Trace: dump_stack+0x9a/0xeb __lock_acquire+0xf43/0x34a0 ? debug_check_no_locks_freed+0x2b0/0x2b0 ? debug_check_no_locks_freed+0x2b0/0x2b0 ? debug_check_no_locks_freed+0x2b0/0x2b0 ? __mutex_lock+0x62f/0x1240 ? kvm_sched_clock_read+0x1a/0x30 ? sched_clock+0x5/0x10 ? sched_clock_cpu+0x18/0x170 ? find_held_lock+0x39/0x1d0 ? lock_acquire+0x10b/0x330 lock_acquire+0x10b/0x330 ? tcf_ife_cleanup+0x19/0x80 [act_ife] _raw_spin_lock_bh+0x38/0x70 ? tcf_ife_cleanup+0x19/0x80 [act_ife] tcf_ife_cleanup+0x19/0x80 [act_ife] __tcf_idr_release+0xff/0x350 tcf_ife_init+0xdde/0x13c0 [act_ife] ? ife_exit_net+0x290/0x290 [act_ife] ? __lock_is_held+0xb4/0x140 tcf_action_init_1+0x67b/0xad0 ? tcf_action_dump_old+0xa0/0xa0 ? sched_clock+0x5/0x10 ? sched_clock_cpu+0x18/0x170 ? kvm_sched_clock_read+0x1a/0x30 ? sched_clock+0x5/0x10 ? sched_clock_cpu+0x18/0x170 ? memset+0x1f/0x40 tcf_action_init+0x30f/0x590 ? tcf_action_init_1+0xad0/0xad0 ? memset+0x1f/0x40 tc_ctl_action+0x48e/0x5e0 ? mutex_lock_io_nested+0x1160/0x1160 ? tca_action_gd+0x990/0x990 ? sched_clock+0x5/0x10 ? find_held_lock+0x39/0x1d0 rtnetlink_rcv_msg+0x4da/0x990 ? validate_linkmsg+0x680/0x680 ? sched_clock_cpu+0x18/0x170 ? find_held_lock+0x39/0x1d0 netlink_rcv_skb+0x127/0x350 ? validate_linkmsg+0x680/0x680 ? netlink_ack+0x970/0x970 ? __kmalloc_node_track_caller+0x304/0x3a0 netlink_unicast+0x40f/0x5d0 ? netlink_attachskb+0x580/0x580 ? _copy_from_iter_full+0x187/0x760 ? import_iovec+0x90/0x390 netlink_sendmsg+0x67f/0xb50 ? netlink_unicast+0x5d0/0x5d0 ? copy_msghdr_from_user+0x206/0x340 ? netlink_unicast+0x5d0/0x5d0 sock_sendmsg+0xb3/0xf0 ___sys_sendmsg+0x60a/0x8b0 ? copy_msghdr_from_user+0x340/0x340 ? lock_downgrade+0x5e0/0x5e0 ? tty_write_lock+0x18/0x50 ? kvm_sched_clock_read+0x1a/0x30 ? sched_clock+0x5/0x10 ? sched_clock_cpu+0x18/0x170 ? find_held_lock+0x39/0x1d0 ? lock_downgrade+0x5e0/0x5e0 ? lock_acquire+0x10b/0x330 ? __audit_syscall_entry+0x316/0x690 ? current_kernel_time64+0x6b/0xd0 ? __fget_light+0x55/0x1f0 ? __sys_sendmsg+0xd2/0x170 __sys_sendmsg+0xd2/0x170 ? __ia32_sys_shutdown+0x70/0x70 ? syscall_trace_enter+0x57a/0xd60 ? rcu_read_lock_sched_held+0xdc/0x110 ? __bpf_trace_sys_enter+0x10/0x10 ? do_syscall_64+0x22/0x480 do_syscall_64+0xa5/0x480 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7fd646988ba0 RSP: 002b:00007fffc9fab3c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007fffc9fab4f0 RCX: 00007fd646988ba0 RDX: 0000000000000000 RSI: 00007fffc9fab440 RDI: 0000000000000003 RBP: 000000005b28c8b3 R08: 0000000000000002 R09: 0000000000000000 R10: 00007fffc9faae20 R11: 0000000000000246 R12: 0000000000000000 R13: 00007fffc9fab504 R14: 0000000000000001 R15: 000000000066c100 Fixes: 4e8c86155010 ("net sched: net sched: ife action fix late binding") Fixes: ef6980b6becb ("introduce IFE action") Signed-off-by: Davide Caratti Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/act_ife.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 8527cfdc446d..078d52212172 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -415,7 +415,8 @@ static void tcf_ife_cleanup(struct tc_action *a) spin_unlock_bh(&ife->tcf_lock); p = rcu_dereference_protected(ife->params, 1); - kfree_rcu(p, rcu); + if (p) + kfree_rcu(p, rcu); } /* under ife->tcf_lock for existing action */ @@ -543,10 +544,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, NULL, NULL); if (err) { metadata_parse_err: - if (exists) - tcf_idr_release(*a, bind); if (ret == ACT_P_CREATED) - _tcf_ife_cleanup(*a); + tcf_idr_release(*a, bind); if (exists) spin_unlock_bh(&ife->tcf_lock); @@ -567,7 +566,7 @@ metadata_parse_err: err = use_all_metadata(ife); if (err) { if (ret == ACT_P_CREATED) - _tcf_ife_cleanup(*a); + tcf_idr_release(*a, bind); if (exists) spin_unlock_bh(&ife->tcf_lock); -- cgit v1.2.3 From cbf56c29624fa056a0c1c3d177e67aa51a7fd8d6 Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 19 Jun 2018 15:45:50 +0200 Subject: net/sched: act_ife: preserve the action control in case of error in the following script # tc actions add action ife encode allow prio pass index 42 # tc actions replace action ife encode allow tcindex drop index 42 the action control should remain equal to 'pass', if the kernel failed to replace the TC action. Pospone the assignment of the action control, to ensure it is not overwritten in the error path of tcf_ife_init(). Fixes: ef6980b6becb ("introduce IFE action") Signed-off-by: Davide Caratti Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/act_ife.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 078d52212172..20d7d36b2fc9 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -517,8 +517,6 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, saddr = nla_data(tb[TCA_IFE_SMAC]); } - ife->tcf_action = parm->action; - if (parm->flags & IFE_ENCODE) { if (daddr) ether_addr_copy(p->eth_dst, daddr); @@ -575,6 +573,7 @@ metadata_parse_err: } } + ife->tcf_action = parm->action; if (exists) spin_unlock_bh(&ife->tcf_lock); -- cgit v1.2.3 From 2aee167c3675b088c86f648f834e793a0085e04d Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 19 Jun 2018 16:14:30 +0200 Subject: net/usb/drivers: Remove useless hrtimer_active check The code does: if (hrtimer_active(&t)) hrtimer_cancel(&t); However, hrtimer_cancel() checks if the timer is active, so the test above is pointless. Signed-off-by: Daniel Lezcano Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ncm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index b0e8b9613054..1eaec648bd1f 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -967,8 +967,7 @@ void cdc_ncm_unbind(struct usbnet *dev, struct usb_interface *intf) atomic_set(&ctx->stop, 1); - if (hrtimer_active(&ctx->tx_timer)) - hrtimer_cancel(&ctx->tx_timer); + hrtimer_cancel(&ctx->tx_timer); tasklet_kill(&ctx->bh); -- cgit v1.2.3 From 421780fd498399235b044638e85b352d6da20b6a Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 19 Jun 2018 17:16:20 +0200 Subject: bpfilter: fix build error bpfilter Makefile assumes that the system locale is en_US, and the parsing of objdump output fails. Set LC_ALL=C and, while at it, rewrite the objdump parsing so it spawns only 2 processes instead of 7. Fixes: d2ba09c17a064 ("net: add skeleton of bpfilter kernel module") Signed-off-by: Matteo Croce Signed-off-by: David S. Miller --- net/bpfilter/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile index e0bbe7583e58..dd86b022eff0 100644 --- a/net/bpfilter/Makefile +++ b/net/bpfilter/Makefile @@ -21,8 +21,10 @@ endif # which bpfilter_kern.c passes further into umh blob loader at run-time quiet_cmd_copy_umh = GEN $@ cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \ - $(OBJCOPY) -I binary -O `$(OBJDUMP) -f $<|grep format|cut -d' ' -f8` \ - -B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \ + $(OBJCOPY) -I binary \ + `LC_ALL=C objdump -f net/bpfilter/bpfilter_umh \ + |awk -F' |,' '/file format/{print "-O",$$NF} \ + /^architecture:/{print "-B",$$2}'` \ --rename-section .data=.init.rodata $< $@ $(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh -- cgit v1.2.3 From 8b26a06ad4f2a12425f1f63a0ee57f42961dfd1e Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 19 Jun 2018 17:21:36 +0200 Subject: bpfilter: ignore binary files net/bpfilter/bpfilter_umh is a binary file generated when bpfilter is enabled, add it to .gitignore to avoid committing it. Fixes: d2ba09c17a064 ("net: add skeleton of bpfilter kernel module") Signed-off-by: Matteo Croce Signed-off-by: David S. Miller --- net/bpfilter/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 net/bpfilter/.gitignore diff --git a/net/bpfilter/.gitignore b/net/bpfilter/.gitignore new file mode 100644 index 000000000000..e97084e3eea2 --- /dev/null +++ b/net/bpfilter/.gitignore @@ -0,0 +1 @@ +bpfilter_umh -- cgit v1.2.3 From 684fb246578b9e81fc7b4ca5c71eae22edb650b2 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 19 Jun 2018 10:47:50 -0500 Subject: objtool: Add machine_real_restart() to the noreturn list machine_real_restart() is annotated as '__noreturn", so add it to the objtool noreturn list. This fixes the following warning with clang and CONFIG_CC_OPTIMIZE_FOR_SIZE=y: arch/x86/kernel/reboot.o: warning: objtool: native_machine_emergency_restart() falls through to next function machine_power_off() Reported-by: Matthias Kaehlcke Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Matthias Kaehlcke Reviewed-by: Matthias Kaehlcke Link: https://lkml.kernel.org/r/791712792aa4431bdd55bf1beb33a169ddf3b4a2.1529423255.git.jpoimboe@redhat.com --- tools/objtool/check.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 38047c6aa575..f4a25bd1871f 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -164,6 +164,7 @@ static int __dead_end_function(struct objtool_file *file, struct symbol *func, "lbug_with_loc", "fortify_panic", "usercopy_abort", + "machine_real_restart", }; if (func->bind == STB_WEAK) -- cgit v1.2.3 From 18f3e95b90b28318ef35910d21c39908de672331 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 12 Jun 2018 17:54:42 +0800 Subject: MIPS: io: Add barrier after register read in inX() While a barrier is present in the outX() functions before the register write, a similar barrier is missing in the inX() functions after the register read. This could allow memory accesses following inX() to observe stale data. This patch is very similar to commit a1cc7034e33d12dc1 ("MIPS: io: Add barrier after register read in readX()"). Because war_io_reorder_wmb() is both used by writeX() and outX(), if readX() need a barrier then so does inX(). Cc: stable@vger.kernel.org Signed-off-by: Huacai Chen Patchwork: https://patchwork.linux-mips.org/patch/19516/ Signed-off-by: Paul Burton Cc: James Hogan Cc: linux-mips@linux-mips.org Cc: Fuxin Zhang Cc: Zhangjin Wu Cc: Huacai Chen --- arch/mips/include/asm/io.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/mips/include/asm/io.h b/arch/mips/include/asm/io.h index a7d0b836f2f7..cea8ad864b3f 100644 --- a/arch/mips/include/asm/io.h +++ b/arch/mips/include/asm/io.h @@ -414,6 +414,8 @@ static inline type pfx##in##bwlq##p(unsigned long port) \ __val = *__addr; \ slow; \ \ + /* prevent prefetching of coherent DMA data prematurely */ \ + rmb(); \ return pfx##ioswab##bwlq(__addr, __val); \ } -- cgit v1.2.3 From 9ea141ad54716d48e79d0093052c12ed67debf09 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 14 Jun 2018 10:13:53 -0700 Subject: MIPS: Add support for restartable sequences Implement support for restartable sequences on MIPS, which requires 3 simple things: - Call rseq_handle_notify_resume() on return to userspace if TIF_NOTIFY_RESUME is set. - Call rseq_signal_deliver() to fixup the pre-signal stack frame when a signal is delivered whilst executing a restartable sequence critical section. - Select CONFIG_HAVE_RSEQ. Signed-off-by: Paul Burton Reviewed-by: James Hogan Patchwork: https://patchwork.linux-mips.org/patch/19523/ Cc: Ralf Baechle Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Paul E. McKenney Cc: Boqun Feng Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org --- arch/mips/Kconfig | 1 + arch/mips/kernel/signal.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 3f9deec70b92..08c10c518f83 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -65,6 +65,7 @@ config MIPS select HAVE_OPROFILE select HAVE_PERF_EVENTS select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RSEQ select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING_GEN if 64BIT || !SMP diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index 9e224469c788..00f2535d2226 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -801,6 +801,8 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) regs->regs[0] = 0; /* Don't deal with this again. */ } + rseq_signal_deliver(regs); + if (sig_uses_siginfo(&ksig->ka, abi)) ret = abi->setup_rt_frame(vdso + abi->vdso->off_rt_sigreturn, ksig, regs, oldset); @@ -868,6 +870,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused, if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + rseq_handle_notify_resume(regs); } user_enter(); -- cgit v1.2.3 From 9bcf53598dfe1bd8caaf8e03738d3cc51d45904e Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 14 Jun 2018 10:20:54 -0700 Subject: MIPS: Add syscall detection for restartable sequences Syscalls are not allowed inside restartable sequences, so add a call to rseq_syscall() at the very beginning of the system call exit path when CONFIG_DEBUG_RSEQ=y. This will help us to detect whether there is a syscall issued erroneously inside a restartable sequence. Signed-off-by: Paul Burton Reviewed-by: James Hogan Patchwork: https://patchwork.linux-mips.org/patch/19522/ Cc: Ralf Baechle Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Paul E. McKenney Cc: Boqun Feng Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org --- arch/mips/kernel/entry.S | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/mips/kernel/entry.S b/arch/mips/kernel/entry.S index 38a302919e6b..d7de8adcfcc8 100644 --- a/arch/mips/kernel/entry.S +++ b/arch/mips/kernel/entry.S @@ -79,6 +79,10 @@ FEXPORT(ret_from_fork) jal schedule_tail # a0 = struct task_struct *prev FEXPORT(syscall_exit) +#ifdef CONFIG_DEBUG_RSEQ + move a0, sp + jal rseq_syscall +#endif local_irq_disable # make sure need_resched and # signals dont change between # sampling and return @@ -141,6 +145,10 @@ work_notifysig: # deal with pending signals and j resume_userspace_check FEXPORT(syscall_exit_partial) +#ifdef CONFIG_DEBUG_RSEQ + move a0, sp + jal rseq_syscall +#endif local_irq_disable # make sure need_resched doesn't # change between and return LONG_L a2, TI_FLAGS($28) # current->work -- cgit v1.2.3 From e426b3754a2cb8bb45b71283fdac0cfc6d247db7 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 14 Jun 2018 10:22:44 -0700 Subject: MIPS: Wire up the restartable sequences (rseq) syscall Wire up the restartable sequences (rseq) syscall for MIPS. This was introduced by commit d7822b1e24f2 ("rseq: Introduce restartable sequences system call") & MIPS now supports the prerequisites. Signed-off-by: Paul Burton Reviewed-by: James Hogan Patchwork: https://patchwork.linux-mips.org/patch/19525/ Cc: Ralf Baechle Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Paul E. McKenney Cc: Boqun Feng Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org --- arch/mips/include/uapi/asm/unistd.h | 15 +++++++++------ arch/mips/kernel/scall32-o32.S | 1 + arch/mips/kernel/scall64-64.S | 1 + arch/mips/kernel/scall64-n32.S | 1 + arch/mips/kernel/scall64-o32.S | 1 + 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/mips/include/uapi/asm/unistd.h b/arch/mips/include/uapi/asm/unistd.h index bb05e9916a5f..170bf0b5b250 100644 --- a/arch/mips/include/uapi/asm/unistd.h +++ b/arch/mips/include/uapi/asm/unistd.h @@ -388,17 +388,18 @@ #define __NR_pkey_alloc (__NR_Linux + 364) #define __NR_pkey_free (__NR_Linux + 365) #define __NR_statx (__NR_Linux + 366) +#define __NR_rseq (__NR_Linux + 367) /* * Offset of the last Linux o32 flavoured syscall */ -#define __NR_Linux_syscalls 366 +#define __NR_Linux_syscalls 367 #endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */ #define __NR_O32_Linux 4000 -#define __NR_O32_Linux_syscalls 366 +#define __NR_O32_Linux_syscalls 367 #if _MIPS_SIM == _MIPS_SIM_ABI64 @@ -733,16 +734,17 @@ #define __NR_pkey_alloc (__NR_Linux + 324) #define __NR_pkey_free (__NR_Linux + 325) #define __NR_statx (__NR_Linux + 326) +#define __NR_rseq (__NR_Linux + 327) /* * Offset of the last Linux 64-bit flavoured syscall */ -#define __NR_Linux_syscalls 326 +#define __NR_Linux_syscalls 327 #endif /* _MIPS_SIM == _MIPS_SIM_ABI64 */ #define __NR_64_Linux 5000 -#define __NR_64_Linux_syscalls 326 +#define __NR_64_Linux_syscalls 327 #if _MIPS_SIM == _MIPS_SIM_NABI32 @@ -1081,15 +1083,16 @@ #define __NR_pkey_alloc (__NR_Linux + 328) #define __NR_pkey_free (__NR_Linux + 329) #define __NR_statx (__NR_Linux + 330) +#define __NR_rseq (__NR_Linux + 331) /* * Offset of the last N32 flavoured syscall */ -#define __NR_Linux_syscalls 330 +#define __NR_Linux_syscalls 331 #endif /* _MIPS_SIM == _MIPS_SIM_NABI32 */ #define __NR_N32_Linux 6000 -#define __NR_N32_Linux_syscalls 330 +#define __NR_N32_Linux_syscalls 331 #endif /* _UAPI_ASM_UNISTD_H */ diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index a9a7d78803cd..842ff1612893 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -590,3 +590,4 @@ EXPORT(sys_call_table) PTR sys_pkey_alloc PTR sys_pkey_free /* 4365 */ PTR sys_statx + PTR sys_rseq diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S index 65d5aeeb9bdb..558830d1e5ba 100644 --- a/arch/mips/kernel/scall64-64.S +++ b/arch/mips/kernel/scall64-64.S @@ -439,4 +439,5 @@ EXPORT(sys_call_table) PTR sys_pkey_alloc PTR sys_pkey_free /* 5325 */ PTR sys_statx + PTR sys_rseq .size sys_call_table,.-sys_call_table diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index cbf190ef9e8a..293f0b0119f3 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -434,4 +434,5 @@ EXPORT(sysn32_call_table) PTR sys_pkey_alloc PTR sys_pkey_free PTR sys_statx /* 6330 */ + PTR sys_rseq .size sysn32_call_table,.-sysn32_call_table diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index 9ebe3e2403b1..f13a08de8078 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -583,4 +583,5 @@ EXPORT(sys32_call_table) PTR sys_pkey_alloc PTR sys_pkey_free /* 4365 */ PTR sys_statx + PTR sys_rseq .size sys32_call_table,.-sys32_call_table -- cgit v1.2.3 From 744f4be542d705a39dac9810350e96f37474eda3 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 14 Jun 2018 11:06:22 -0700 Subject: rseq/selftests: Implement MIPS support Implement support for both MIPS32 & MIPS64 in the rseq selftests, in order to sanity check the recently enabled rseq syscall. The tests all pass on a MIPS Boston development board running either a MIPS32r2 interAptiv CPU & a MIPS64r6 I6500 CPU, both of which were configured with 2 cores each of which have 2 hardware threads (VP(E)s) - ie. 4 CPUs. Signed-off-by: Paul Burton Reviewed-by: James Hogan Patchwork: https://patchwork.linux-mips.org/patch/19524/ Cc: Ralf Baechle Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Paul E. McKenney Cc: Boqun Feng Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org --- tools/testing/selftests/rseq/param_test.c | 24 + tools/testing/selftests/rseq/rseq-mips.h | 725 ++++++++++++++++++++++++++++++ tools/testing/selftests/rseq/rseq.h | 2 + 3 files changed, 751 insertions(+) create mode 100644 tools/testing/selftests/rseq/rseq-mips.h diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c index 6a9f602a8718..615252331813 100644 --- a/tools/testing/selftests/rseq/param_test.c +++ b/tools/testing/selftests/rseq/param_test.c @@ -137,6 +137,30 @@ unsigned int yield_mod_cnt, nr_abort; "subic. %%" INJECT_ASM_REG ", %%" INJECT_ASM_REG ", 1\n\t" \ "bne 222b\n\t" \ "333:\n\t" + +#elif defined(__mips__) + +#define RSEQ_INJECT_INPUT \ + , [loop_cnt_1]"m"(loop_cnt[1]) \ + , [loop_cnt_2]"m"(loop_cnt[2]) \ + , [loop_cnt_3]"m"(loop_cnt[3]) \ + , [loop_cnt_4]"m"(loop_cnt[4]) \ + , [loop_cnt_5]"m"(loop_cnt[5]) \ + , [loop_cnt_6]"m"(loop_cnt[6]) + +#define INJECT_ASM_REG "$5" + +#define RSEQ_INJECT_CLOBBER \ + , INJECT_ASM_REG + +#define RSEQ_INJECT_ASM(n) \ + "lw " INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \ + "beqz " INJECT_ASM_REG ", 333f\n\t" \ + "222:\n\t" \ + "addiu " INJECT_ASM_REG ", -1\n\t" \ + "bnez " INJECT_ASM_REG ", 222b\n\t" \ + "333:\n\t" + #else #error unsupported target #endif diff --git a/tools/testing/selftests/rseq/rseq-mips.h b/tools/testing/selftests/rseq/rseq-mips.h new file mode 100644 index 000000000000..7f48ecf46994 --- /dev/null +++ b/tools/testing/selftests/rseq/rseq-mips.h @@ -0,0 +1,725 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* + * Author: Paul Burton + * (C) Copyright 2018 MIPS Tech LLC + * + * Based on rseq-arm.h: + * (C) Copyright 2016-2018 - Mathieu Desnoyers + */ + +#define RSEQ_SIG 0x53053053 + +#define rseq_smp_mb() __asm__ __volatile__ ("sync" ::: "memory") +#define rseq_smp_rmb() rseq_smp_mb() +#define rseq_smp_wmb() rseq_smp_mb() + +#define rseq_smp_load_acquire(p) \ +__extension__ ({ \ + __typeof(*p) ____p1 = RSEQ_READ_ONCE(*p); \ + rseq_smp_mb(); \ + ____p1; \ +}) + +#define rseq_smp_acquire__after_ctrl_dep() rseq_smp_rmb() + +#define rseq_smp_store_release(p, v) \ +do { \ + rseq_smp_mb(); \ + RSEQ_WRITE_ONCE(*p, v); \ +} while (0) + +#ifdef RSEQ_SKIP_FASTPATH +#include "rseq-skip.h" +#else /* !RSEQ_SKIP_FASTPATH */ + +#if _MIPS_SZLONG == 64 +# define LONG ".dword" +# define LONG_LA "dla" +# define LONG_L "ld" +# define LONG_S "sd" +# define LONG_ADDI "daddiu" +# define U32_U64_PAD(x) x +#elif _MIPS_SZLONG == 32 +# define LONG ".word" +# define LONG_LA "la" +# define LONG_L "lw" +# define LONG_S "sw" +# define LONG_ADDI "addiu" +# ifdef __BIG_ENDIAN +# define U32_U64_PAD(x) "0x0, " x +# else +# define U32_U64_PAD(x) x ", 0x0" +# endif +#else +# error unsupported _MIPS_SZLONG +#endif + +#define __RSEQ_ASM_DEFINE_TABLE(version, flags, start_ip, \ + post_commit_offset, abort_ip) \ + ".pushsection __rseq_table, \"aw\"\n\t" \ + ".balign 32\n\t" \ + ".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \ + LONG " " U32_U64_PAD(__rseq_str(start_ip)) "\n\t" \ + LONG " " U32_U64_PAD(__rseq_str(post_commit_offset)) "\n\t" \ + LONG " " U32_U64_PAD(__rseq_str(abort_ip)) "\n\t" \ + ".popsection\n\t" + +#define RSEQ_ASM_DEFINE_TABLE(start_ip, post_commit_ip, abort_ip) \ + __RSEQ_ASM_DEFINE_TABLE(0x0, 0x0, start_ip, \ + (post_commit_ip - start_ip), abort_ip) + +#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs) \ + RSEQ_INJECT_ASM(1) \ + LONG_LA " $4, " __rseq_str(cs_label) "\n\t" \ + LONG_S " $4, %[" __rseq_str(rseq_cs) "]\n\t" \ + __rseq_str(label) ":\n\t" + +#define RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, label) \ + RSEQ_INJECT_ASM(2) \ + "lw $4, %[" __rseq_str(current_cpu_id) "]\n\t" \ + "bne $4, %[" __rseq_str(cpu_id) "], " __rseq_str(label) "\n\t" + +#define __RSEQ_ASM_DEFINE_ABORT(table_label, label, teardown, \ + abort_label, version, flags, \ + start_ip, post_commit_offset, abort_ip) \ + ".balign 32\n\t" \ + __rseq_str(table_label) ":\n\t" \ + ".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \ + LONG " " U32_U64_PAD(__rseq_str(start_ip)) "\n\t" \ + LONG " " U32_U64_PAD(__rseq_str(post_commit_offset)) "\n\t" \ + LONG " " U32_U64_PAD(__rseq_str(abort_ip)) "\n\t" \ + ".word " __rseq_str(RSEQ_SIG) "\n\t" \ + __rseq_str(label) ":\n\t" \ + teardown \ + "b %l[" __rseq_str(abort_label) "]\n\t" + +#define RSEQ_ASM_DEFINE_ABORT(table_label, label, teardown, abort_label, \ + start_ip, post_commit_ip, abort_ip) \ + __RSEQ_ASM_DEFINE_ABORT(table_label, label, teardown, \ + abort_label, 0x0, 0x0, start_ip, \ + (post_commit_ip - start_ip), abort_ip) + +#define RSEQ_ASM_DEFINE_CMPFAIL(label, teardown, cmpfail_label) \ + __rseq_str(label) ":\n\t" \ + teardown \ + "b %l[" __rseq_str(cmpfail_label) "]\n\t" + +#define rseq_workaround_gcc_asm_size_guess() __asm__ __volatile__("") + +static inline __attribute__((always_inline)) +int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu) +{ + RSEQ_INJECT_C(9) + + rseq_workaround_gcc_asm_size_guess(); + __asm__ __volatile__ goto ( + RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */ + /* Start rseq by storing table entry pointer into rseq_cs. */ + RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs) + RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f) + RSEQ_INJECT_ASM(3) + LONG_L " $4, %[v]\n\t" + "bne $4, %[expect], %l[cmpfail]\n\t" + RSEQ_INJECT_ASM(4) +#ifdef RSEQ_COMPARE_TWICE + RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1]) + LONG_L " $4, %[v]\n\t" + "bne $4, %[expect], %l[error2]\n\t" +#endif + /* final store */ + LONG_S " %[newv], %[v]\n\t" + "2:\n\t" + RSEQ_INJECT_ASM(5) + "b 5f\n\t" + RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f) + "5:\n\t" + : /* gcc asm goto does not allow outputs */ + : [cpu_id] "r" (cpu), + [current_cpu_id] "m" (__rseq_abi.cpu_id), + [rseq_cs] "m" (__rseq_abi.rseq_cs), + [v] "m" (*v), + [expect] "r" (expect), + [newv] "r" (newv) + RSEQ_INJECT_INPUT + : "$4", "memory" + RSEQ_INJECT_CLOBBER + : abort, cmpfail +#ifdef RSEQ_COMPARE_TWICE + , error1, error2 +#endif + ); + rseq_workaround_gcc_asm_size_guess(); + return 0; +abort: + rseq_workaround_gcc_asm_size_guess(); + RSEQ_INJECT_FAILED + return -1; +cmpfail: + rseq_workaround_gcc_asm_size_guess(); + return 1; +#ifdef RSEQ_COMPARE_TWICE +error1: + rseq_bug("cpu_id comparison failed"); +error2: + rseq_bug("expected value comparison failed"); +#endif +} + +static inline __attribute__((always_inline)) +int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot, + off_t voffp, intptr_t *load, int cpu) +{ + RSEQ_INJECT_C(9) + + rseq_workaround_gcc_asm_size_guess(); + __asm__ __volatile__ goto ( + RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */ + /* Start rseq by storing table entry pointer into rseq_cs. */ + RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs) + RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f) + RSEQ_INJECT_ASM(3) + LONG_L " $4, %[v]\n\t" + "beq $4, %[expectnot], %l[cmpfail]\n\t" + RSEQ_INJECT_ASM(4) +#ifdef RSEQ_COMPARE_TWICE + RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1]) + LONG_L " $4, %[v]\n\t" + "beq $4, %[expectnot], %l[error2]\n\t" +#endif + LONG_S " $4, %[load]\n\t" + LONG_ADDI " $4, %[voffp]\n\t" + LONG_L " $4, 0($4)\n\t" + /* final store */ + LONG_S " $4, %[v]\n\t" + "2:\n\t" + RSEQ_INJECT_ASM(5) + "b 5f\n\t" + RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f) + "5:\n\t" + : /* gcc asm goto does not allow outputs */ + : [cpu_id] "r" (cpu), + [current_cpu_id] "m" (__rseq_abi.cpu_id), + [rseq_cs] "m" (__rseq_abi.rseq_cs), + /* final store input */ + [v] "m" (*v), + [expectnot] "r" (expectnot), + [voffp] "Ir" (voffp), + [load] "m" (*load) + RSEQ_INJECT_INPUT + : "$4", "memory" + RSEQ_INJECT_CLOBBER + : abort, cmpfail +#ifdef RSEQ_COMPARE_TWICE + , error1, error2 +#endif + ); + rseq_workaround_gcc_asm_size_guess(); + return 0; +abort: + rseq_workaround_gcc_asm_size_guess(); + RSEQ_INJECT_FAILED + return -1; +cmpfail: + rseq_workaround_gcc_asm_size_guess(); + return 1; +#ifdef RSEQ_COMPARE_TWICE +error1: + rseq_bug("cpu_id comparison failed"); +error2: + rseq_bug("expected value comparison failed"); +#endif +} + +static inline __attribute__((always_inline)) +int rseq_addv(intptr_t *v, intptr_t count, int cpu) +{ + RSEQ_INJECT_C(9) + + rseq_workaround_gcc_asm_size_guess(); + __asm__ __volatile__ goto ( + RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */ + /* Start rseq by storing table entry pointer into rseq_cs. */ + RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs) + RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f) + RSEQ_INJECT_ASM(3) +#ifdef RSEQ_COMPARE_TWICE + RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1]) +#endif + LONG_L " $4, %[v]\n\t" + LONG_ADDI " $4, %[count]\n\t" + /* final store */ + LONG_S " $4, %[v]\n\t" + "2:\n\t" + RSEQ_INJECT_ASM(4) + "b 5f\n\t" + RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f) + "5:\n\t" + : /* gcc asm goto does not allow outputs */ + : [cpu_id] "r" (cpu), + [current_cpu_id] "m" (__rseq_abi.cpu_id), + [rseq_cs] "m" (__rseq_abi.rseq_cs), + [v] "m" (*v), + [count] "Ir" (count) + RSEQ_INJECT_INPUT + : "$4", "memory" + RSEQ_INJECT_CLOBBER + : abort +#ifdef RSEQ_COMPARE_TWICE + , error1 +#endif + ); + rseq_workaround_gcc_asm_size_guess(); + return 0; +abort: + rseq_workaround_gcc_asm_size_guess(); + RSEQ_INJECT_FAILED + return -1; +#ifdef RSEQ_COMPARE_TWICE +error1: + rseq_bug("cpu_id comparison failed"); +#endif +} + +static inline __attribute__((always_inline)) +int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect, + intptr_t *v2, intptr_t newv2, + intptr_t newv, int cpu) +{ + RSEQ_INJECT_C(9) + + rseq_workaround_gcc_asm_size_guess(); + __asm__ __volatile__ goto ( + RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */ + /* Start rseq by storing table entry pointer into rseq_cs. */ + RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs) + RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f) + RSEQ_INJECT_ASM(3) + LONG_L " $4, %[v]\n\t" + "bne $4, %[expect], %l[cmpfail]\n\t" + RSEQ_INJECT_ASM(4) +#ifdef RSEQ_COMPARE_TWICE + RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1]) + LONG_L " $4, %[v]\n\t" + "bne $4, %[expect], %l[error2]\n\t" +#endif + /* try store */ + LONG_S " %[newv2], %[v2]\n\t" + RSEQ_INJECT_ASM(5) + /* final store */ + LONG_S " %[newv], %[v]\n\t" + "2:\n\t" + RSEQ_INJECT_ASM(6) + "b 5f\n\t" + RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f) + "5:\n\t" + : /* gcc asm goto does not allow outputs */ + : [cpu_id] "r" (cpu), + [current_cpu_id] "m" (__rseq_abi.cpu_id), + [rseq_cs] "m" (__rseq_abi.rseq_cs), + /* try store input */ + [v2] "m" (*v2), + [newv2] "r" (newv2), + /* final store input */ + [v] "m" (*v), + [expect] "r" (expect), + [newv] "r" (newv) + RSEQ_INJECT_INPUT + : "$4", "memory" + RSEQ_INJECT_CLOBBER + : abort, cmpfail +#ifdef RSEQ_COMPARE_TWICE + , error1, error2 +#endif + ); + rseq_workaround_gcc_asm_size_guess(); + return 0; +abort: + rseq_workaround_gcc_asm_size_guess(); + RSEQ_INJECT_FAILED + return -1; +cmpfail: + rseq_workaround_gcc_asm_size_guess(); + return 1; +#ifdef RSEQ_COMPARE_TWICE +error1: + rseq_bug("cpu_id comparison failed"); +error2: + rseq_bug("expected value comparison failed"); +#endif +} + +static inline __attribute__((always_inline)) +int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect, + intptr_t *v2, intptr_t newv2, + intptr_t newv, int cpu) +{ + RSEQ_INJECT_C(9) + + rseq_workaround_gcc_asm_size_guess(); + __asm__ __volatile__ goto ( + RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */ + /* Start rseq by storing table entry pointer into rseq_cs. */ + RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs) + RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f) + RSEQ_INJECT_ASM(3) + LONG_L " $4, %[v]\n\t" + "bne $4, %[expect], %l[cmpfail]\n\t" + RSEQ_INJECT_ASM(4) +#ifdef RSEQ_COMPARE_TWICE + RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1]) + LONG_L " $4, %[v]\n\t" + "bne $4, %[expect], %l[error2]\n\t" +#endif + /* try store */ + LONG_S " %[newv2], %[v2]\n\t" + RSEQ_INJECT_ASM(5) + "sync\n\t" /* full sync provides store-release */ + /* final store */ + LONG_S " %[newv], %[v]\n\t" + "2:\n\t" + RSEQ_INJECT_ASM(6) + "b 5f\n\t" + RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f) + "5:\n\t" + : /* gcc asm goto does not allow outputs */ + : [cpu_id] "r" (cpu), + [current_cpu_id] "m" (__rseq_abi.cpu_id), + [rseq_cs] "m" (__rseq_abi.rseq_cs), + /* try store input */ + [v2] "m" (*v2), + [newv2] "r" (newv2), + /* final store input */ + [v] "m" (*v), + [expect] "r" (expect), + [newv] "r" (newv) + RSEQ_INJECT_INPUT + : "$4", "memory" + RSEQ_INJECT_CLOBBER + : abort, cmpfail +#ifdef RSEQ_COMPARE_TWICE + , error1, error2 +#endif + ); + rseq_workaround_gcc_asm_size_guess(); + return 0; +abort: + rseq_workaround_gcc_asm_size_guess(); + RSEQ_INJECT_FAILED + return -1; +cmpfail: + rseq_workaround_gcc_asm_size_guess(); + return 1; +#ifdef RSEQ_COMPARE_TWICE +error1: + rseq_bug("cpu_id comparison failed"); +error2: + rseq_bug("expected value comparison failed"); +#endif +} + +static inline __attribute__((always_inline)) +int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect, + intptr_t *v2, intptr_t expect2, + intptr_t newv, int cpu) +{ + RSEQ_INJECT_C(9) + + rseq_workaround_gcc_asm_size_guess(); + __asm__ __volatile__ goto ( + RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */ + /* Start rseq by storing table entry pointer into rseq_cs. */ + RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs) + RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f) + RSEQ_INJECT_ASM(3) + LONG_L " $4, %[v]\n\t" + "bne $4, %[expect], %l[cmpfail]\n\t" + RSEQ_INJECT_ASM(4) + LONG_L " $4, %[v2]\n\t" + "bne $4, %[expect2], %l[cmpfail]\n\t" + RSEQ_INJECT_ASM(5) +#ifdef RSEQ_COMPARE_TWICE + RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1]) + LONG_L " $4, %[v]\n\t" + "bne $4, %[expect], %l[error2]\n\t" + LONG_L " $4, %[v2]\n\t" + "bne $4, %[expect2], %l[error3]\n\t" +#endif + /* final store */ + LONG_S " %[newv], %[v]\n\t" + "2:\n\t" + RSEQ_INJECT_ASM(6) + "b 5f\n\t" + RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f) + "5:\n\t" + : /* gcc asm goto does not allow outputs */ + : [cpu_id] "r" (cpu), + [current_cpu_id] "m" (__rseq_abi.cpu_id), + [rseq_cs] "m" (__rseq_abi.rseq_cs), + /* cmp2 input */ + [v2] "m" (*v2), + [expect2] "r" (expect2), + /* final store input */ + [v] "m" (*v), + [expect] "r" (expect), + [newv] "r" (newv) + RSEQ_INJECT_INPUT + : "$4", "memory" + RSEQ_INJECT_CLOBBER + : abort, cmpfail +#ifdef RSEQ_COMPARE_TWICE + , error1, error2, error3 +#endif + ); + rseq_workaround_gcc_asm_size_guess(); + return 0; +abort: + rseq_workaround_gcc_asm_size_guess(); + RSEQ_INJECT_FAILED + return -1; +cmpfail: + rseq_workaround_gcc_asm_size_guess(); + return 1; +#ifdef RSEQ_COMPARE_TWICE +error1: + rseq_bug("cpu_id comparison failed"); +error2: + rseq_bug("1st expected value comparison failed"); +error3: + rseq_bug("2nd expected value comparison failed"); +#endif +} + +static inline __attribute__((always_inline)) +int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect, + void *dst, void *src, size_t len, + intptr_t newv, int cpu) +{ + uintptr_t rseq_scratch[3]; + + RSEQ_INJECT_C(9) + + rseq_workaround_gcc_asm_size_guess(); + __asm__ __volatile__ goto ( + RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */ + LONG_S " %[src], %[rseq_scratch0]\n\t" + LONG_S " %[dst], %[rseq_scratch1]\n\t" + LONG_S " %[len], %[rseq_scratch2]\n\t" + /* Start rseq by storing table entry pointer into rseq_cs. */ + RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs) + RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f) + RSEQ_INJECT_ASM(3) + LONG_L " $4, %[v]\n\t" + "bne $4, %[expect], 5f\n\t" + RSEQ_INJECT_ASM(4) +#ifdef RSEQ_COMPARE_TWICE + RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 6f) + LONG_L " $4, %[v]\n\t" + "bne $4, %[expect], 7f\n\t" +#endif + /* try memcpy */ + "beqz %[len], 333f\n\t" \ + "222:\n\t" \ + "lb $4, 0(%[src])\n\t" \ + "sb $4, 0(%[dst])\n\t" \ + LONG_ADDI " %[src], 1\n\t" \ + LONG_ADDI " %[dst], 1\n\t" \ + LONG_ADDI " %[len], -1\n\t" \ + "bnez %[len], 222b\n\t" \ + "333:\n\t" \ + RSEQ_INJECT_ASM(5) + /* final store */ + LONG_S " %[newv], %[v]\n\t" + "2:\n\t" + RSEQ_INJECT_ASM(6) + /* teardown */ + LONG_L " %[len], %[rseq_scratch2]\n\t" + LONG_L " %[dst], %[rseq_scratch1]\n\t" + LONG_L " %[src], %[rseq_scratch0]\n\t" + "b 8f\n\t" + RSEQ_ASM_DEFINE_ABORT(3, 4, + /* teardown */ + LONG_L " %[len], %[rseq_scratch2]\n\t" + LONG_L " %[dst], %[rseq_scratch1]\n\t" + LONG_L " %[src], %[rseq_scratch0]\n\t", + abort, 1b, 2b, 4f) + RSEQ_ASM_DEFINE_CMPFAIL(5, + /* teardown */ + LONG_L " %[len], %[rseq_scratch2]\n\t" + LONG_L " %[dst], %[rseq_scratch1]\n\t" + LONG_L " %[src], %[rseq_scratch0]\n\t", + cmpfail) +#ifdef RSEQ_COMPARE_TWICE + RSEQ_ASM_DEFINE_CMPFAIL(6, + /* teardown */ + LONG_L " %[len], %[rseq_scratch2]\n\t" + LONG_L " %[dst], %[rseq_scratch1]\n\t" + LONG_L " %[src], %[rseq_scratch0]\n\t", + error1) + RSEQ_ASM_DEFINE_CMPFAIL(7, + /* teardown */ + LONG_L " %[len], %[rseq_scratch2]\n\t" + LONG_L " %[dst], %[rseq_scratch1]\n\t" + LONG_L " %[src], %[rseq_scratch0]\n\t", + error2) +#endif + "8:\n\t" + : /* gcc asm goto does not allow outputs */ + : [cpu_id] "r" (cpu), + [current_cpu_id] "m" (__rseq_abi.cpu_id), + [rseq_cs] "m" (__rseq_abi.rseq_cs), + /* final store input */ + [v] "m" (*v), + [expect] "r" (expect), + [newv] "r" (newv), + /* try memcpy input */ + [dst] "r" (dst), + [src] "r" (src), + [len] "r" (len), + [rseq_scratch0] "m" (rseq_scratch[0]), + [rseq_scratch1] "m" (rseq_scratch[1]), + [rseq_scratch2] "m" (rseq_scratch[2]) + RSEQ_INJECT_INPUT + : "$4", "memory" + RSEQ_INJECT_CLOBBER + : abort, cmpfail +#ifdef RSEQ_COMPARE_TWICE + , error1, error2 +#endif + ); + rseq_workaround_gcc_asm_size_guess(); + return 0; +abort: + rseq_workaround_gcc_asm_size_guess(); + RSEQ_INJECT_FAILED + return -1; +cmpfail: + rseq_workaround_gcc_asm_size_guess(); + return 1; +#ifdef RSEQ_COMPARE_TWICE +error1: + rseq_workaround_gcc_asm_size_guess(); + rseq_bug("cpu_id comparison failed"); +error2: + rseq_workaround_gcc_asm_size_guess(); + rseq_bug("expected value comparison failed"); +#endif +} + +static inline __attribute__((always_inline)) +int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect, + void *dst, void *src, size_t len, + intptr_t newv, int cpu) +{ + uintptr_t rseq_scratch[3]; + + RSEQ_INJECT_C(9) + + rseq_workaround_gcc_asm_size_guess(); + __asm__ __volatile__ goto ( + RSEQ_ASM_DEFINE_TABLE(1f, 2f, 4f) /* start, commit, abort */ + LONG_S " %[src], %[rseq_scratch0]\n\t" + LONG_S " %[dst], %[rseq_scratch1]\n\t" + LONG_S " %[len], %[rseq_scratch2]\n\t" + /* Start rseq by storing table entry pointer into rseq_cs. */ + RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs) + RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f) + RSEQ_INJECT_ASM(3) + LONG_L " $4, %[v]\n\t" + "bne $4, %[expect], 5f\n\t" + RSEQ_INJECT_ASM(4) +#ifdef RSEQ_COMPARE_TWICE + RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 6f) + LONG_L " $4, %[v]\n\t" + "bne $4, %[expect], 7f\n\t" +#endif + /* try memcpy */ + "beqz %[len], 333f\n\t" \ + "222:\n\t" \ + "lb $4, 0(%[src])\n\t" \ + "sb $4, 0(%[dst])\n\t" \ + LONG_ADDI " %[src], 1\n\t" \ + LONG_ADDI " %[dst], 1\n\t" \ + LONG_ADDI " %[len], -1\n\t" \ + "bnez %[len], 222b\n\t" \ + "333:\n\t" \ + RSEQ_INJECT_ASM(5) + "sync\n\t" /* full sync provides store-release */ + /* final store */ + LONG_S " %[newv], %[v]\n\t" + "2:\n\t" + RSEQ_INJECT_ASM(6) + /* teardown */ + LONG_L " %[len], %[rseq_scratch2]\n\t" + LONG_L " %[dst], %[rseq_scratch1]\n\t" + LONG_L " %[src], %[rseq_scratch0]\n\t" + "b 8f\n\t" + RSEQ_ASM_DEFINE_ABORT(3, 4, + /* teardown */ + LONG_L " %[len], %[rseq_scratch2]\n\t" + LONG_L " %[dst], %[rseq_scratch1]\n\t" + LONG_L " %[src], %[rseq_scratch0]\n\t", + abort, 1b, 2b, 4f) + RSEQ_ASM_DEFINE_CMPFAIL(5, + /* teardown */ + LONG_L " %[len], %[rseq_scratch2]\n\t" + LONG_L " %[dst], %[rseq_scratch1]\n\t" + LONG_L " %[src], %[rseq_scratch0]\n\t", + cmpfail) +#ifdef RSEQ_COMPARE_TWICE + RSEQ_ASM_DEFINE_CMPFAIL(6, + /* teardown */ + LONG_L " %[len], %[rseq_scratch2]\n\t" + LONG_L " %[dst], %[rseq_scratch1]\n\t" + LONG_L " %[src], %[rseq_scratch0]\n\t", + error1) + RSEQ_ASM_DEFINE_CMPFAIL(7, + /* teardown */ + LONG_L " %[len], %[rseq_scratch2]\n\t" + LONG_L " %[dst], %[rseq_scratch1]\n\t" + LONG_L " %[src], %[rseq_scratch0]\n\t", + error2) +#endif + "8:\n\t" + : /* gcc asm goto does not allow outputs */ + : [cpu_id] "r" (cpu), + [current_cpu_id] "m" (__rseq_abi.cpu_id), + [rseq_cs] "m" (__rseq_abi.rseq_cs), + /* final store input */ + [v] "m" (*v), + [expect] "r" (expect), + [newv] "r" (newv), + /* try memcpy input */ + [dst] "r" (dst), + [src] "r" (src), + [len] "r" (len), + [rseq_scratch0] "m" (rseq_scratch[0]), + [rseq_scratch1] "m" (rseq_scratch[1]), + [rseq_scratch2] "m" (rseq_scratch[2]) + RSEQ_INJECT_INPUT + : "$4", "memory" + RSEQ_INJECT_CLOBBER + : abort, cmpfail +#ifdef RSEQ_COMPARE_TWICE + , error1, error2 +#endif + ); + rseq_workaround_gcc_asm_size_guess(); + return 0; +abort: + rseq_workaround_gcc_asm_size_guess(); + RSEQ_INJECT_FAILED + return -1; +cmpfail: + rseq_workaround_gcc_asm_size_guess(); + return 1; +#ifdef RSEQ_COMPARE_TWICE +error1: + rseq_workaround_gcc_asm_size_guess(); + rseq_bug("cpu_id comparison failed"); +error2: + rseq_workaround_gcc_asm_size_guess(); + rseq_bug("expected value comparison failed"); +#endif +} + +#endif /* !RSEQ_SKIP_FASTPATH */ diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h index 0a808575cbc4..a4684112676c 100644 --- a/tools/testing/selftests/rseq/rseq.h +++ b/tools/testing/selftests/rseq/rseq.h @@ -73,6 +73,8 @@ extern __thread volatile struct rseq __rseq_abi; #include #elif defined(__PPC__) #include +#elif defined(__mips__) +#include #else #error unsupported target #endif -- cgit v1.2.3 From 4337aac1e1c97cfda56fbec4077fbc0e37b867c0 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 14 Jun 2018 17:24:07 -0700 Subject: MIPS: Wire up io_pgetevents syscall Wire up the io_pgetevents syscall that was introduced by commit 7a074e96dee6 ("aio: implement io_pgetevents"). Signed-off-by: Paul Burton Patchwork: https://patchwork.linux-mips.org/patch/19593/ Cc: James Hogan Cc: Ralf Baechle Cc: linux-mips@linux-mips.org --- arch/mips/include/uapi/asm/unistd.h | 15 +++++++++------ arch/mips/kernel/scall32-o32.S | 1 + arch/mips/kernel/scall64-64.S | 1 + arch/mips/kernel/scall64-n32.S | 1 + arch/mips/kernel/scall64-o32.S | 1 + 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/mips/include/uapi/asm/unistd.h b/arch/mips/include/uapi/asm/unistd.h index 170bf0b5b250..f25dd1d83fb7 100644 --- a/arch/mips/include/uapi/asm/unistd.h +++ b/arch/mips/include/uapi/asm/unistd.h @@ -389,17 +389,18 @@ #define __NR_pkey_free (__NR_Linux + 365) #define __NR_statx (__NR_Linux + 366) #define __NR_rseq (__NR_Linux + 367) +#define __NR_io_pgetevents (__NR_Linux + 368) /* * Offset of the last Linux o32 flavoured syscall */ -#define __NR_Linux_syscalls 367 +#define __NR_Linux_syscalls 368 #endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */ #define __NR_O32_Linux 4000 -#define __NR_O32_Linux_syscalls 367 +#define __NR_O32_Linux_syscalls 368 #if _MIPS_SIM == _MIPS_SIM_ABI64 @@ -735,16 +736,17 @@ #define __NR_pkey_free (__NR_Linux + 325) #define __NR_statx (__NR_Linux + 326) #define __NR_rseq (__NR_Linux + 327) +#define __NR_io_pgetevents (__NR_Linux + 328) /* * Offset of the last Linux 64-bit flavoured syscall */ -#define __NR_Linux_syscalls 327 +#define __NR_Linux_syscalls 328 #endif /* _MIPS_SIM == _MIPS_SIM_ABI64 */ #define __NR_64_Linux 5000 -#define __NR_64_Linux_syscalls 327 +#define __NR_64_Linux_syscalls 328 #if _MIPS_SIM == _MIPS_SIM_NABI32 @@ -1084,15 +1086,16 @@ #define __NR_pkey_free (__NR_Linux + 329) #define __NR_statx (__NR_Linux + 330) #define __NR_rseq (__NR_Linux + 331) +#define __NR_io_pgetevents (__NR_Linux + 332) /* * Offset of the last N32 flavoured syscall */ -#define __NR_Linux_syscalls 331 +#define __NR_Linux_syscalls 332 #endif /* _MIPS_SIM == _MIPS_SIM_NABI32 */ #define __NR_N32_Linux 6000 -#define __NR_N32_Linux_syscalls 331 +#define __NR_N32_Linux_syscalls 332 #endif /* _UAPI_ASM_UNISTD_H */ diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index 842ff1612893..91d3c8c46097 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -591,3 +591,4 @@ EXPORT(sys_call_table) PTR sys_pkey_free /* 4365 */ PTR sys_statx PTR sys_rseq + PTR sys_io_pgetevents diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S index 558830d1e5ba..358d9599983d 100644 --- a/arch/mips/kernel/scall64-64.S +++ b/arch/mips/kernel/scall64-64.S @@ -440,4 +440,5 @@ EXPORT(sys_call_table) PTR sys_pkey_free /* 5325 */ PTR sys_statx PTR sys_rseq + PTR sys_io_pgetevents .size sys_call_table,.-sys_call_table diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index 293f0b0119f3..c65eaacc1abf 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -435,4 +435,5 @@ EXPORT(sysn32_call_table) PTR sys_pkey_free PTR sys_statx /* 6330 */ PTR sys_rseq + PTR compat_sys_io_pgetevents .size sysn32_call_table,.-sysn32_call_table diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index f13a08de8078..73913f072e39 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -584,4 +584,5 @@ EXPORT(sys32_call_table) PTR sys_pkey_free /* 4365 */ PTR sys_statx PTR sys_rseq + PTR compat_sys_io_pgetevents .size sys32_call_table,.-sys32_call_table -- cgit v1.2.3 From 12b03558cef6d655d0d394f5e98a6fd07c1f6c0f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Jun 2018 19:18:50 -0700 Subject: net: sungem: fix rx checksum support After commit 88078d98d1bb ("net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends"), sungem owners reported the infamous "eth0: hw csum failure" message. CHECKSUM_COMPLETE has in fact never worked for this driver, but this was masked by the fact that upper stacks had to strip the FCS, and therefore skb->ip_summed was set back to CHECKSUM_NONE before my recent change. Driver configures a number of bytes to skip when the chip computes the checksum, and for some reason only half of the Ethernet header was skipped. Then a second problem is that we should strip the FCS by default, unless the driver is updated to eventually support NETIF_F_RXFCS in the future. Finally, a driver should check if NETIF_F_RXCSUM feature is enabled or not, so that the admin can turn off rx checksum if wanted. Many thanks to Andreas Schwab and Mathieu Malaterre for their help in debugging this issue. Signed-off-by: Eric Dumazet Reported-by: Meelis Roos Reported-by: Mathieu Malaterre Reported-by: Andreas Schwab Tested-by: Andreas Schwab Signed-off-by: David S. Miller --- drivers/net/ethernet/sun/sungem.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/sun/sungem.c b/drivers/net/ethernet/sun/sungem.c index 7a16d40a72d1..b9221fc1674d 100644 --- a/drivers/net/ethernet/sun/sungem.c +++ b/drivers/net/ethernet/sun/sungem.c @@ -60,8 +60,7 @@ #include #include "sungem.h" -/* Stripping FCS is causing problems, disabled for now */ -#undef STRIP_FCS +#define STRIP_FCS #define DEFAULT_MSG (NETIF_MSG_DRV | \ NETIF_MSG_PROBE | \ @@ -435,7 +434,7 @@ static int gem_rxmac_reset(struct gem *gp) writel(desc_dma & 0xffffffff, gp->regs + RXDMA_DBLOW); writel(RX_RING_SIZE - 4, gp->regs + RXDMA_KICK); val = (RXDMA_CFG_BASE | (RX_OFFSET << 10) | - ((14 / 2) << 13) | RXDMA_CFG_FTHRESH_128); + (ETH_HLEN << 13) | RXDMA_CFG_FTHRESH_128); writel(val, gp->regs + RXDMA_CFG); if (readl(gp->regs + GREG_BIFCFG) & GREG_BIFCFG_M66EN) writel(((5 & RXDMA_BLANK_IPKTS) | @@ -760,7 +759,6 @@ static int gem_rx(struct gem *gp, int work_to_do) struct net_device *dev = gp->dev; int entry, drops, work_done = 0; u32 done; - __sum16 csum; if (netif_msg_rx_status(gp)) printk(KERN_DEBUG "%s: rx interrupt, done: %d, rx_new: %d\n", @@ -855,9 +853,13 @@ static int gem_rx(struct gem *gp, int work_to_do) skb = copy_skb; } - csum = (__force __sum16)htons((status & RXDCTRL_TCPCSUM) ^ 0xffff); - skb->csum = csum_unfold(csum); - skb->ip_summed = CHECKSUM_COMPLETE; + if (likely(dev->features & NETIF_F_RXCSUM)) { + __sum16 csum; + + csum = (__force __sum16)htons((status & RXDCTRL_TCPCSUM) ^ 0xffff); + skb->csum = csum_unfold(csum); + skb->ip_summed = CHECKSUM_COMPLETE; + } skb->protocol = eth_type_trans(skb, gp->dev); napi_gro_receive(&gp->napi, skb); @@ -1761,7 +1763,7 @@ static void gem_init_dma(struct gem *gp) writel(0, gp->regs + TXDMA_KICK); val = (RXDMA_CFG_BASE | (RX_OFFSET << 10) | - ((14 / 2) << 13) | RXDMA_CFG_FTHRESH_128); + (ETH_HLEN << 13) | RXDMA_CFG_FTHRESH_128); writel(val, gp->regs + RXDMA_CFG); writel(desc_dma >> 32, gp->regs + RXDMA_DBHI); @@ -2985,8 +2987,8 @@ static int gem_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_drvdata(pdev, dev); /* We can do scatter/gather and HW checksum */ - dev->hw_features = NETIF_F_SG | NETIF_F_HW_CSUM; - dev->features |= dev->hw_features | NETIF_F_RXCSUM; + dev->hw_features = NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; + dev->features = dev->hw_features; if (pci_using_dac) dev->features |= NETIF_F_HIGHDMA; -- cgit v1.2.3 From bc8a2d9bcbf1ca548b1deb315d14e1da81945bea Mon Sep 17 00:00:00 2001 From: Dinh Nguyen Date: Tue, 19 Jun 2018 10:35:38 -0500 Subject: net: stmmac: socfpga: add additional ocp reset line for Stratix10 The Stratix10 platform has an additional reset line, OCP(Open Core Protocol), that also needs to get deasserted for the stmmac ethernet controller to work. Thus we need to update the Kconfig to include ARCH_STRATIX10 in order to build dwmac-socfpga. Also, remove the redundant check for the reset controller pointer. The reset driver already checks for the pointer and returns 0 if the pointer is NULL. Signed-off-by: Dinh Nguyen Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/Kconfig | 2 +- drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/Kconfig b/drivers/net/ethernet/stmicro/stmmac/Kconfig index cb5b0f58c395..edf20361ea5f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/Kconfig +++ b/drivers/net/ethernet/stmicro/stmmac/Kconfig @@ -111,7 +111,7 @@ config DWMAC_ROCKCHIP config DWMAC_SOCFPGA tristate "SOCFPGA dwmac support" default ARCH_SOCFPGA - depends on OF && (ARCH_SOCFPGA || COMPILE_TEST) + depends on OF && (ARCH_SOCFPGA || ARCH_STRATIX10 || COMPILE_TEST) select MFD_SYSCON help Support for ethernet controller on Altera SOCFPGA diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index 6e359572b9f0..5b3b06a0a3bf 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -55,6 +55,7 @@ struct socfpga_dwmac { struct device *dev; struct regmap *sys_mgr_base_addr; struct reset_control *stmmac_rst; + struct reset_control *stmmac_ocp_rst; void __iomem *splitter_base; bool f2h_ptp_ref_clk; struct tse_pcs pcs; @@ -262,8 +263,8 @@ static int socfpga_dwmac_set_phy_mode(struct socfpga_dwmac *dwmac) val = SYSMGR_EMACGRP_CTRL_PHYSEL_ENUM_GMII_MII; /* Assert reset to the enet controller before changing the phy mode */ - if (dwmac->stmmac_rst) - reset_control_assert(dwmac->stmmac_rst); + reset_control_assert(dwmac->stmmac_ocp_rst); + reset_control_assert(dwmac->stmmac_rst); regmap_read(sys_mgr_base_addr, reg_offset, &ctrl); ctrl &= ~(SYSMGR_EMACGRP_CTRL_PHYSEL_MASK << reg_shift); @@ -288,8 +289,8 @@ static int socfpga_dwmac_set_phy_mode(struct socfpga_dwmac *dwmac) /* Deassert reset for the phy configuration to be sampled by * the enet controller, and operation to start in requested mode */ - if (dwmac->stmmac_rst) - reset_control_deassert(dwmac->stmmac_rst); + reset_control_deassert(dwmac->stmmac_ocp_rst); + reset_control_deassert(dwmac->stmmac_rst); if (phymode == PHY_INTERFACE_MODE_SGMII) { if (tse_pcs_init(dwmac->pcs.tse_pcs_base, &dwmac->pcs) != 0) { dev_err(dwmac->dev, "Unable to initialize TSE PCS"); @@ -324,6 +325,15 @@ static int socfpga_dwmac_probe(struct platform_device *pdev) goto err_remove_config_dt; } + dwmac->stmmac_ocp_rst = devm_reset_control_get_optional(dev, "stmmaceth-ocp"); + if (IS_ERR(dwmac->stmmac_ocp_rst)) { + ret = PTR_ERR(dwmac->stmmac_ocp_rst); + dev_err(dev, "error getting reset control of ocp %d\n", ret); + goto err_remove_config_dt; + } + + reset_control_deassert(dwmac->stmmac_ocp_rst); + ret = socfpga_dwmac_parse_data(dwmac, dev); if (ret) { dev_err(dev, "Unable to parse OF data\n"); -- cgit v1.2.3 From b6cfffa7ad923c73f317ea50fd4ebcb3b4b6669c Mon Sep 17 00:00:00 2001 From: Bhadram Varka Date: Sun, 17 Jun 2018 20:02:05 +0530 Subject: stmmac: fix DMA channel hang in half-duplex mode HW does not support Half-duplex mode in multi-queue scenario. Fix it by not advertising the Half-Duplex mode if multi-queue enabled. Signed-off-by: Bhadram Varka Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index e79b0d7b388a..cba46b62a1cd 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -928,6 +928,7 @@ static void stmmac_check_pcs_mode(struct stmmac_priv *priv) static int stmmac_init_phy(struct net_device *dev) { struct stmmac_priv *priv = netdev_priv(dev); + u32 tx_cnt = priv->plat->tx_queues_to_use; struct phy_device *phydev; char phy_id_fmt[MII_BUS_ID_SIZE + 3]; char bus_id[MII_BUS_ID_SIZE]; @@ -968,6 +969,15 @@ static int stmmac_init_phy(struct net_device *dev) phydev->advertising &= ~(SUPPORTED_1000baseT_Half | SUPPORTED_1000baseT_Full); + /* + * Half-duplex mode not supported with multiqueue + * half-duplex can only works with single queue + */ + if (tx_cnt > 1) + phydev->supported &= ~(SUPPORTED_1000baseT_Half | + SUPPORTED_100baseT_Half | + SUPPORTED_10baseT_Half); + /* * Broken HW is sometimes missing the pull-up resistor on the * MDIO line, which results in reads to non-existent devices returning -- cgit v1.2.3 From 9887cba19978a5f288100ef90a37684cc8d5e0a6 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 19 Jun 2018 12:47:52 -0400 Subject: ip: limit use of gso_size to udp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ipcm(6)_cookie field gso_size is set only in the udp path. The ip layer copies this to cork only if sk_type is SOCK_DGRAM. This check proved too permissive. Ping and l2tp sockets have the same type. Limit to sockets of type SOCK_DGRAM and protocol IPPROTO_UDP to exclude ping sockets. v1 -> v2 - remove irrelevant whitespace changes Fixes: bec1f6f69736 ("udp: generate gso with UDP_SEGMENT") Reported-by: Maciej Żenczykowski Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 3 ++- net/ipv6/ip6_output.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index af5a830ff6ad..b3308e9d9762 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1145,7 +1145,8 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->fragsize = ip_sk_use_pmtu(sk) ? dst_mtu(&rt->dst) : rt->dst.dev->mtu; - cork->gso_size = sk->sk_type == SOCK_DGRAM ? ipc->gso_size : 0; + cork->gso_size = sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP ? ipc->gso_size : 0; cork->dst = &rt->dst; cork->length = 0; cork->ttl = ipc->ttl; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 021e5aef6ba3..a14fb4fcdf18 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1219,7 +1219,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, if (mtu < IPV6_MIN_MTU) return -EINVAL; cork->base.fragsize = mtu; - cork->base.gso_size = sk->sk_type == SOCK_DGRAM ? ipc6->gso_size : 0; + cork->base.gso_size = sk->sk_type == SOCK_DGRAM && + sk->sk_protocol == IPPROTO_UDP ? ipc6->gso_size : 0; if (dst_allfrag(xfrm_dst_path(&rt->dst))) cork->base.flags |= IPCORK_ALLFRAG; -- cgit v1.2.3 From f5b65348fd77839b50e79bc0a5e536832ea52d8d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 20 Jun 2018 09:47:20 +0900 Subject: proc: fix missing final NUL in get_mm_cmdline() rewrite The rewrite of the cmdline fetching missed the fact that we used to also return the final terminating NUL character of the last argument. I hadn't noticed, and none of the tools I tested cared, but something obviously must care, because Michal Kubecek noticed the change in behavior. Tweak the "find the end" logic to actually include the NUL character, and once past the eend of argv, always start the strnlen() at the expected (original) argument end. This whole "allow people to rewrite their arguments in place" is a nasty hack and requires that odd slop handling at the end of the argv array, but it's our traditional model, so we continue to support it. Repored-and-bisected-by: Michal Kubecek Reviewed-and-tested-by: Michal Kubecek Cc: Alexey Dobriyan Signed-off-by: Linus Torvalds --- fs/proc/base.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index b6572944efc3..aaffc0c30216 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -235,6 +235,10 @@ static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, if (env_start != arg_end || env_start >= env_end) env_start = env_end = arg_end; + /* .. and limit it to a maximum of one page of slop */ + if (env_end >= arg_end + PAGE_SIZE) + env_end = arg_end + PAGE_SIZE - 1; + /* We're not going to care if "*ppos" has high bits set */ pos = arg_start + *ppos; @@ -254,10 +258,19 @@ static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, while (count) { int got; size_t size = min_t(size_t, PAGE_SIZE, count); + long offset; - got = access_remote_vm(mm, pos, page, size, FOLL_ANON); - if (got <= 0) + /* + * Are we already starting past the official end? + * We always include the last byte that is *supposed* + * to be NUL + */ + offset = (pos >= arg_end) ? pos - arg_end + 1 : 0; + + got = access_remote_vm(mm, pos - offset, page, size + offset, FOLL_ANON); + if (got <= offset) break; + got -= offset; /* Don't walk past a NUL character once you hit arg_end */ if (pos + got >= arg_end) { @@ -276,12 +289,17 @@ static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, n = arg_end - pos - 1; /* Cut off at first NUL after 'n' */ - got = n + strnlen(page+n, got-n); - if (!got) + got = n + strnlen(page+n, offset+got-n); + if (got < offset) break; + got -= offset; + + /* Include the NUL if it existed */ + if (got < size) + got++; } - got -= copy_to_user(buf, page, got); + got -= copy_to_user(buf, page+offset, got); if (unlikely(!got)) { if (!len) len = -EFAULT; -- cgit v1.2.3 From 4bff980f920693693d7a529c06a1bd1e7f77603a Mon Sep 17 00:00:00 2001 From: Rodrigo Rivas Costa Date: Tue, 22 May 2018 22:10:06 +0200 Subject: HID: steam: use hid_device.driver_data instead of hid_set_drvdata() When creating the low-level hidraw device, the reference to steam_device was stored using hid_set_drvdata(). But this value is not guaranteed to be kept when set before calling probe. If this pointer is reset, it crashes when opening the emulated hidraw device. It looks like hid_set_drvdata() is for users "avobe" this hid_device, while hid_device.driver_data it for users "below" this one. In this case, we are creating a virtual hidraw device, so we must use hid_device.driver_data. Signed-off-by: Rodrigo Rivas Costa Tested-by: Mariusz Ceier Signed-off-by: Jiri Kosina --- drivers/hid/hid-steam.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/hid/hid-steam.c b/drivers/hid/hid-steam.c index cb86cc834201..0422ec2b13d2 100644 --- a/drivers/hid/hid-steam.c +++ b/drivers/hid/hid-steam.c @@ -573,7 +573,7 @@ static bool steam_is_valve_interface(struct hid_device *hdev) static int steam_client_ll_parse(struct hid_device *hdev) { - struct steam_device *steam = hid_get_drvdata(hdev); + struct steam_device *steam = hdev->driver_data; return hid_parse_report(hdev, steam->hdev->dev_rdesc, steam->hdev->dev_rsize); @@ -590,7 +590,7 @@ static void steam_client_ll_stop(struct hid_device *hdev) static int steam_client_ll_open(struct hid_device *hdev) { - struct steam_device *steam = hid_get_drvdata(hdev); + struct steam_device *steam = hdev->driver_data; int ret; ret = hid_hw_open(steam->hdev); @@ -605,7 +605,7 @@ static int steam_client_ll_open(struct hid_device *hdev) static void steam_client_ll_close(struct hid_device *hdev) { - struct steam_device *steam = hid_get_drvdata(hdev); + struct steam_device *steam = hdev->driver_data; mutex_lock(&steam->mutex); steam->client_opened = false; @@ -623,7 +623,7 @@ static int steam_client_ll_raw_request(struct hid_device *hdev, size_t count, unsigned char report_type, int reqtype) { - struct steam_device *steam = hid_get_drvdata(hdev); + struct steam_device *steam = hdev->driver_data; return hid_hw_raw_request(steam->hdev, reportnum, buf, count, report_type, reqtype); @@ -710,7 +710,7 @@ static int steam_probe(struct hid_device *hdev, ret = PTR_ERR(steam->client_hdev); goto client_hdev_fail; } - hid_set_drvdata(steam->client_hdev, steam); + steam->client_hdev->driver_data = steam; /* * With the real steam controller interface, do not connect hidraw. -- cgit v1.2.3 From ebeaa367548e9e92dd9374b9464ff6e7d157117b Mon Sep 17 00:00:00 2001 From: Even Xu Date: Fri, 12 Feb 2016 04:11:34 +0800 Subject: HID: intel_ish-hid: ipc: register more pm callbacks to support hibernation Current ISH driver only registers suspend/resume PM callbacks which don't support hibernation (suspend to disk). Basically after hiberation, the ISH can't resume properly and user may not see sensor events (for example: screen rotation may not work). User will not see a crash or panic or anything except the following message in log: hid-sensor-hub 001F:8086:22D8.0001: timeout waiting for response from ISHTP device So this patch adds support for S4/hiberbation to ISH by using the SIMPLE_DEV_PM_OPS() MACRO instead of struct dev_pm_ops directly. The suspend and resume functions will now be used for both suspend to RAM and hibernation. If power management is disabled, SIMPLE_DEV_PM_OPS will do nothing, the suspend and resume related functions won't be used, so mark them as __maybe_unused to clarify that this is the intended behavior, and remove #ifdefs for power management. Cc: stable@vger.kernel.org Signed-off-by: Even Xu Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- drivers/hid/intel-ish-hid/ipc/pci-ish.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/drivers/hid/intel-ish-hid/ipc/pci-ish.c b/drivers/hid/intel-ish-hid/ipc/pci-ish.c index 582e449be9fe..a2c53ea3b5ed 100644 --- a/drivers/hid/intel-ish-hid/ipc/pci-ish.c +++ b/drivers/hid/intel-ish-hid/ipc/pci-ish.c @@ -205,8 +205,7 @@ static void ish_remove(struct pci_dev *pdev) kfree(ishtp_dev); } -#ifdef CONFIG_PM -static struct device *ish_resume_device; +static struct device __maybe_unused *ish_resume_device; /* 50ms to get resume response */ #define WAIT_FOR_RESUME_ACK_MS 50 @@ -220,7 +219,7 @@ static struct device *ish_resume_device; * in that case a simple resume message is enough, others we need * a reset sequence. */ -static void ish_resume_handler(struct work_struct *work) +static void __maybe_unused ish_resume_handler(struct work_struct *work) { struct pci_dev *pdev = to_pci_dev(ish_resume_device); struct ishtp_device *dev = pci_get_drvdata(pdev); @@ -262,7 +261,7 @@ static void ish_resume_handler(struct work_struct *work) * * Return: 0 to the pm core */ -static int ish_suspend(struct device *device) +static int __maybe_unused ish_suspend(struct device *device) { struct pci_dev *pdev = to_pci_dev(device); struct ishtp_device *dev = pci_get_drvdata(pdev); @@ -288,7 +287,7 @@ static int ish_suspend(struct device *device) return 0; } -static DECLARE_WORK(resume_work, ish_resume_handler); +static __maybe_unused DECLARE_WORK(resume_work, ish_resume_handler); /** * ish_resume() - ISH resume callback * @device: device pointer @@ -297,7 +296,7 @@ static DECLARE_WORK(resume_work, ish_resume_handler); * * Return: 0 to the pm core */ -static int ish_resume(struct device *device) +static int __maybe_unused ish_resume(struct device *device) { struct pci_dev *pdev = to_pci_dev(device); struct ishtp_device *dev = pci_get_drvdata(pdev); @@ -311,21 +310,14 @@ static int ish_resume(struct device *device) return 0; } -static const struct dev_pm_ops ish_pm_ops = { - .suspend = ish_suspend, - .resume = ish_resume, -}; -#define ISHTP_ISH_PM_OPS (&ish_pm_ops) -#else -#define ISHTP_ISH_PM_OPS NULL -#endif /* CONFIG_PM */ +static SIMPLE_DEV_PM_OPS(ish_pm_ops, ish_suspend, ish_resume); static struct pci_driver ish_driver = { .name = KBUILD_MODNAME, .id_table = ish_pci_tbl, .probe = ish_probe, .remove = ish_remove, - .driver.pm = ISHTP_ISH_PM_OPS, + .driver.pm = &ish_pm_ops, }; module_pci_driver(ish_driver); -- cgit v1.2.3 From d471b6b22d37bf9928c6d0202bdaaf76583b8b61 Mon Sep 17 00:00:00 2001 From: Jason Gerecke Date: Tue, 12 Jun 2018 13:42:46 -0700 Subject: HID: wacom: Correct logical maximum Y for 2nd-gen Intuos Pro large The HID descriptor for the 2nd-gen Intuos Pro large (PTH-860) contains a typo which defines an incorrect logical maximum Y value. This causes a small portion of the bottom of the tablet to become unusable (both because the area is below the "bottom" of the tablet and because 'wacom_wac_event' ignores out-of-range values). It also results in a skewed aspect ratio. To fix this, we add a quirk to 'wacom_usage_mapping' which overwrites the data with the correct value. Signed-off-by: Jason Gerecke CC: stable@vger.kernel.org # v4.10+ Signed-off-by: Jiri Kosina --- drivers/hid/wacom_sys.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c index ee7a37eb159a..545986cfb978 100644 --- a/drivers/hid/wacom_sys.c +++ b/drivers/hid/wacom_sys.c @@ -395,6 +395,14 @@ static void wacom_usage_mapping(struct hid_device *hdev, } } + /* 2nd-generation Intuos Pro Large has incorrect Y maximum */ + if (hdev->vendor == USB_VENDOR_ID_WACOM && + hdev->product == 0x0358 && + WACOM_PEN_FIELD(field) && + wacom_equivalent_usage(usage->hid) == HID_GD_Y) { + field->logical_maximum = 43200; + } + switch (usage->hid) { case HID_GD_X: features->x_max = field->logical_maximum; -- cgit v1.2.3 From a507a3065c09d69677c502905e597750da3a9815 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Tue, 19 Jun 2018 10:02:01 -0700 Subject: ACPI / processor: Finish making acpi_processor_ppc_has_changed() void Commit bca5f557dcea "ACPI / processor: Make acpi_processor_ppc_has_changed() void" changed one of the declarations of acpi_processor_ppc_has_changed() to return void, but the !CPU_FREQ version still returns int. Let's return void to be consistent. Fixes: bca5f557dcea "ACPI / processor: Make acpi_processor_ppc_has_changed() void" Signed-off-by: Brian Norris Signed-off-by: Rafael J. Wysocki --- include/acpi/processor.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 40a916efd7c0..1194a4c78d55 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -309,7 +309,7 @@ static inline void acpi_processor_ppc_exit(void) { return; } -static inline int acpi_processor_ppc_has_changed(struct acpi_processor *pr, +static inline void acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag) { static unsigned int printout = 1; @@ -320,7 +320,6 @@ static inline int acpi_processor_ppc_has_changed(struct acpi_processor *pr, "Consider compiling CPUfreq support into your kernel.\n"); printout = 0; } - return 0; } static inline int acpi_processor_get_bios_limit(int cpu, unsigned int *limit) { -- cgit v1.2.3 From 9560ba306df3e46b4b1037d101e2e4ca68610f55 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 7 Jun 2018 18:37:59 -0700 Subject: quota: reclaim least recently used dquots The dquots in the free_dquots list are not reclaimed in LRU way. put_dquot_last() puts entries to the tail and dqcache_shrink_scan() frees from the tail. Free unreferenced dquots in LRU order because it seems more reasonable than freeing most recently used. Signed-off-by: Greg Thelen Signed-off-by: Shakeel Butt Signed-off-by: Jan Kara --- fs/quota/dquot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index d88231e3b2be..241b00f835b9 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -716,7 +716,7 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) unsigned long freed = 0; spin_lock(&dq_list_lock); - head = free_dquots.prev; + head = free_dquots.next; while (head != &free_dquots && sc->nr_to_scan) { dquot = list_entry(head, struct dquot, dq_free); remove_dquot_hash(dquot); @@ -725,7 +725,7 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) do_destroy_dquot(dquot); sc->nr_to_scan--; freed++; - head = free_dquots.prev; + head = free_dquots.next; } spin_unlock(&dq_list_lock); return freed; -- cgit v1.2.3 From 1822193b5d4fd5d9800e6a7ed141375b8e8e68eb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 11 Jun 2018 12:14:45 +0200 Subject: quota: Cleanup list iteration in dqcache_shrink_scan() Use list_first_entry() and list_empty() instead of opencoded variants. Reviewed-by: Matthew Wilcox Signed-off-by: Jan Kara --- fs/quota/dquot.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 241b00f835b9..fc20e06c56ba 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -711,21 +711,18 @@ EXPORT_SYMBOL(dquot_quota_sync); static unsigned long dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct list_head *head; struct dquot *dquot; unsigned long freed = 0; spin_lock(&dq_list_lock); - head = free_dquots.next; - while (head != &free_dquots && sc->nr_to_scan) { - dquot = list_entry(head, struct dquot, dq_free); + while (!list_empty(&free_dquots) && sc->nr_to_scan) { + dquot = list_first_entry(&free_dquots, struct dquot, dq_free); remove_dquot_hash(dquot); remove_free_dquot(dquot); remove_inuse(dquot); do_destroy_dquot(dquot); sc->nr_to_scan--; freed++; - head = free_dquots.next; } spin_unlock(&dq_list_lock); return freed; -- cgit v1.2.3 From 27e6ed54a30a00d6520ddb4518214df8ff99daf1 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Fri, 8 Jun 2018 10:53:40 +0800 Subject: ext2: add warning when specifying nocheck option The option nocheck(nocheck/check=none) is useless but considering backwards compatibility it's better to print warning for a while before completely remove from the code. This patch add proper warning message for option 'nocheck' and remove unnecessary comment/function declaration which is used for removed option 'check'. Signed-off-by: Chengguang Xu Signed-off-by: Jan Kara --- fs/ext2/ext2.h | 2 -- fs/ext2/super.c | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index cc40802ddfa8..00e759f05161 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -748,7 +748,6 @@ extern void ext2_free_blocks (struct inode *, unsigned long, unsigned long); extern unsigned long ext2_count_free_blocks (struct super_block *); extern unsigned long ext2_count_dirs (struct super_block *); -extern void ext2_check_blocks_bitmap (struct super_block *); extern struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, unsigned int block_group, struct buffer_head ** bh); @@ -771,7 +770,6 @@ extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page extern struct inode * ext2_new_inode (struct inode *, umode_t, const struct qstr *); extern void ext2_free_inode (struct inode *); extern unsigned long ext2_count_free_inodes (struct super_block *); -extern void ext2_check_inodes_bitmap (struct super_block *); extern unsigned long ext2_count_free (struct buffer_head *, unsigned); /* inode.c */ diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 25ab1274090f..8ff53f8da3bc 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -557,6 +557,9 @@ static int parse_options(char *options, struct super_block *sb, set_opt (opts->s_mount_opt, NO_UID32); break; case Opt_nocheck: + ext2_msg(sb, KERN_WARNING, + "Option nocheck/check=none is deprecated and" + " will be removed in June 2020."); clear_opt (opts->s_mount_opt, CHECK); break; case Opt_debug: @@ -1335,9 +1338,6 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) new_opts.s_resgid = sbi->s_resgid; spin_unlock(&sbi->s_lock); - /* - * Allow the "check" option to be passed as a remount option. - */ if (!parse_options(data, sb, &new_opts)) return -EINVAL; -- cgit v1.2.3 From fa65653e575fbd958bdf5fb9c4a71a324e39510d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 13 Jun 2018 12:09:22 +0200 Subject: udf: Detect incorrect directory size Detect when a directory entry is (possibly partially) beyond directory size and return EIO in that case since it means the filesystem is corrupted. Otherwise directory operations can further corrupt the directory and possibly also oops the kernel. CC: Anatoly Trosinenko CC: stable@vger.kernel.org Reported-and-tested-by: Anatoly Trosinenko Signed-off-by: Jan Kara --- fs/udf/directory.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 0a98a2369738..3835f983cc99 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -152,6 +152,9 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, sizeof(struct fileIdentDesc)); } } + /* Got last entry outside of dir size - fs is corrupted! */ + if (*nf_pos > dir->i_size) + return NULL; return fi; } -- cgit v1.2.3 From f2e83347119acc0412941c5a23d895624c9300e2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 13 Jun 2018 17:30:14 +0200 Subject: udf: Provide function for calculating dir entry length Provide function for calculating directory entry length and use to reduce code duplication. Signed-off-by: Jan Kara --- fs/udf/directory.c | 5 +---- fs/udf/namei.c | 14 +++----------- fs/udf/udfdecl.h | 6 ++++++ 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 3835f983cc99..d9523013096f 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -141,10 +141,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, fibh->ebh->b_data, sizeof(struct fileIdentDesc) + fibh->soffset); - fi_len = (sizeof(struct fileIdentDesc) + - cfi->lengthFileIdent + - le16_to_cpu(cfi->lengthOfImpUse) + 3) & ~3; - + fi_len = udf_dir_entry_len(cfi); *nf_pos += fi_len - (fibh->eoffset - fibh->soffset); fibh->eoffset = fibh->soffset + fi_len; } else { diff --git a/fs/udf/namei.c b/fs/udf/namei.c index c586026508db..06f37ddd2997 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -351,8 +351,6 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, loff_t f_pos; loff_t size = udf_ext0_offset(dir) + dir->i_size; int nfidlen; - uint8_t lfi; - uint16_t liu; udf_pblk_t block; struct kernel_lb_addr eloc; uint32_t elen = 0; @@ -383,7 +381,7 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, namelen = 0; } - nfidlen = (sizeof(struct fileIdentDesc) + namelen + 3) & ~3; + nfidlen = ALIGN(sizeof(struct fileIdentDesc) + namelen, UDF_NAME_PAD); f_pos = udf_ext0_offset(dir); @@ -424,12 +422,8 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir, goto out_err; } - liu = le16_to_cpu(cfi->lengthOfImpUse); - lfi = cfi->lengthFileIdent; - if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { - if (((sizeof(struct fileIdentDesc) + - liu + lfi + 3) & ~3) == nfidlen) { + if (udf_dir_entry_len(cfi) == nfidlen) { cfi->descTag.tagSerialNum = cpu_to_le16(1); cfi->fileVersionNum = cpu_to_le16(1); cfi->fileCharacteristics = 0; @@ -1201,9 +1195,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, if (dir_fi) { dir_fi->icb.extLocation = cpu_to_lelb(UDF_I(new_dir)->i_location); - udf_update_tag((char *)dir_fi, - (sizeof(struct fileIdentDesc) + - le16_to_cpu(dir_fi->lengthOfImpUse) + 3) & ~3); + udf_update_tag((char *)dir_fi, udf_dir_entry_len(dir_fi)); if (old_iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) mark_inode_dirty(old_inode); else diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index bae311b59400..ed890bc4416d 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -132,6 +132,12 @@ struct inode *udf_find_metadata_inode_efe(struct super_block *sb, extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, struct fileIdentDesc *, struct udf_fileident_bh *, uint8_t *, uint8_t *); +static inline unsigned int udf_dir_entry_len(struct fileIdentDesc *cfi) +{ + return ALIGN(sizeof(struct fileIdentDesc) + + le16_to_cpu(cfi->lengthOfImpUse) + cfi->lengthFileIdent, + UDF_NAME_PAD); +} /* file.c */ extern long udf_ioctl(struct file *, unsigned int, unsigned long); -- cgit v1.2.3 From 6c1e4d06a3808dc67dbce2d631f4c12574567dd5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 13 Jun 2018 18:04:24 +0200 Subject: udf: Drop unused arguments of udf_delete_aext() udf_delete_aext() uses its last two arguments only as local variables. Drop them. Signed-off-by: Jan Kara --- fs/udf/balloc.c | 5 ++--- fs/udf/inode.c | 8 ++++---- fs/udf/udfdecl.h | 3 +-- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 1b961b1d9699..fcda0fc97b90 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -533,8 +533,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb, udf_write_aext(table, &epos, &eloc, (etype << 30) | elen, 1); } else - udf_delete_aext(table, epos, eloc, - (etype << 30) | elen); + udf_delete_aext(table, epos); } else { alloc_count = 0; } @@ -630,7 +629,7 @@ static udf_pblk_t udf_table_new_block(struct super_block *sb, if (goal_elen) udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1); else - udf_delete_aext(table, goal_epos, goal_eloc, goal_elen); + udf_delete_aext(table, goal_epos); brelse(goal_epos.bh); udf_add_free_space(sb, partition, -1); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 7f39d17352c9..9915a58fbabd 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1147,8 +1147,7 @@ static void udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr if (startnum > endnum) { for (i = 0; i < (startnum - endnum); i++) - udf_delete_aext(inode, *epos, laarr[i].extLocation, - laarr[i].extLength); + udf_delete_aext(inode, *epos); } else if (startnum < endnum) { for (i = 0; i < (endnum - startnum); i++) { udf_insert_aext(inode, *epos, laarr[i].extLocation, @@ -2176,14 +2175,15 @@ static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos, return (nelen >> 30); } -int8_t udf_delete_aext(struct inode *inode, struct extent_position epos, - struct kernel_lb_addr eloc, uint32_t elen) +int8_t udf_delete_aext(struct inode *inode, struct extent_position epos) { struct extent_position oepos; int adsize; int8_t etype; struct allocExtDesc *aed; struct udf_inode_info *iinfo; + struct kernel_lb_addr eloc; + uint32_t elen; if (epos.bh) { get_bh(epos.bh); diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index ed890bc4416d..84c47dde4d26 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -173,8 +173,7 @@ extern int udf_add_aext(struct inode *, struct extent_position *, struct kernel_lb_addr *, uint32_t, int); extern void udf_write_aext(struct inode *, struct extent_position *, struct kernel_lb_addr *, uint32_t, int); -extern int8_t udf_delete_aext(struct inode *, struct extent_position, - struct kernel_lb_addr, uint32_t); +extern int8_t udf_delete_aext(struct inode *, struct extent_position); extern int8_t udf_next_aext(struct inode *, struct extent_position *, struct kernel_lb_addr *, uint32_t *, int); extern int8_t udf_current_aext(struct inode *, struct extent_position *, -- cgit v1.2.3 From 03eeafdd9ab06a770d42c2b264d50dff7e2f4eee Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 24 May 2018 09:26:38 -0400 Subject: locking/rwsem: Fix up_read_non_owner() warning with DEBUG_RWSEMS It was found that the use of up_read_non_owner() in NFS was causing the following warning when DEBUG_RWSEMS was configured. DEBUG_LOCKS_WARN_ON(sem->owner != ((struct task_struct *)(1UL << 0))) Looking into the rwsem.c file, it was discovered that the corresponding down_read_non_owner() function was not setting the owner field properly. This is fixed now, and the warning should be gone. Fixes: 5149cbac4235 ("locking/rwsem: Add DEBUG_RWSEMS to look for lock/unlock mismatches") Signed-off-by: Waiman Long Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Tested-by: Gavin Schenk Cc: Davidlohr Bueso Cc: Dan Williams Cc: Arnd Bergmann Cc: linux-nfs@vger.kernel.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1527168398-4291-1-git-send-email-longman@redhat.com --- kernel/locking/rwsem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index bc1e507be9ff..776308d2fa9e 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -181,6 +181,7 @@ void down_read_non_owner(struct rw_semaphore *sem) might_sleep(); __down_read(sem); + rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read_non_owner); -- cgit v1.2.3 From 3ae6295ccb7cf6d344908209701badbbbb503e40 Mon Sep 17 00:00:00 2001 From: Siarhei Liakh Date: Thu, 14 Jun 2018 19:36:07 +0000 Subject: x86: Call fixup_exception() before notify_die() in math_error() fpu__drop() has an explicit fwait which under some conditions can trigger a fixable FPU exception while in kernel. Thus, we should attempt to fixup the exception first, and only call notify_die() if the fixup failed just like in do_general_protection(). The original call sequence incorrectly triggers KDB entry on debug kernels under particular FPU-intensive workloads. Andy noted, that this makes the whole conditional irq enable thing even more inconsistent, but fixing that it outside the scope of this. Signed-off-by: Siarhei Liakh Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Cc: "H. Peter Anvin" Cc: "Borislav Petkov" Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/DM5PR11MB201156F1CAB2592B07C79A03B17D0@DM5PR11MB2011.namprd11.prod.outlook.com --- arch/x86/kernel/traps.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 03f3d7695dac..162a31d80ad5 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -834,16 +834,18 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : "simd exception"; - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP) - return; cond_local_irq_enable(regs); if (!user_mode(regs)) { - if (!fixup_exception(regs, trapnr)) { - task->thread.error_code = error_code; - task->thread.trap_nr = trapnr; + if (fixup_exception(regs, trapnr)) + return; + + task->thread.error_code = error_code; + task->thread.trap_nr = trapnr; + + if (notify_die(DIE_TRAP, str, regs, error_code, + trapnr, SIGFPE) != NOTIFY_STOP) die(str, regs, error_code); - } return; } -- cgit v1.2.3 From 3d0641015bf73aaa1cb54c936674959e7805070f Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 19 Jun 2018 15:34:09 +0300 Subject: nvme-rdma: fix possible double free condition when failing to create a controller Failures after nvme_init_ctrl will defer resource cleanups to .free_ctrl when the reference is released, hence we should not free the controller queues for these failures. Fix that by moving controller queues allocation before controller initialization and correctly freeing them for failures before initialization and skip them for failures after initialization. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index c9424da0d23e..bcb0e5d6343d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -888,9 +888,9 @@ static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) list_del(&ctrl->list); mutex_unlock(&nvme_rdma_ctrl_mutex); - kfree(ctrl->queues); nvmf_free_options(nctrl->opts); free_ctrl: + kfree(ctrl->queues); kfree(ctrl); } @@ -1932,11 +1932,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, goto out_free_ctrl; } - ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops, - 0 /* no quirks, we're perfect! */); - if (ret) - goto out_free_ctrl; - INIT_DELAYED_WORK(&ctrl->reconnect_work, nvme_rdma_reconnect_ctrl_work); INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); @@ -1950,14 +1945,19 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues), GFP_KERNEL); if (!ctrl->queues) - goto out_uninit_ctrl; + goto out_free_ctrl; + + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops, + 0 /* no quirks, we're perfect! */); + if (ret) + goto out_kfree_queues; changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING); WARN_ON_ONCE(!changed); ret = nvme_rdma_configure_admin_queue(ctrl, true); if (ret) - goto out_kfree_queues; + goto out_uninit_ctrl; /* sanity check icdoff */ if (ctrl->ctrl.icdoff) { @@ -2014,14 +2014,14 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, out_remove_admin_queue: nvme_rdma_destroy_admin_queue(ctrl, true); -out_kfree_queues: - kfree(ctrl->queues); out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); if (ret > 0) ret = -EIO; return ERR_PTR(ret); +out_kfree_queues: + kfree(ctrl->queues); out_free_ctrl: kfree(ctrl); return ERR_PTR(ret); -- cgit v1.2.3 From 94e42213cc1ae41c57819539c0130f8dfc69d718 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 19 Jun 2018 15:34:10 +0300 Subject: nvme-rdma: fix possible free of a non-allocated async event buffer If nvme_rdma_configure_admin_queue fails before we allocated the async event buffer, we will falsly free it because nvme_rdma_free_queue is freeing it. Fix it by allocating the buffer right after nvme_rdma_alloc_queue and free it right before nvme_rdma_queue_free to maintain orderly reverse cleanup sequence. Reported-by: Israel Rukshin Signed-off-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index bcb0e5d6343d..f9affb71ac85 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -560,12 +560,6 @@ static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags)) return; - if (nvme_rdma_queue_idx(queue) == 0) { - nvme_rdma_free_qe(queue->device->dev, - &queue->ctrl->async_event_sqe, - sizeof(struct nvme_command), DMA_TO_DEVICE); - } - nvme_rdma_destroy_queue_ib(queue); rdma_destroy_id(queue->cm_id); } @@ -739,6 +733,8 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, blk_cleanup_queue(ctrl->ctrl.admin_q); nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); } + nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe, + sizeof(struct nvme_command), DMA_TO_DEVICE); nvme_rdma_free_queue(&ctrl->queues[0]); } @@ -755,11 +751,16 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev); + error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe, + sizeof(struct nvme_command), DMA_TO_DEVICE); + if (error) + goto out_free_queue; + if (new) { ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true); if (IS_ERR(ctrl->ctrl.admin_tagset)) { error = PTR_ERR(ctrl->ctrl.admin_tagset); - goto out_free_queue; + goto out_free_async_qe; } ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); @@ -795,12 +796,6 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, if (error) goto out_stop_queue; - error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev, - &ctrl->async_event_sqe, sizeof(struct nvme_command), - DMA_TO_DEVICE); - if (error) - goto out_stop_queue; - return 0; out_stop_queue: @@ -811,6 +806,9 @@ out_cleanup_queue: out_free_tagset: if (new) nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); +out_free_async_qe: + nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe, + sizeof(struct nvme_command), DMA_TO_DEVICE); out_free_queue: nvme_rdma_free_queue(&ctrl->queues[0]); return error; -- cgit v1.2.3 From c947657b15379505a9bba36a02005882b66abe57 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 19 Jun 2018 15:34:11 +0300 Subject: nvme-rdma: Fix command completion race at error recovery The race is between completing the request at error recovery work and rdma completions. If we cancel the request before getting the good rdma completion we get a NULL deref of the request MR at nvme_rdma_process_nvme_rsp(). When Canceling the request we return its mr to the mr pool (set mr to NULL) and also unmap its data. Canceling the requests while the rdma queues are active is not safe. Because rdma queues are active and we get good rdma completions that can use the mr pointer which may be NULL. Completing the request too soon may lead also to performing DMA to/from user buffers which might have been already unmapped. The commit fixes the race by draining the QP before starting the abort commands mechanism. Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index f9affb71ac85..2815749f4dfb 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -728,7 +728,6 @@ out: static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, bool remove) { - nvme_rdma_stop_queue(&ctrl->queues[0]); if (remove) { blk_cleanup_queue(ctrl->ctrl.admin_q); nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); @@ -817,7 +816,6 @@ out_free_queue: static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl, bool remove) { - nvme_rdma_stop_io_queues(ctrl); if (remove) { blk_cleanup_queue(ctrl->ctrl.connect_q); nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); @@ -947,6 +945,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) return; destroy_admin: + nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_rdma_destroy_admin_queue(ctrl, false); requeue: dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", @@ -963,12 +962,14 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); + nvme_rdma_stop_io_queues(ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); nvme_rdma_destroy_io_queues(ctrl, false); } blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_rdma_stop_queue(&ctrl->queues[0]); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, &ctrl->ctrl); nvme_rdma_destroy_admin_queue(ctrl, false); @@ -1734,6 +1735,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) { if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); + nvme_rdma_stop_io_queues(ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); nvme_rdma_destroy_io_queues(ctrl, shutdown); @@ -1745,6 +1747,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + nvme_rdma_stop_queue(&ctrl->queues[0]); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, &ctrl->ctrl); blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); @@ -2011,6 +2014,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, return &ctrl->ctrl; out_remove_admin_queue: + nvme_rdma_stop_queue(&ctrl->queues[0]); nvme_rdma_destroy_admin_queue(ctrl, true); out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); -- cgit v1.2.3 From 5e77d61cbc7e766778037127dab69e6410a8fc48 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 19 Jun 2018 15:34:13 +0300 Subject: nvme-rdma: don't override opts->queue_size That is user argument, and theoretically controller limits can change over time (over reconnects/resets). Instead, use the sqsize controller attribute to check queue depth boundaries and use it to the tagset allocation. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2815749f4dfb..9544625c0b7d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -692,7 +692,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set = &ctrl->tag_set; memset(set, 0, sizeof(*set)); set->ops = &nvme_rdma_mq_ops; - set->queue_depth = nctrl->opts->queue_size; + set->queue_depth = nctrl->sqsize + 1; set->reserved_tags = 1; /* fabric connect */ set->numa_node = NUMA_NO_NODE; set->flags = BLK_MQ_F_SHOULD_MERGE; @@ -1975,20 +1975,19 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, goto out_remove_admin_queue; } - if (opts->queue_size > ctrl->ctrl.maxcmd) { - /* warn if maxcmd is lower than queue_size */ - dev_warn(ctrl->ctrl.device, - "queue_size %zu > ctrl maxcmd %u, clamping down\n", - opts->queue_size, ctrl->ctrl.maxcmd); - opts->queue_size = ctrl->ctrl.maxcmd; - } - + /* only warn if argument is too large here, will clamp later */ if (opts->queue_size > ctrl->ctrl.sqsize + 1) { - /* warn if sqsize is lower than queue_size */ dev_warn(ctrl->ctrl.device, "queue_size %zu > ctrl sqsize %u, clamping down\n", opts->queue_size, ctrl->ctrl.sqsize + 1); - opts->queue_size = ctrl->ctrl.sqsize + 1; + } + + /* warn if maxcmd is lower than sqsize+1 */ + if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) { + dev_warn(ctrl->ctrl.device, + "sqsize %u > ctrl maxcmd %u, clamping down\n", + ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd); + ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1; } if (opts->nr_io_queues) { -- cgit v1.2.3 From d68a90e148f5a82aa67654c5012071e31c0e4baa Mon Sep 17 00:00:00 2001 From: Max Gurtuvoy Date: Tue, 19 Jun 2018 15:45:33 +0300 Subject: nvmet: reset keep alive timer in controller enable Controllers that are not yet enabled should not really enforce keep alive timeouts, but we still want to track a timeout and cleanup in case a host died before it enabled the controller. Hence, simply reset the keep alive timer when the controller is enabled. Suggested-by: Max Gurtovoy Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index a03da764ecae..74d4b785d2da 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -686,6 +686,14 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl) } ctrl->csts = NVME_CSTS_RDY; + + /* + * Controllers that are not yet enabled should not really enforce the + * keep alive timeout, but we still want to track a timeout and cleanup + * in case a host died before it enabled the controller. Hence, simply + * reset the keep alive timer when the controller is enabled. + */ + mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ); } static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl) -- cgit v1.2.3 From a1e79188628580465ac6d7a93a313336ee3364f1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 20 Jun 2018 13:45:05 +0300 Subject: blk-mq-debugfs: Off by one in blk_mq_rq_state_name() If rq_state == ARRAY_SIZE() then we read one element beyond the end of the blk_mq_rq_state_name_array[] array. Fixes: ec6dcf63c55c ("blk-mq-debugfs: Show more request state information") Reviewed-by: Bart Van Assche Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index ffa622366922..1c4532e92938 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -356,7 +356,7 @@ static const char *const blk_mq_rq_state_name_array[] = { static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state) { - if (WARN_ON_ONCE((unsigned int)rq_state > + if (WARN_ON_ONCE((unsigned int)rq_state >= ARRAY_SIZE(blk_mq_rq_state_name_array))) return "(?)"; return blk_mq_rq_state_name_array[rq_state]; -- cgit v1.2.3 From bdd5ae3aa51939bb1fd26cd9fe7af07ca8c60397 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 6 Jun 2018 17:18:36 -0400 Subject: tools/power turbostat: fix show/hide issues resulting from mis-merge The --show and --hide options failed on "Node", which was listed as "Node%". The --show and --hide options were generally fouled-up do due to come content merges that scrambled the list of column name indexes. Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 106 +++++++++++++++++----------------- 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index d6cff3070ebd..f09a272941a1 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -381,19 +381,23 @@ int get_msr(int cpu, off_t offset, unsigned long long *msr) } /* - * Each string in this array is compared in --show and --hide cmdline. - * Thus, strings that are proper sub-sets must follow their more specific peers. + * This list matches the column headers, except + * 1. built-in only, the sysfs counters are not here -- we learn of those at run-time + * 2. Core and CPU are moved to the end, we can't have strings that contain them + * matching on them for --show and --hide. */ struct msr_counter bic[] = { { 0x0, "usec" }, { 0x0, "Time_Of_Day_Seconds" }, { 0x0, "Package" }, + { 0x0, "Node" }, { 0x0, "Avg_MHz" }, + { 0x0, "Busy%" }, { 0x0, "Bzy_MHz" }, { 0x0, "TSC_MHz" }, { 0x0, "IRQ" }, { 0x0, "SMI", "", 32, 0, FORMAT_DELTA, NULL}, - { 0x0, "Busy%" }, + { 0x0, "sysfs" }, { 0x0, "CPU%c1" }, { 0x0, "CPU%c3" }, { 0x0, "CPU%c6" }, @@ -424,15 +428,13 @@ struct msr_counter bic[] = { { 0x0, "Cor_J" }, { 0x0, "GFX_J" }, { 0x0, "RAM_J" }, - { 0x0, "Core" }, - { 0x0, "CPU" }, { 0x0, "Mod%c6" }, - { 0x0, "sysfs" }, { 0x0, "Totl%C0" }, { 0x0, "Any%C0" }, { 0x0, "GFX%C0" }, { 0x0, "CPUGFX%" }, - { 0x0, "Node%" }, + { 0x0, "Core" }, + { 0x0, "CPU" }, }; @@ -441,51 +443,51 @@ struct msr_counter bic[] = { #define BIC_USEC (1ULL << 0) #define BIC_TOD (1ULL << 1) #define BIC_Package (1ULL << 2) -#define BIC_Avg_MHz (1ULL << 3) -#define BIC_Bzy_MHz (1ULL << 4) -#define BIC_TSC_MHz (1ULL << 5) -#define BIC_IRQ (1ULL << 6) -#define BIC_SMI (1ULL << 7) -#define BIC_Busy (1ULL << 8) -#define BIC_CPU_c1 (1ULL << 9) -#define BIC_CPU_c3 (1ULL << 10) -#define BIC_CPU_c6 (1ULL << 11) -#define BIC_CPU_c7 (1ULL << 12) -#define BIC_ThreadC (1ULL << 13) -#define BIC_CoreTmp (1ULL << 14) -#define BIC_CoreCnt (1ULL << 15) -#define BIC_PkgTmp (1ULL << 16) -#define BIC_GFX_rc6 (1ULL << 17) -#define BIC_GFXMHz (1ULL << 18) -#define BIC_Pkgpc2 (1ULL << 19) -#define BIC_Pkgpc3 (1ULL << 20) -#define BIC_Pkgpc6 (1ULL << 21) -#define BIC_Pkgpc7 (1ULL << 22) -#define BIC_Pkgpc8 (1ULL << 23) -#define BIC_Pkgpc9 (1ULL << 24) -#define BIC_Pkgpc10 (1ULL << 25) -#define BIC_CPU_LPI (1ULL << 26) -#define BIC_SYS_LPI (1ULL << 27) -#define BIC_PkgWatt (1ULL << 26) -#define BIC_CorWatt (1ULL << 27) -#define BIC_GFXWatt (1ULL << 28) -#define BIC_PkgCnt (1ULL << 29) -#define BIC_RAMWatt (1ULL << 30) -#define BIC_PKG__ (1ULL << 31) -#define BIC_RAM__ (1ULL << 32) -#define BIC_Pkg_J (1ULL << 33) -#define BIC_Cor_J (1ULL << 34) -#define BIC_GFX_J (1ULL << 35) -#define BIC_RAM_J (1ULL << 36) -#define BIC_Core (1ULL << 37) -#define BIC_CPU (1ULL << 38) -#define BIC_Mod_c6 (1ULL << 39) -#define BIC_sysfs (1ULL << 40) -#define BIC_Totl_c0 (1ULL << 41) -#define BIC_Any_c0 (1ULL << 42) -#define BIC_GFX_c0 (1ULL << 43) -#define BIC_CPUGFX (1ULL << 44) -#define BIC_Node (1ULL << 45) +#define BIC_Node (1ULL << 3) +#define BIC_Avg_MHz (1ULL << 4) +#define BIC_Busy (1ULL << 5) +#define BIC_Bzy_MHz (1ULL << 6) +#define BIC_TSC_MHz (1ULL << 7) +#define BIC_IRQ (1ULL << 8) +#define BIC_SMI (1ULL << 9) +#define BIC_sysfs (1ULL << 10) +#define BIC_CPU_c1 (1ULL << 11) +#define BIC_CPU_c3 (1ULL << 12) +#define BIC_CPU_c6 (1ULL << 13) +#define BIC_CPU_c7 (1ULL << 14) +#define BIC_ThreadC (1ULL << 15) +#define BIC_CoreTmp (1ULL << 16) +#define BIC_CoreCnt (1ULL << 17) +#define BIC_PkgTmp (1ULL << 18) +#define BIC_GFX_rc6 (1ULL << 19) +#define BIC_GFXMHz (1ULL << 20) +#define BIC_Pkgpc2 (1ULL << 21) +#define BIC_Pkgpc3 (1ULL << 22) +#define BIC_Pkgpc6 (1ULL << 23) +#define BIC_Pkgpc7 (1ULL << 24) +#define BIC_Pkgpc8 (1ULL << 25) +#define BIC_Pkgpc9 (1ULL << 26) +#define BIC_Pkgpc10 (1ULL << 27) +#define BIC_CPU_LPI (1ULL << 28) +#define BIC_SYS_LPI (1ULL << 29) +#define BIC_PkgWatt (1ULL << 30) +#define BIC_CorWatt (1ULL << 31) +#define BIC_GFXWatt (1ULL << 32) +#define BIC_PkgCnt (1ULL << 33) +#define BIC_RAMWatt (1ULL << 34) +#define BIC_PKG__ (1ULL << 35) +#define BIC_RAM__ (1ULL << 36) +#define BIC_Pkg_J (1ULL << 37) +#define BIC_Cor_J (1ULL << 38) +#define BIC_GFX_J (1ULL << 39) +#define BIC_RAM_J (1ULL << 40) +#define BIC_Mod_c6 (1ULL << 41) +#define BIC_Totl_c0 (1ULL << 42) +#define BIC_Any_c0 (1ULL << 43) +#define BIC_GFX_c0 (1ULL << 44) +#define BIC_CPUGFX (1ULL << 45) +#define BIC_Core (1ULL << 46) +#define BIC_CPU (1ULL << 47) #define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD) -- cgit v1.2.3 From d9d226ffadbf4a8e60f5b8fc866aaa5028c7e479 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 6 Jun 2018 15:47:36 -0400 Subject: tools/power turbostat: decode cpuid.1.HT eg. the "HT" here: CPUID(1): SSE3 MONITOR - EIST TM2 TSC MSR ACPI-TM HT TM Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f09a272941a1..b0f294bf89c3 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4394,7 +4394,7 @@ void process_cpuid() if (!quiet) { fprintf(outf, "%d CPUID levels; family:model:stepping 0x%x:%x:%x (%d:%d:%d)\n", max_level, family, model, stepping, family, model, stepping); - fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s\n", + fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n", ecx & (1 << 0) ? "SSE3" : "-", ecx & (1 << 3) ? "MONITOR" : "-", ecx & (1 << 6) ? "SMX" : "-", @@ -4403,6 +4403,7 @@ void process_cpuid() edx & (1 << 4) ? "TSC" : "-", edx & (1 << 5) ? "MSR" : "-", edx & (1 << 22) ? "ACPI-TM" : "-", + edx & (1 << 28) ? "HT" : "-", edx & (1 << 29) ? "TM" : "-"); } -- cgit v1.2.3 From 4c2122d42116ebaa1665ad0fbef2c558fdc0e3c6 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 6 Jun 2018 17:44:48 -0400 Subject: tools/power turbostat: add optional APIC X2APIC columns Add APIC and X2APIC columns to the topology section. They are disabled-by-default -- enable like so: --debug or --enable APIC,X2APIC Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.8 | 2 +- tools/power/x86/turbostat/turbostat.c | 75 +++++++++++++++++++++++++++++++---- 2 files changed, 68 insertions(+), 9 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8 index ca9ef7017624..d39e4ff7d0bf 100644 --- a/tools/power/x86/turbostat/turbostat.8 +++ b/tools/power/x86/turbostat/turbostat.8 @@ -56,7 +56,7 @@ name as necessary to disambiguate it from others is necessary. Note that option .PP \fB--hide column\fP do not show the specified built-in columns. May be invoked multiple times, or with a comma-separated list of column names. Use "--hide sysfs" to hide the sysfs statistics columns as a group. .PP -\fB--enable column\fP show the specified built-in columns, which are otherwise disabled, by default. Currently the only built-in counters disabled by default are "usec" and "Time_Of_Day_Seconds". +\fB--enable column\fP show the specified built-in columns, which are otherwise disabled, by default. Currently the only built-in counters disabled by default are "usec", "Time_Of_Day_Seconds", "APIC" and "X2APIC". The column name "all" can be used to enable all disabled-by-default built-in counters. .PP \fB--show column\fP show only the specified built-in columns. May be invoked multiple times, or with a comma-separated list of column names. Use "--show sysfs" to show the sysfs statistics columns as a group. diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b0f294bf89c3..3bc2c9d94739 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -109,6 +109,7 @@ unsigned int has_hwp_activity_window; /* IA32_HWP_REQUEST[bits 41:32] */ unsigned int has_hwp_epp; /* IA32_HWP_REQUEST[bits 31:24] */ unsigned int has_hwp_pkg; /* IA32_HWP_REQUEST_PKG */ unsigned int has_misc_feature_control; +unsigned int first_counter_read = 1; #define RAPL_PKG (1 << 0) /* 0x610 MSR_PKG_POWER_LIMIT */ @@ -170,6 +171,8 @@ struct thread_data { unsigned long long irq_count; unsigned int smi_count; unsigned int cpu_id; + unsigned int apic_id; + unsigned int x2apic_id; unsigned int flags; #define CPU_IS_FIRST_THREAD_IN_CORE 0x2 #define CPU_IS_FIRST_CORE_IN_PACKAGE 0x4 @@ -435,10 +438,10 @@ struct msr_counter bic[] = { { 0x0, "CPUGFX%" }, { 0x0, "Core" }, { 0x0, "CPU" }, + { 0x0, "APIC" }, + { 0x0, "X2APIC" }, }; - - #define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter)) #define BIC_USEC (1ULL << 0) #define BIC_TOD (1ULL << 1) @@ -488,11 +491,13 @@ struct msr_counter bic[] = { #define BIC_CPUGFX (1ULL << 45) #define BIC_Core (1ULL << 46) #define BIC_CPU (1ULL << 47) +#define BIC_APIC (1ULL << 48) +#define BIC_X2APIC (1ULL << 49) -#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD) +#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC) unsigned long long bic_enabled = (0xFFFFFFFFFFFFFFFFULL & ~BIC_DISABLED_BY_DEFAULT); -unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs; +unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC_X2APIC; #define DO_BIC(COUNTER_NAME) (bic_enabled & bic_present & COUNTER_NAME) #define ENABLE_BIC(COUNTER_NAME) (bic_enabled |= COUNTER_NAME) @@ -603,6 +608,10 @@ void print_header(char *delim) outp += sprintf(outp, "%sCore", (printed++ ? delim : "")); if (DO_BIC(BIC_CPU)) outp += sprintf(outp, "%sCPU", (printed++ ? delim : "")); + if (DO_BIC(BIC_APIC)) + outp += sprintf(outp, "%sAPIC", (printed++ ? delim : "")); + if (DO_BIC(BIC_X2APIC)) + outp += sprintf(outp, "%sX2APIC", (printed++ ? delim : "")); if (DO_BIC(BIC_Avg_MHz)) outp += sprintf(outp, "%sAvg_MHz", (printed++ ? delim : "")); if (DO_BIC(BIC_Busy)) @@ -882,6 +891,10 @@ int format_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, "%s-", (printed++ ? delim : "")); if (DO_BIC(BIC_CPU)) outp += sprintf(outp, "%s-", (printed++ ? delim : "")); + if (DO_BIC(BIC_APIC)) + outp += sprintf(outp, "%s-", (printed++ ? delim : "")); + if (DO_BIC(BIC_X2APIC)) + outp += sprintf(outp, "%s-", (printed++ ? delim : "")); } else { if (DO_BIC(BIC_Package)) { if (p) @@ -906,6 +919,10 @@ int format_counters(struct thread_data *t, struct core_data *c, } if (DO_BIC(BIC_CPU)) outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->cpu_id); + if (DO_BIC(BIC_APIC)) + outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->apic_id); + if (DO_BIC(BIC_X2APIC)) + outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->x2apic_id); } if (DO_BIC(BIC_Avg_MHz)) @@ -1233,6 +1250,12 @@ delta_thread(struct thread_data *new, struct thread_data *old, int i; struct msr_counter *mp; + /* we run cpuid just the 1st time, copy the results */ + if (DO_BIC(BIC_APIC)) + new->apic_id = old->apic_id; + if (DO_BIC(BIC_X2APIC)) + new->x2apic_id = old->x2apic_id; + /* * the timestamps from start of measurement interval are in "old" * the timestamp from end of measurement interval are in "new" @@ -1395,6 +1418,12 @@ int sum_counters(struct thread_data *t, struct core_data *c, int i; struct msr_counter *mp; + /* copy un-changing apic_id's */ + if (DO_BIC(BIC_APIC)) + average.threads.apic_id = t->apic_id; + if (DO_BIC(BIC_X2APIC)) + average.threads.x2apic_id = t->x2apic_id; + /* remember first tv_begin */ if (average.threads.tv_begin.tv_sec == 0) average.threads.tv_begin = t->tv_begin; @@ -1621,6 +1650,34 @@ int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) return 0; } +void get_apic_id(struct thread_data *t) +{ + unsigned int eax, ebx, ecx, edx, max_level; + + eax = ebx = ecx = edx = 0; + + if (!genuine_intel) + return; + + __cpuid(0, max_level, ebx, ecx, edx); + + __cpuid(1, eax, ebx, ecx, edx); + t->apic_id = (ebx >> 24) & 0xf; + + if (max_level < 0xb) + return; + + if (!DO_BIC(BIC_X2APIC)) + return; + + ecx = 0; + __cpuid(0xb, eax, ebx, ecx, edx); + t->x2apic_id = edx; + + if (debug && (t->apic_id != t->x2apic_id)) + fprintf(stderr, "cpu%d: apic 0x%x x2apic 0x%x\n", t->cpu_id, t->apic_id, t->x2apic_id); +} + /* * get_counters(...) * migrate to cpu @@ -1634,7 +1691,6 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) struct msr_counter *mp; int i; - gettimeofday(&t->tv_begin, (struct timezone *)NULL); if (cpu_migrate(cpu)) { @@ -1642,6 +1698,8 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -1; } + if (first_counter_read) + get_apic_id(t); retry: t->tsc = rdtsc(); /* we are running on local CPU of interest */ @@ -2881,6 +2939,7 @@ void do_sleep(void) } } + void turbostat_loop() { int retval; @@ -2894,6 +2953,7 @@ restart: snapshot_proc_sysfs_files(); retval = for_all_cpus(get_counters, EVEN_COUNTERS); + first_counter_read = 0; if (retval < -1) { exit(retval); } else if (retval == -1) { @@ -4655,7 +4715,6 @@ void process_cpuid() return; } - /* * in /dev/cpu/ return success for names that are numbers * ie. filter out ".", "..", "microcode". @@ -4949,6 +5008,7 @@ int fork_it(char **argv) snapshot_proc_sysfs_files(); status = for_all_cpus(get_counters, EVEN_COUNTERS); + first_counter_read = 0; if (status) exit(status); /* clear affinity side-effect of get_counters() */ @@ -5384,7 +5444,7 @@ void cmdline(int argc, char **argv) break; case 'e': /* --enable specified counter */ - bic_enabled |= bic_lookup(optarg, SHOW_LIST); + bic_enabled = bic_enabled | bic_lookup(optarg, SHOW_LIST); break; case 'd': debug++; @@ -5468,7 +5528,6 @@ void cmdline(int argc, char **argv) int main(int argc, char **argv) { outf = stderr; - cmdline(argc, argv); if (!quiet) -- cgit v1.2.3 From 42dd45209201edb222de5f9eadc1c8f93700ef28 Mon Sep 17 00:00:00 2001 From: Nathan Ciobanu Date: Fri, 8 Jun 2018 15:15:12 -0700 Subject: tools/power turbostat: fix segfault on 'no node' machines Running turbostat on machines that don't expose nodes in sysfs (no /sys/bus/node) causes a segfault or a -nan value diesplayed in the log. This is caused by physical_node_id being reported as -1 and logical_node_id being calculated as a negative number resulting in the new GET_THREAD/GET_CORE returning an incorrect address. Signed-off-by: Nathan Ciobanu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 3bc2c9d94739..97cc00a9c763 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2492,6 +2492,12 @@ void set_node_data(void) if (pni[pkg].count > topo.nodes_per_pkg) topo.nodes_per_pkg = pni[0].count; + /* Fake 1 node per pkg for machines that don't + * expose nodes and thus avoid -nan results + */ + if (topo.nodes_per_pkg == 0) + topo.nodes_per_pkg = 1; + for (cpu = 0; cpu < topo.num_cpus; cpu++) { pkg = cpus[cpu].physical_package_id; node = cpus[cpu].physical_node_id; @@ -4904,6 +4910,13 @@ void init_counter(struct thread_data *thread_base, struct core_data *core_base, struct core_data *c; struct pkg_data *p; + + /* Workaround for systems where physical_node_id==-1 + * and logical_node_id==(-1 - topo.num_cpus) + */ + if (node_id < 0) + node_id = 0; + t = GET_THREAD(thread_base, thread_id, core_id, node_id, pkg_id); c = GET_CORE(core_base, core_id, node_id, pkg_id); p = GET_PKG(pkg_base, pkg_id); -- cgit v1.2.3 From 2ee19bdea1bbc04a06606b5c9681a07d005ecbaf Mon Sep 17 00:00:00 2001 From: Nathan Ciobanu Date: Wed, 13 Jun 2018 19:51:32 -0700 Subject: tools/power turbostat: alphabetize the help output Sort the command line arguments output of help() in alphabetical order in line with other linux tools. Signed-off-by: Nathan Ciobanu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 97cc00a9c763..d33b655299ba 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -528,12 +528,12 @@ void help(void) " eg. --add msr0x10,u64,cpu,delta,MY_TSC\n" "--cpu cpu-set limit output to summary plus cpu-set:\n" " {core | package | j,k,l..m,n-p }\n" - "--quiet skip decoding system configuration header\n" "--interval sec.subsec Override default 5-second measurement interval\n" - "--help print this help message\n" "--list list column headers only\n" "--num_iterations num number of the measurement iterations\n" "--out file create or truncate \"file\" for all output\n" + "--quiet skip decoding system configuration header\n" + "--help print this help message\n" "--version print version information\n" "\n" "For more help, run \"man turbostat\"\n"); -- cgit v1.2.3 From cc4816503f835c7cea184776fe8ae5bb3f505083 Mon Sep 17 00:00:00 2001 From: Nathan Ciobanu Date: Wed, 13 Jun 2018 19:51:33 -0700 Subject: tools/power turbostat: add single character tokens to help Improve the help() output by adding the single character tokens (e.g -a). Signed-off-by: Nathan Ciobanu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index d33b655299ba..2dcc05f3ee6f 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -524,17 +524,20 @@ void help(void) "when COMMAND completes.\n" "If no COMMAND is specified, turbostat wakes every 5-seconds\n" "to print statistics, until interrupted.\n" - "--add add a counter\n" - " eg. --add msr0x10,u64,cpu,delta,MY_TSC\n" - "--cpu cpu-set limit output to summary plus cpu-set:\n" - " {core | package | j,k,l..m,n-p }\n" - "--interval sec.subsec Override default 5-second measurement interval\n" - "--list list column headers only\n" - "--num_iterations num number of the measurement iterations\n" - "--out file create or truncate \"file\" for all output\n" - "--quiet skip decoding system configuration header\n" - "--help print this help message\n" - "--version print version information\n" + " -a, --add add a counter\n" + " eg. --add msr0x10,u64,cpu,delta,MY_TSC\n" + " -c, --cpu cpu-set limit output to summary plus cpu-set:\n" + " {core | package | j,k,l..m,n-p }\n" + " -i, --interval sec.subsec\n" + " Override default 5-second measurement interval\n" + " -l, --list list column headers only\n" + " -n, --num_iterations num\n" + " number of the measurement iterations\n" + " -o, --out file\n" + " create or truncate \"file\" for all output\n" + " -q, --quiet skip decoding system configuration header\n" + " -h, --help print this help message\n" + " -v, --version print version information\n" "\n" "For more help, run \"man turbostat\"\n"); } -- cgit v1.2.3 From 9ce80578d5f5279545c272563851d059a2359f3e Mon Sep 17 00:00:00 2001 From: Nathan Ciobanu Date: Wed, 13 Jun 2018 19:51:34 -0700 Subject: tools/power turbostat: add the missing command line switches Document the missing command line tokens in the help() function. Signed-off-by: Nathan Ciobanu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 2dcc05f3ee6f..108c3bf2a67c 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -528,14 +528,28 @@ void help(void) " eg. --add msr0x10,u64,cpu,delta,MY_TSC\n" " -c, --cpu cpu-set limit output to summary plus cpu-set:\n" " {core | package | j,k,l..m,n-p }\n" + " -d, --debug displays usec, Time_Of_Day_Seconds and more debugging\n" + " -D, --Dump displays the raw counter values\n" + " -e, --enable [all | column]\n" + " shows all or the specified disabled column\n" + " -H, --hide [column|column,column,...]\n" + " hide the specified column(s)\n" " -i, --interval sec.subsec\n" " Override default 5-second measurement interval\n" + " -J, --Joules displays energy in Joules instead of Watts\n" " -l, --list list column headers only\n" " -n, --num_iterations num\n" " number of the measurement iterations\n" " -o, --out file\n" " create or truncate \"file\" for all output\n" " -q, --quiet skip decoding system configuration header\n" + " -s, --show [column|column,column,...]\n" + " show only the specified column(s)\n" + " -S, --Summary\n" + " limits output to 1-line system summary per interval\n" + " -T, --TCC temperature\n" + " sets the Thermal Control Circuit temperature in\n" + " degrees Celsius\n" " -h, --help print this help message\n" " -v, --version print version information\n" "\n" -- cgit v1.2.3 From 73780cd816e071b0fc0f74e204a9cb30fdb291c5 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Wed, 20 Jun 2018 13:55:29 -0400 Subject: tools/power turbostat: version 18.06.20 Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 108c3bf2a67c..4d14bbbf9b63 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5102,7 +5102,7 @@ int get_and_dump_counters(void) } void print_version() { - fprintf(outf, "turbostat version 18.06.01" + fprintf(outf, "turbostat version 18.06.20" " - Len Brown \n"); } -- cgit v1.2.3 From ce042c183bcb94eb2919e8036473a1fc203420f9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 20 Jun 2018 13:41:51 +0300 Subject: block: sed-opal: Fix a couple off by one bugs resp->num is the number of tokens in resp->tok[]. It gets set in response_parse(). So if n == resp->num then we're reading beyond the end of the data. Fixes: 455a7b238cd6 ("block: Add Sed-opal library") Reviewed-by: Scott Bauer Tested-by: Scott Bauer Signed-off-by: Dan Carpenter Signed-off-by: Jens Axboe --- block/sed-opal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index 945f4b8610e0..e0de4dd448b3 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -877,7 +877,7 @@ static size_t response_get_string(const struct parsed_resp *resp, int n, return 0; } - if (n > resp->num) { + if (n >= resp->num) { pr_debug("Response has %d tokens. Can't access %d\n", resp->num, n); return 0; @@ -916,7 +916,7 @@ static u64 response_get_u64(const struct parsed_resp *resp, int n) return 0; } - if (n > resp->num) { + if (n >= resp->num) { pr_debug("Response has %d tokens. Can't access %d\n", resp->num, n); return 0; -- cgit v1.2.3 From 7a0f9d1eb51ff25d119b48fe7cc6aa0433cd6621 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 20 Jun 2018 10:42:07 +0200 Subject: Documentation: intel_pstate: Fix typo Fix a typo in the intel_pstate admin-guide documentation. Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/pm/intel_pstate.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst index ab2fe0eda1d7..8b9164990956 100644 --- a/Documentation/admin-guide/pm/intel_pstate.rst +++ b/Documentation/admin-guide/pm/intel_pstate.rst @@ -410,7 +410,7 @@ argument is passed to the kernel in the command line. That only is supported in some configurations, though (for example, if the `HWP feature is enabled in the processor `_, the operation mode of the driver cannot be changed), and if it is not - supported in the current configuration, writes to this attribute with + supported in the current configuration, writes to this attribute will fail with an appropriate error. Interpretation of Policy Attributes -- cgit v1.2.3 From 08ba91ee6e2c1c08d3f0648f978cbb5dbf3491d8 Mon Sep 17 00:00:00 2001 From: Doron Roberts-Kedes Date: Fri, 15 Jun 2018 14:05:32 -0700 Subject: nbd: Add the nbd NBD_DISCONNECT_ON_CLOSE config flag. If NBD_DISCONNECT_ON_CLOSE is set on a device, then the driver will issue a disconnect from nbd_release if the device has no remaining bdev->bd_openers. Fix ret val so reconfigure with only setting the flag succeeds. Reviewed-by: Josef Bacik Signed-off-by: Doron Roberts-Kedes Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 42 ++++++++++++++++++++++++++++++++++-------- include/uapi/linux/nbd.h | 3 +++ 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 3b7083b8ecbb..74a05561b620 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -76,6 +76,7 @@ struct link_dead_args { #define NBD_HAS_CONFIG_REF 4 #define NBD_BOUND 5 #define NBD_DESTROY_ON_DISCONNECT 6 +#define NBD_DISCONNECT_ON_CLOSE 7 struct nbd_config { u32 flags; @@ -138,6 +139,7 @@ static void nbd_config_put(struct nbd_device *nbd); static void nbd_connect_reply(struct genl_info *info, int index); static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info); static void nbd_dead_link_work(struct work_struct *work); +static void nbd_disconnect_and_put(struct nbd_device *nbd); static inline struct device *nbd_to_dev(struct nbd_device *nbd) { @@ -1305,6 +1307,12 @@ out: static void nbd_release(struct gendisk *disk, fmode_t mode) { struct nbd_device *nbd = disk->private_data; + struct block_device *bdev = bdget_disk(disk, 0); + + if (test_bit(NBD_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) && + bdev->bd_openers == 0) + nbd_disconnect_and_put(nbd); + nbd_config_put(nbd); nbd_put(nbd); } @@ -1705,6 +1713,10 @@ again: &config->runtime_flags); put_dev = true; } + if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) { + set_bit(NBD_DISCONNECT_ON_CLOSE, + &config->runtime_flags); + } } if (info->attrs[NBD_ATTR_SOCKETS]) { @@ -1749,6 +1761,17 @@ out: return ret; } +static void nbd_disconnect_and_put(struct nbd_device *nbd) +{ + mutex_lock(&nbd->config_lock); + nbd_disconnect(nbd); + nbd_clear_sock(nbd); + mutex_unlock(&nbd->config_lock); + if (test_and_clear_bit(NBD_HAS_CONFIG_REF, + &nbd->config->runtime_flags)) + nbd_config_put(nbd); +} + static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info) { struct nbd_device *nbd; @@ -1781,13 +1804,7 @@ static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info) nbd_put(nbd); return 0; } - mutex_lock(&nbd->config_lock); - nbd_disconnect(nbd); - nbd_clear_sock(nbd); - mutex_unlock(&nbd->config_lock); - if (test_and_clear_bit(NBD_HAS_CONFIG_REF, - &nbd->config->runtime_flags)) - nbd_config_put(nbd); + nbd_disconnect_and_put(nbd); nbd_config_put(nbd); nbd_put(nbd); return 0; @@ -1798,7 +1815,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) struct nbd_device *nbd = NULL; struct nbd_config *config; int index; - int ret = -EINVAL; + int ret = 0; bool put_dev = false; if (!netlink_capable(skb, CAP_SYS_ADMIN)) @@ -1838,6 +1855,7 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) !nbd->task_recv) { dev_err(nbd_to_dev(nbd), "not configured, cannot reconfigure\n"); + ret = -EINVAL; goto out; } @@ -1862,6 +1880,14 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) &config->runtime_flags)) refcount_inc(&nbd->refs); } + + if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) { + set_bit(NBD_DISCONNECT_ON_CLOSE, + &config->runtime_flags); + } else { + clear_bit(NBD_DISCONNECT_ON_CLOSE, + &config->runtime_flags); + } } if (info->attrs[NBD_ATTR_SOCKETS]) { diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h index 85a3fb65e40a..20d6cc91435d 100644 --- a/include/uapi/linux/nbd.h +++ b/include/uapi/linux/nbd.h @@ -53,6 +53,9 @@ enum { /* These are client behavior specific flags. */ #define NBD_CFLAG_DESTROY_ON_DISCONNECT (1 << 0) /* delete the nbd device on disconnect. */ +#define NBD_CFLAG_DISCONNECT_ON_CLOSE (1 << 1) /* disconnect the nbd device on + * close by last opener. + */ /* userspace doesn't need the nbd_device structure */ -- cgit v1.2.3 From 02d62a8bc48e92171c46540722e2d52ce77d87af Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 20 Jun 2018 07:44:12 -0700 Subject: nvme-fc: release io queues to allow fast fail Rather than leaving io queues quiesced after tearing down an association, restart them. This allows ios to be replayed, with fastfail ios terminating and non-fastfail getting into loops of retry. This follows rdma's lead. Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b528a2f5826c..41d45a1b5c62 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2790,6 +2790,9 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) /* re-enable the admin_q so anything new can fast fail */ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + /* resume the io queues so that things will fast fail */ + nvme_start_queues(&ctrl->ctrl); + nvme_fc_ctlr_inactive_on_rport(ctrl); } @@ -2804,9 +2807,6 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) * waiting for io to terminate */ nvme_fc_delete_association(ctrl); - - /* resume the io queues so that things will fast fail */ - nvme_start_queues(nctrl); } static void -- cgit v1.2.3 From ba56bc3a0786992755e6804fbcbdc60ef6cfc24c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 1 Jun 2018 17:06:28 +0200 Subject: KVM: arm/arm64: Drop resource size check for GICV window When booting a 64 KB pages kernel on a ACPI GICv3 system that implements support for v2 emulation, the following warning is produced GICV size 0x2000 not a multiple of page size 0x10000 and support for v2 emulation is disabled, preventing GICv2 VMs from being able to run on such hosts. The reason is that vgic_v3_probe() performs a sanity check on the size of the window (it should be a multiple of the page size), while the ACPI MADT parsing code hardcodes the size of the window to 8 KB. This makes sense, considering that ACPI does not bother to describe the size in the first place, under the assumption that platforms implementing ACPI will follow the architecture and not put anything else in the same 64 KB window. So let's just drop the sanity check altogether, and assume that the window is at least 64 KB in size. Fixes: 909777324588 ("KVM: arm/arm64: vgic-new: vgic_init: implement kvm_vgic_hyp_init") Signed-off-by: Ard Biesheuvel Signed-off-by: Marc Zyngier --- virt/kvm/arm/vgic/vgic-v3.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index ff7dc890941a..cdce653e3c47 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -617,11 +617,6 @@ int vgic_v3_probe(const struct gic_kvm_info *info) pr_warn("GICV physical address 0x%llx not page aligned\n", (unsigned long long)info->vcpu.start); kvm_vgic_global_state.vcpu_base = 0; - } else if (!PAGE_ALIGNED(resource_size(&info->vcpu))) { - pr_warn("GICV size 0x%llx not a multiple of page size 0x%lx\n", - (unsigned long long)resource_size(&info->vcpu), - PAGE_SIZE); - kvm_vgic_global_state.vcpu_base = 0; } else { kvm_vgic_global_state.vcpu_base = info->vcpu.start; kvm_vgic_global_state.can_emulate_gicv2 = true; -- cgit v1.2.3 From 6ebdf4db8fa564a150f46d32178af0873eb5abbb Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 15 Jun 2018 16:47:23 +0100 Subject: arm64: Introduce sysreg_clear_set() Currently we have a couple of helpers to manipulate bits in particular sysregs: * config_sctlr_el1(u32 clear, u32 set) * change_cpacr(u64 val, u64 mask) The parameters of these differ in naming convention, order, and size, which is unfortunate. They also differ slightly in behaviour, as change_cpacr() skips the sysreg write if the bits are unchanged, which is a useful optimization when sysreg writes are expensive. Before we gain yet another sysreg manipulation function, let's unify these with a common helper, providing a consistent order for clear/set operands, and the write skipping behaviour from change_cpacr(). Code will be migrated to the new helper in subsequent patches. Signed-off-by: Mark Rutland Reviewed-by: Dave Martin Acked-by: Catalin Marinas Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/sysreg.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 6171178075dc..a8f84812c6e8 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -728,6 +728,17 @@ asm( asm volatile("msr_s " __stringify(r) ", %x0" : : "rZ" (__val)); \ } while (0) +/* + * Modify bits in a sysreg. Bits in the clear mask are zeroed, then bits in the + * set mask are set. Other bits are left as-is. + */ +#define sysreg_clear_set(sysreg, clear, set) do { \ + u64 __scs_val = read_sysreg(sysreg); \ + u64 __scs_new = (__scs_val & ~(u64)(clear)) | (set); \ + if (__scs_new != __scs_val) \ + write_sysreg(__scs_new, sysreg); \ +} while (0) + static inline void config_sctlr_el1(u32 clear, u32 set) { u32 val; -- cgit v1.2.3 From b045e4d0f392cbdab2674b0aa78c8d2b187e4e27 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 15 Jun 2018 16:47:24 +0100 Subject: KVM: arm64: Don't mask softirq with IRQs disabled in vcpu_put() Commit e6b673b ("KVM: arm64: Optimise FPSIMD handling to reduce guest/host thrashing") introduces a specific helper kvm_arch_vcpu_put_fp() for saving the vcpu FPSIMD state during vcpu_put(). This function uses local_bh_disable()/_enable() to protect the FPSIMD context manipulation from interruption by softirqs. This approach is not correct, because vcpu_put() can be invoked either from the KVM host vcpu thread (when exiting the vcpu run loop), or via a preempt notifier. In the former case, only preemption is disabled. In the latter case, the function is called from inside __schedule(), which means that IRQs are disabled. Use of local_bh_disable()/_enable() with IRQs disabled is considerd an error, resulting in lockdep splats while running VMs if lockdep is enabled. This patch disables IRQs instead of attempting to disable softirqs, avoiding the problem of calling local_bh_enable() with IRQs disabled in the __schedule() path. This creates an additional interrupt blackout during vcpu run loop exit, but this is the rare case and the blackout latency is still less than that of __schedule(). Fixes: e6b673b741ea ("KVM: arm64: Optimise FPSIMD handling to reduce guest/host thrashing") Reported-by: Andre Przywara Signed-off-by: Dave Martin Signed-off-by: Marc Zyngier --- arch/arm64/kvm/fpsimd.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index dc6ecfa5a2d2..f9d09318b8db 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -5,7 +5,7 @@ * Copyright 2018 Arm Limited * Author: Dave Martin */ -#include +#include #include #include #include @@ -92,7 +92,9 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) */ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) { - local_bh_disable(); + unsigned long flags; + + local_irq_save(flags); update_thread_flag(TIF_SVE, vcpu->arch.flags & KVM_ARM64_HOST_SVE_IN_USE); @@ -106,5 +108,5 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) fpsimd_bind_task_to_cpu(); } - local_bh_enable(); + local_irq_restore(flags); } -- cgit v1.2.3 From b3eb56b629d1095dde56fa37f4d7bcd5f783c8b2 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 15 Jun 2018 16:47:25 +0100 Subject: KVM: arm64/sve: Fix SVE trap restoration for non-current tasks Commit e6b673b ("KVM: arm64: Optimise FPSIMD handling to reduce guest/host thrashing") attempts to restore the configuration of userspace SVE trapping via a call to fpsimd_bind_task_to_cpu(), but the logic for determining when to do this is not correct. The patch makes the errnoenous assumption that the only task that may try to enter userspace with the currently loaded FPSIMD/SVE register content is current. This may not be the case however: if some other user task T is scheduled on the CPU during the execution of the KVM run loop, and the vcpu does not try to use the registers in the meantime, then T's state may be left there intact. If T happens to be the next task to enter userspace on this CPU then the hooks for reloading the register state and configuring traps will be skipped. (Also, current never has SVE state at this point anyway and should always have the trap enabled, as a side-effect of the ioctl() syscall needed to reach the KVM run loop in the first place.) This patch instead restores the state of the EL0 trap from the state observed at the most recent vcpu_load(), ensuring that the trap is set correctly for the loaded context (if any). Fixes: e6b673b741ea ("KVM: arm64: Optimise FPSIMD handling to reduce guest/host thrashing") Signed-off-by: Dave Martin Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/fpsimd.c | 24 ++++++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index fda9a8ca48be..fe8777b12f86 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -306,6 +306,7 @@ struct kvm_vcpu_arch { #define KVM_ARM64_FP_ENABLED (1 << 1) /* guest FP regs loaded */ #define KVM_ARM64_FP_HOST (1 << 2) /* host FP regs loaded */ #define KVM_ARM64_HOST_SVE_IN_USE (1 << 3) /* backup for host TIF_SVE */ +#define KVM_ARM64_HOST_SVE_ENABLED (1 << 4) /* SVE enabled for EL0 */ #define vcpu_gp_regs(v) (&(v)->arch.ctxt.gp_regs) diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index f9d09318b8db..98d19d1afa50 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -12,6 +12,7 @@ #include #include #include +#include /* * Called on entry to KVM_RUN unless this vcpu previously ran at least @@ -61,10 +62,16 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) { BUG_ON(!current->mm); - vcpu->arch.flags &= ~(KVM_ARM64_FP_ENABLED | KVM_ARM64_HOST_SVE_IN_USE); + vcpu->arch.flags &= ~(KVM_ARM64_FP_ENABLED | + KVM_ARM64_HOST_SVE_IN_USE | + KVM_ARM64_HOST_SVE_ENABLED); vcpu->arch.flags |= KVM_ARM64_FP_HOST; + if (test_thread_flag(TIF_SVE)) vcpu->arch.flags |= KVM_ARM64_HOST_SVE_IN_USE; + + if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) + vcpu->arch.flags |= KVM_ARM64_HOST_SVE_ENABLED; } /* @@ -103,9 +110,18 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) /* Clean guest FP state to memory and invalidate cpu view */ fpsimd_save(); fpsimd_flush_cpu_state(); - } else if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { - /* Ensure user trap controls are correctly restored */ - fpsimd_bind_task_to_cpu(); + } else if (system_supports_sve()) { + /* + * The FPSIMD/SVE state in the CPU has not been touched, and we + * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been + * reset to CPACR_EL1_DEFAULT by the Hyp code, disabling SVE + * for EL0. To avoid spurious traps, restore the trap state + * seen by kvm_arch_vcpu_load_fp(): + */ + if (vcpu->arch.flags & KVM_ARM64_HOST_SVE_ENABLED) + sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_ZEN_EL0EN); + else + sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0); } local_irq_restore(flags); -- cgit v1.2.3 From 2955bcc8c309bb8f2c773db4798649aa802a491f Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 15 Jun 2018 16:47:26 +0100 Subject: KVM: arm64: Avoid mistaken attempts to save SVE state for vcpus Commit e6b673b ("KVM: arm64: Optimise FPSIMD handling to reduce guest/host thrashing") uses fpsimd_save() to save the FPSIMD state for a vcpu when scheduling the vcpu out. However, currently current's value of TIF_SVE is restored before calling fpsimd_save() which means that fpsimd_save() may erroneously attempt to save SVE state from the vcpu. This enables current's vector state to be polluted with guest data. current->thread.sve_state may be unallocated or not large enough, so this can also trigger a NULL dereference or buffer overrun. Instead of this, TIF_SVE should be configured properly for the guest when calling fpsimd_save() with the vcpu context loaded. This patch ensures this by delaying restoration of current's TIF_SVE until after the call to fpsimd_save(). Fixes: e6b673b741ea ("KVM: arm64: Optimise FPSIMD handling to reduce guest/host thrashing") Signed-off-by: Dave Martin Signed-off-by: Marc Zyngier --- arch/arm64/kvm/fpsimd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 98d19d1afa50..aac7808ce216 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -103,9 +103,6 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) local_irq_save(flags); - update_thread_flag(TIF_SVE, - vcpu->arch.flags & KVM_ARM64_HOST_SVE_IN_USE); - if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) { /* Clean guest FP state to memory and invalidate cpu view */ fpsimd_save(); @@ -124,5 +121,8 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0); } + update_thread_flag(TIF_SVE, + vcpu->arch.flags & KVM_ARM64_HOST_SVE_IN_USE); + local_irq_restore(flags); } -- cgit v1.2.3 From e8b92efa629dac0e70ea4145c5e70616de5f89c8 Mon Sep 17 00:00:00 2001 From: Maciej Purski Date: Tue, 23 Jan 2018 12:17:19 +0100 Subject: drm/bridge/sii8620: fix display of packed pixel modes in MHL2 Currently packed pixel modes in MHL2 can't be displayed. The device automatically recognizes output format, so setting format other than RGB causes failure. Fix it by writing proper values to registers. Tested on MHL1 and MHL2 using various vendors' dongles both in DVI and HDMI mode. Signed-off-by: Maciej Purski Signed-off-by: Andrzej Hajda Link: https://patchwork.freedesktop.org/patch/msgid/1516706239-9104-1-git-send-email-m.purski@samsung.com --- drivers/gpu/drm/bridge/sil-sii8620.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/drivers/gpu/drm/bridge/sil-sii8620.c b/drivers/gpu/drm/bridge/sil-sii8620.c index 4a3deeda065c..250effa0e6b8 100644 --- a/drivers/gpu/drm/bridge/sil-sii8620.c +++ b/drivers/gpu/drm/bridge/sil-sii8620.c @@ -1017,20 +1017,11 @@ static void sii8620_stop_video(struct sii8620 *ctx) static void sii8620_set_format(struct sii8620 *ctx) { - u8 out_fmt; - if (sii8620_is_mhl3(ctx)) { sii8620_setbits(ctx, REG_M3_P0CTRL, BIT_M3_P0CTRL_MHL3_P0_PIXEL_MODE_PACKED, ctx->use_packed_pixel ? ~0 : 0); } else { - if (ctx->use_packed_pixel) - sii8620_write_seq_static(ctx, - REG_VID_MODE, BIT_VID_MODE_M1080P, - REG_MHL_TOP_CTL, BIT_MHL_TOP_CTL_MHL_PP_SEL | 1, - REG_MHLTX_CTL6, 0x60 - ); - else sii8620_write_seq_static(ctx, REG_VID_MODE, 0, REG_MHL_TOP_CTL, 1, @@ -1038,15 +1029,9 @@ static void sii8620_set_format(struct sii8620 *ctx) ); } - if (ctx->use_packed_pixel) - out_fmt = VAL_TPI_FORMAT(YCBCR422, FULL) | - BIT_TPI_OUTPUT_CSCMODE709; - else - out_fmt = VAL_TPI_FORMAT(RGB, FULL); - sii8620_write_seq(ctx, REG_TPI_INPUT, VAL_TPI_FORMAT(RGB, FULL), - REG_TPI_OUTPUT, out_fmt, + REG_TPI_OUTPUT, VAL_TPI_FORMAT(RGB, FULL), ); } -- cgit v1.2.3 From 74899d92e66663dc7671a8017b3146dcd4735f3b Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 21 Jun 2018 10:43:31 +0200 Subject: x86/xen: Add call of speculative_store_bypass_ht_init() to PV paths Commit: 1f50ddb4f418 ("x86/speculation: Handle HT correctly on AMD") ... added speculative_store_bypass_ht_init() to the per-CPU initialization sequence. speculative_store_bypass_ht_init() needs to be called on each CPU for PV guests, too. Reported-by: Brian Woods Tested-by: Brian Woods Signed-off-by: Juergen Gross Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: boris.ostrovsky@oracle.com Cc: xen-devel@lists.xenproject.org Fixes: 1f50ddb4f4189243c05926b842dc1a0332195f31 ("x86/speculation: Handle HT correctly on AMD") Link: https://lore.kernel.org/lkml/20180621084331.21228-1-jgross@suse.com Signed-off-by: Ingo Molnar --- arch/x86/xen/smp_pv.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 2e20ae2fa2d6..e3b18ad49889 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -32,6 +32,7 @@ #include #include +#include #include #include @@ -70,6 +71,8 @@ static void cpu_bringup(void) cpu_data(cpu).x86_max_cores = 1; set_cpu_sibling_map(cpu); + speculative_store_bypass_ht_init(); + xen_setup_cpu_clockevents(); notify_cpu_starting(cpu); @@ -250,6 +253,8 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus) } set_cpu_sibling_map(0); + speculative_store_bypass_ht_init(); + xen_pmu_init(0); if (xen_smp_intr_init(0) || xen_smp_intr_init_pv(0)) -- cgit v1.2.3 From e7c9996bb3dedb5553e6b34e6dbbed210a72f3e1 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 19 Jun 2018 09:32:28 -0400 Subject: rseq/selftests: Make run_param_test.sh executable The executable bit of the run_param_test.sh script got lost in the merge. Signed-off-by: Mathieu Desnoyers Cc: Andrew Hunter Cc: Andrew Morton Cc: Andy Lutomirski Cc: Ben Maurer Cc: Boqun Feng Cc: Catalin Marinas Cc: Chris Lameter Cc: Dave Watson Cc: Joel Fernandes Cc: Josh Triplett Cc: Linus Torvalds Cc: Michael Kerrisk Cc: Paul E . McKenney Cc: Paul Turner Cc: Peter Zijlstra Cc: Russell King Cc: Shuah Khan Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-api@vger.kernel.org Cc: linux-kselftest@vger.kernel.org Link: https://lore.kernel.org/lkml/20180619133230.4087-2-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- tools/testing/selftests/rseq/run_param_test.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tools/testing/selftests/rseq/run_param_test.sh diff --git a/tools/testing/selftests/rseq/run_param_test.sh b/tools/testing/selftests/rseq/run_param_test.sh old mode 100644 new mode 100755 -- cgit v1.2.3 From 0ea73d5e286193be4dec70d04021d6005b5b1771 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 19 Jun 2018 09:32:29 -0400 Subject: rseq/selftests/arm: Align 'struct rseq_cs' on 32 bytes uapi/linux/rseq.h aligns 'struct rseq_cs' on 32 bytes. Satisfy this alignment requirement in its definition within the rseq-arm.h inline assembly as well. Signed-off-by: Mathieu Desnoyers Cc: Andrew Hunter Cc: Andrew Morton Cc: Andy Lutomirski Cc: Ben Maurer Cc: Boqun Feng Cc: Catalin Marinas Cc: Chris Lameter Cc: Dave Watson Cc: Joel Fernandes Cc: Josh Triplett Cc: Linus Torvalds Cc: Michael Kerrisk Cc: Paul E . McKenney Cc: Paul Turner Cc: Peter Zijlstra Cc: Russell King Cc: Shuah Khan Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-api@vger.kernel.org Cc: linux-kselftest@vger.kernel.org Link: https://lore.kernel.org/lkml/20180619133230.4087-3-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- tools/testing/selftests/rseq/rseq-arm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/rseq/rseq-arm.h b/tools/testing/selftests/rseq/rseq-arm.h index 3b055f9aeaab..3cea19877227 100644 --- a/tools/testing/selftests/rseq/rseq-arm.h +++ b/tools/testing/selftests/rseq/rseq-arm.h @@ -57,6 +57,7 @@ do { \ #define __RSEQ_ASM_DEFINE_ABORT(table_label, label, teardown, \ abort_label, version, flags, \ start_ip, post_commit_offset, abort_ip) \ + ".balign 32\n\t" \ __rseq_str(table_label) ":\n\t" \ ".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \ ".word " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) ", 0x0\n\t" \ -- cgit v1.2.3 From 9a789fcfe8605417f7a1a970355f5efa4fe88c64 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 19 Jun 2018 09:32:30 -0400 Subject: rseq/cleanup: Do not abort rseq c.s. in child on fork() Considering that we explicitly forbid system calls in rseq critical sections, it is not valid to issue a fork or clone system call within a rseq critical section, so rseq_fork() is not required to restart an active rseq c.s. in the child process. Signed-off-by: Mathieu Desnoyers Cc: Andrew Morton Cc: Andy Lutomirski Cc: Ben Maurer Cc: Boqun Feng Cc: Catalin Marinas Cc: Chris Lameter Cc: Dave Watson Cc: Joel Fernandes Cc: Josh Triplett Cc: Linus Torvalds Cc: Michael Kerrisk Cc: Paul E . McKenney Cc: Paul Turner Cc: Peter Zijlstra Cc: Russell King Cc: Shuah Khan Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-api@vger.kernel.org Cc: linux-kselftest@vger.kernel.org Link: https://lore.kernel.org/lkml/20180619133230.4087-4-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 87bf02d93a27..c1882643d455 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1831,9 +1831,7 @@ static inline void rseq_migrate(struct task_struct *t) /* * If parent process has a registered restartable sequences area, the - * child inherits. Only applies when forking a process, not a thread. In - * case a parent fork() in the middle of a restartable sequence, set the - * resume notifier to force the child to retry. + * child inherits. Only applies when forking a process, not a thread. */ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { @@ -1847,7 +1845,6 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) t->rseq_len = current->rseq_len; t->rseq_sig = current->rseq_sig; t->rseq_event_mask = current->rseq_event_mask; - rseq_preempt(t); } } -- cgit v1.2.3 From 47a91b7232fa25abc7e0b7fc1c69ae4f81061594 Mon Sep 17 00:00:00 2001 From: Jia He Date: Mon, 21 May 2018 11:05:30 +0800 Subject: KVM: arm/arm64: add WARN_ON if size is not PAGE_SIZE aligned in unmap_stage2_range There is a panic in armv8a server(QDF2400) under memory pressure tests (start 20 guests and run memhog in the host). ---------------------------------begin-------------------------------- [35380.800950] BUG: Bad page state in process qemu-kvm pfn:dd0b6 [35380.805825] page:ffff7fe003742d80 count:-4871 mapcount:-2126053375 mapping: (null) index:0x0 [35380.815024] flags: 0x1fffc00000000000() [35380.818845] raw: 1fffc00000000000 0000000000000000 0000000000000000 ffffecf981470000 [35380.826569] raw: dead000000000100 dead000000000200 ffff8017c001c000 0000000000000000 [35380.805825] page:ffff7fe003742d80 count:-4871 mapcount:-2126053375 mapping: (null) index:0x0 [35380.815024] flags: 0x1fffc00000000000() [35380.818845] raw: 1fffc00000000000 0000000000000000 0000000000000000 ffffecf981470000 [35380.826569] raw: dead000000000100 dead000000000200 ffff8017c001c000 0000000000000000 [35380.834294] page dumped because: nonzero _refcount [...] --------------------------------end-------------------------------------- The root cause might be what was fixed at [1]. But from the KVM points of view, it would be better if the issue was caught earlier. If the size is not PAGE_SIZE aligned, unmap_stage2_range might unmap the wrong(more or less) page range. Hence it caused the "BUG: Bad page state" Let's WARN in that case, so that the issue is obvious. [1] https://lkml.org/lkml/2018/5/3/1042 Reviewed-by: Suzuki K Poulose Signed-off-by: jia.he@hxt-semitech.com [maz: tidied up commit message] Signed-off-by: Marc Zyngier --- virt/kvm/arm/mmu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index 8d90de213ce9..1d90d79706bd 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -297,6 +297,8 @@ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size) phys_addr_t next; assert_spin_locked(&kvm->mmu_lock); + WARN_ON(size & ~PAGE_MASK); + pgd = kvm->arch.pgd + stage2_pgd_index(addr); do { /* -- cgit v1.2.3 From bdab125c9301a6ac538911ba68f665dfd075ec81 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 19 Jun 2018 18:43:41 +0900 Subject: Revert "kexec/purgatory: Add clean-up for purgatory directory" Reverts the following commit: b0108f9e93d0 ("kexec: purgatory: add clean-up for purgatory directory") ... which incorrectly stated that the kexec-purgatory.c and purgatory.ro files were not removed after 'make mrproper'. In fact, they are. You can confirm it after reverting it. $ make mrproper $ touch arch/x86/purgatory/kexec-purgatory.c $ touch arch/x86/purgatory/purgatory.ro $ make mrproper CLEAN arch/x86/purgatory $ ls arch/x86/purgatory/ entry64.S Makefile purgatory.c setup-x86_64.S stack.S string.c This is obvious from the build system point of view. arch/x86/Makefile adds 'arch/x86' to core-y. Hence 'make clean' descends like this: arch/x86/Kbuild -> arch/x86/purgatory/Makefile Signed-off-by: Masahiro Yamada Cc: Linus Torvalds Cc: Michal Marek Cc: Peter Zijlstra Cc: Sam Ravnborg Cc: Thomas Gleixner Link: https://lore.kernel.org/lkml/1529401422-28838-2-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Ingo Molnar --- arch/x86/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 60135cbd905c..d6f404ae3d93 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -327,7 +327,6 @@ archclean: $(Q)rm -rf $(objtree)/arch/x86_64 $(Q)$(MAKE) $(clean)=$(boot) $(Q)$(MAKE) $(clean)=arch/x86/tools - $(Q)$(MAKE) $(clean)=arch/x86/purgatory define archhelp echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' -- cgit v1.2.3 From d6605b6bbee88b74150b14f5e83a6067f5e323d2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 19 Jun 2018 18:43:42 +0900 Subject: x86/build: Remove unnecessary preparation for purgatory kexec-purgatory.c is properly generated when Kbuild descend into the arch/x86/purgatory/. Thus the 'archprepare' target is redundant. Signed-off-by: Masahiro Yamada Cc: Linus Torvalds Cc: Michal Marek Cc: Peter Zijlstra Cc: Sam Ravnborg Cc: Thomas Gleixner Link: https://lore.kernel.org/lkml/1529401422-28838-3-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Ingo Molnar --- arch/x86/Makefile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index d6f404ae3d93..4fafba5df891 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -258,11 +258,6 @@ archscripts: scripts_basic archheaders: $(Q)$(MAKE) $(build)=arch/x86/entry/syscalls all -archprepare: -ifeq ($(CONFIG_KEXEC_FILE),y) - $(Q)$(MAKE) $(build)=arch/x86/purgatory arch/x86/purgatory/kexec-purgatory.c -endif - ### # Kernel objects -- cgit v1.2.3 From 6cb2b08ff92460290979de4be91363e5d1b6cec1 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Mon, 18 Jun 2018 09:59:54 +0200 Subject: x86/pti: Don't report XenPV as vulnerable Xen PV domain kernel is not by design affected by meltdown as it's enforcing split CR3 itself. Let's not report such systems as "Vulnerable" in sysfs (we're also already forcing PTI to off in X86_HYPER_XEN_PV cases); the security of the system ultimately depends on presence of mitigation in the Hypervisor, which can't be easily detected from DomU; let's report that. Reported-and-tested-by: Mike Latimer Signed-off-by: Jiri Kosina Signed-off-by: Thomas Gleixner Acked-by: Juergen Gross Cc: Borislav Petkov Link: https://lkml.kernel.org/r/nycvar.YFH.7.76.1806180959080.6203@cbobk.fhfr.pm [ Merge the user-visible string into a single line. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/bugs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index cd0fda1fff6d..404df26b7de8 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -27,6 +27,7 @@ #include #include #include +#include static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); @@ -664,6 +665,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr if (boot_cpu_has(X86_FEATURE_PTI)) return sprintf(buf, "Mitigation: PTI\n"); + if (hypervisor_is_type(X86_HYPER_XEN_PV)) + return sprintf(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n"); + break; case X86_BUG_SPECTRE_V1: -- cgit v1.2.3 From eab6870fee877258122a042bfd99ee7908c40280 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 7 Jun 2018 09:13:48 -0700 Subject: x86/spectre_v1: Disable compiler optimizations over array_index_mask_nospec() Mark Rutland noticed that GCC optimization passes have the potential to elide necessary invocations of the array_index_mask_nospec() instruction sequence, so mark the asm() volatile. Mark explains: "The volatile will inhibit *some* cases where the compiler could lift the array_index_nospec() call out of a branch, e.g. where there are multiple invocations of array_index_nospec() with the same arguments: if (idx < foo) { idx1 = array_idx_nospec(idx, foo) do_something(idx1); } < some other code > if (idx < foo) { idx2 = array_idx_nospec(idx, foo); do_something_else(idx2); } ... since the compiler can determine that the two invocations yield the same result, and reuse the first result (likely the same register as idx was in originally) for the second branch, effectively re-writing the above as: if (idx < foo) { idx = array_idx_nospec(idx, foo); do_something(idx); } < some other code > if (idx < foo) { do_something_else(idx); } ... if we don't take the first branch, then speculatively take the second, we lose the nospec protection. There's more info on volatile asm in the GCC docs: https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Volatile " Reported-by: Mark Rutland Signed-off-by: Dan Williams Acked-by: Mark Rutland Acked-by: Thomas Gleixner Acked-by: Linus Torvalds Cc: Cc: Peter Zijlstra Fixes: babdde2698d4 ("x86: Implement array_index_mask_nospec") Link: https://lkml.kernel.org/lkml/152838798950.14521.4893346294059739135.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/barrier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 042b5e892ed1..14de0432d288 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -38,7 +38,7 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, { unsigned long mask; - asm ("cmp %1,%2; sbb %0,%0;" + asm volatile ("cmp %1,%2; sbb %0,%0;" :"=r" (mask) :"g"(size),"r" (index) :"cc"); -- cgit v1.2.3 From f642fb5864a6e3645edce6f85ffe7b44d5e9b990 Mon Sep 17 00:00:00 2001 From: "mike.travis@hpe.com" Date: Thu, 24 May 2018 15:17:12 -0500 Subject: x86/platform/UV: Add adjustable set memory block size function Add a new function to "adjust" the current fixed UV memory block size of 2GB so it can be changed to a different physical boundary. This is out of necessity so arch dependent code can accommodate specific BIOS requirements which can align these new PMEM modules at less than the default boundaries. A "set order" type of function was used to insure that the memory block size will be a power of two value without requiring a validity check. 64GB was chosen as the upper limit for memory block size values to accommodate upcoming 4PB systems which have 6 more bits of physical address space (46 becoming 52). Signed-off-by: Mike Travis Reviewed-by: Andrew Banman Cc: Andrew Morton Cc: Dimitri Sivanich Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Cc: dan.j.williams@intel.com Cc: jgross@suse.com Cc: kirill.shutemov@linux.intel.com Cc: mhocko@suse.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/lkml/20180524201711.609546602@stormcage.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 20 ++++++++++++++++---- include/linux/memory.h | 1 + 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0a400606dea0..20d8bf5fbceb 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1350,16 +1350,28 @@ int kern_addr_valid(unsigned long addr) /* Amount of ram needed to start using large blocks */ #define MEM_SIZE_FOR_LARGE_BLOCK (64UL << 30) +/* Adjustable memory block size */ +static unsigned long set_memory_block_size; +int __init set_memory_block_size_order(unsigned int order) +{ + unsigned long size = 1UL << order; + + if (size > MEM_SIZE_FOR_LARGE_BLOCK || size < MIN_MEMORY_BLOCK_SIZE) + return -EINVAL; + + set_memory_block_size = size; + return 0; +} + static unsigned long probe_memory_block_size(void) { unsigned long boot_mem_end = max_pfn << PAGE_SHIFT; unsigned long bz; - /* If this is UV system, always set 2G block size */ - if (is_uv_system()) { - bz = MAX_BLOCK_SIZE; + /* If memory block size has been set, then use it */ + bz = set_memory_block_size; + if (bz) goto done; - } /* Use regular block if RAM is smaller than MEM_SIZE_FOR_LARGE_BLOCK */ if (boot_mem_end < MEM_SIZE_FOR_LARGE_BLOCK) { diff --git a/include/linux/memory.h b/include/linux/memory.h index 31ca3e28b0eb..a6ddefc60517 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -38,6 +38,7 @@ struct memory_block { int arch_get_memory_phys_device(unsigned long start_pfn); unsigned long memory_block_size_bytes(void); +int set_memory_block_size_order(unsigned int order); /* These states are exposed to userspace as text strings in sysfs */ #define MEM_ONLINE (1<<0) /* exposed to userspace */ -- cgit v1.2.3 From bbbd2b51a2aa0d76b3676271e216cf3647773397 Mon Sep 17 00:00:00 2001 From: "mike.travis@hpe.com" Date: Thu, 24 May 2018 15:17:13 -0500 Subject: x86/platform/UV: Use new set memory block size function Add a call to the new function to "adjust" the current fixed UV memory block size of 2GB so it can be changed to a different physical boundary. This accommodates changes in the Intel BIOS, and therefore UV BIOS, which now can align boundaries different than the previous UV standard of 2GB. It also flags any UV Global Address boundaries from BIOS that cause a change in the mem block size (boundary). The current boundary of 2GB has been used on UV since the first system release in 2009 with Linux 2.6 and has worked fine. But the new NVDIMM persistent memory modules (PMEM), along with the Intel BIOS changes to support these modules caused the memory block size boundary to be set to a lower limit. Intel only guarantees that this minimum boundary at 64MB though the current Linux limit is 128MB. Note that the default remains 2GB if no changes occur. Signed-off-by: Mike Travis Reviewed-by: Andrew Banman Cc: Andrew Morton Cc: Dimitri Sivanich Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Cc: dan.j.williams@intel.com Cc: jgross@suse.com Cc: kirill.shutemov@linux.intel.com Cc: mhocko@suse.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/lkml/20180524201711.732785782@stormcage.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 49 +++++++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index efaf2d4f9c3c..2270a777d647 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -392,6 +393,40 @@ extern int uv_hub_info_version(void) } EXPORT_SYMBOL(uv_hub_info_version); +/* Default UV memory block size is 2GB */ +static unsigned long mem_block_size = (2UL << 30); + +static __init int adj_blksize(u32 lgre) +{ + unsigned long base = (unsigned long)lgre << UV_GAM_RANGE_SHFT; + unsigned long size; + + for (size = mem_block_size; size > MIN_MEMORY_BLOCK_SIZE; size >>= 1) + if (IS_ALIGNED(base, size)) + break; + + if (size >= mem_block_size) + return 0; + + mem_block_size = size; + return 1; +} + +static __init void set_block_size(void) +{ + unsigned int order = ffs(mem_block_size); + + if (order) { + /* adjust for ffs return of 1..64 */ + set_memory_block_size_order(order - 1); + pr_info("UV: mem_block_size set to 0x%lx\n", mem_block_size); + } else { + /* bad or zero value, default to 1UL << 31 (2GB) */ + pr_err("UV: mem_block_size error with 0x%lx\n", mem_block_size); + set_memory_block_size_order(31); + } +} + /* Build GAM range lookup table: */ static __init void build_uv_gr_table(void) { @@ -1180,23 +1215,30 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) << UV_GAM_RANGE_SHFT); int order = 0; char suffix[] = " KMGTPE"; + int flag = ' '; while (size > 9999 && order < sizeof(suffix)) { size /= 1024; order++; } + /* adjust max block size to current range start */ + if (gre->type == 1 || gre->type == 2) + if (adj_blksize(lgre)) + flag = '*'; + if (!index) { pr_info("UV: GAM Range Table...\n"); - pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN"); + pr_info("UV: # %20s %14s %6s %4s %5s %3s %2s\n", "Range", "", "Size", "Type", "NASID", "SID", "PN"); } - pr_info("UV: %2d: 0x%014lx-0x%014lx %5lu%c %3d %04x %02x %02x\n", + pr_info("UV: %2d: 0x%014lx-0x%014lx%c %5lu%c %3d %04x %02x %02x\n", index++, (unsigned long)lgre << UV_GAM_RANGE_SHFT, (unsigned long)gre->limit << UV_GAM_RANGE_SHFT, - size, suffix[order], + flag, size, suffix[order], gre->type, gre->nasid, gre->sockid, gre->pnode); + /* update to next range start */ lgre = gre->limit; if (sock_min > gre->sockid) sock_min = gre->sockid; @@ -1427,6 +1469,7 @@ static void __init uv_system_init_hub(void) build_socket_tables(); build_uv_gr_table(); + set_block_size(); uv_init_hub_info(&hub_info); uv_possible_blades = num_possible_nodes(); if (!_node_to_pnode) -- cgit v1.2.3 From d7609f4210cb716c11abfe2bfb5997191095d00b Mon Sep 17 00:00:00 2001 From: "mike.travis@hpe.com" Date: Thu, 24 May 2018 15:17:14 -0500 Subject: x86/platform/UV: Add kernel parameter to set memory block size Add a kernel parameter that allows setting UV memory block size. This is to provide an adjustment for new forms of PMEM and other DIMM memory that might require alignment restrictions other than scanning the global address table for the required minimum alignment. The value set will be further adjusted by both the GAM range table scan as well as restrictions imposed by set_memory_block_size_order(). Signed-off-by: Mike Travis Reviewed-by: Andrew Banman Cc: Andrew Morton Cc: Dimitri Sivanich Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russ Anderson Cc: Thomas Gleixner Cc: dan.j.williams@intel.com Cc: jgross@suse.com Cc: kirill.shutemov@linux.intel.com Cc: mhocko@suse.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/lkml/20180524201711.854849120@stormcage.americas.sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 2270a777d647..d492752f79e1 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -396,6 +396,17 @@ EXPORT_SYMBOL(uv_hub_info_version); /* Default UV memory block size is 2GB */ static unsigned long mem_block_size = (2UL << 30); +/* Kernel parameter to specify UV mem block size */ +static int parse_mem_block_size(char *ptr) +{ + unsigned long size = memparse(ptr, NULL); + + /* Size will be rounded down by set_block_size() below */ + mem_block_size = size; + return 0; +} +early_param("uv_memblksize", parse_mem_block_size); + static __init int adj_blksize(u32 lgre) { unsigned long base = (unsigned long)lgre << UV_GAM_RANGE_SHFT; -- cgit v1.2.3 From 9f9cafc14016f23f982d3ce18f9057923bd3037a Mon Sep 17 00:00:00 2001 From: Jianchao Wang Date: Wed, 20 Jun 2018 13:42:22 +0800 Subject: nvme-pci: move nvme_kill_queues to nvme_remove_dead_ctrl There is race between nvme_remove and nvme_reset_work that can lead to io hang. nvme_remove nvme_reset_work -> nvme_remove_dead_ctrl -> nvme_dev_disable -> quiesce request_queue -> queue remove_work -> cancel_work_sync reset_work -> nvme_remove_namespaces -> splice ctrl->namespaces nvme_remove_dead_ctrl_work -> nvme_kill_queues -> nvme_ns_remove do nothing -> blk_cleanup_queue -> blk_freeze_queue Finally, the request_queue is quiesced state when wait freeze, we will get io hang here. To fix it, move the nvme_kill_queues from nvme_remove_dead_ctrl_work to nvme_remove_dead_ctrl. Suggested-by: Keith Busch Signed-off-by: Jianchao Wang Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index fc33804662e7..73a97fcea364 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2289,6 +2289,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) nvme_get_ctrl(&dev->ctrl); nvme_dev_disable(dev, false); + nvme_kill_queues(&dev->ctrl); if (!queue_work(nvme_wq, &dev->remove_work)) nvme_put_ctrl(&dev->ctrl); } @@ -2405,7 +2406,6 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work) struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work); struct pci_dev *pdev = to_pci_dev(dev->dev); - nvme_kill_queues(&dev->ctrl); if (pci_get_drvdata(pdev)) device_release_driver(&pdev->dev); nvme_put_ctrl(&dev->ctrl); -- cgit v1.2.3 From 90718e32e1dcc2479acfa208ccfc6442850b594c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 18 May 2018 18:27:39 +0200 Subject: uprobes/x86: Remove incorrect WARN_ON() in uprobe_init_insn() insn_get_length() has the side-effect of processing the entire instruction but only if it was decoded successfully, otherwise insn_complete() can fail and in this case we need to just return an error without warning. Reported-by: syzbot+30d675e3ca03c1c351e7@syzkaller.appspotmail.com Signed-off-by: Oleg Nesterov Reviewed-by: Masami Hiramatsu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: syzkaller-bugs@googlegroups.com Link: https://lkml.kernel.org/lkml/20180518162739.GA5559@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 58d8d800875d..deb576b23b7c 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -293,7 +293,7 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool insn_init(insn, auprobe->insn, sizeof(auprobe->insn), x86_64); /* has the side-effect of processing the entire instruction */ insn_get_length(insn); - if (WARN_ON_ONCE(!insn_complete(insn))) + if (!insn_complete(insn)) return -ENOEXEC; if (is_prefix_bad(insn)) -- cgit v1.2.3 From 8730662d7b2582f65dd6c59ab1e0b7fa461c79b0 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 24 Apr 2018 14:22:38 -0700 Subject: kernel.h: Fix a typo in comment Signed-off-by: Wei Wang Cc: Andrew Morton Cc: Borislav Petkov Cc: Crt Mori Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Randy Dunlap Cc: Steven Rostedt Cc: Thomas Gleixner Cc: gregkh@linuxfoundation.org Cc: wei.vince.wang@gmail.com Link: https://lkml.kernel.org/lkml/20180424212241.16013-1-wvw@google.com Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d23123238534..941dc0a5a877 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -666,7 +666,7 @@ do { \ * your code. (Extra memory is used for special buffers that are * allocated when trace_printk() is used.) * - * A little optization trick is done here. If there's only one + * A little optimization trick is done here. If there's only one * argument, there's no need to scan the string for printf formats. * The trace_puts() will suffice. But how can we take advantage of * using trace_puts() when trace_printk() has only one argument? -- cgit v1.2.3 From 7ddfd3e0df29106c728dda2a6bd6591ee43a4e3c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 17 Jun 2018 10:16:21 +0100 Subject: KVM: Enforce error in ioctl for compat tasks when !KVM_COMPAT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current behaviour of the compat ioctls is a bit odd. We provide a compat_ioctl method when KVM_COMPAT is set, and NULL otherwise. But NULL means that the normal, non-compat ioctl should be used directly for compat tasks, and there is no way to actually prevent a compat task from issueing KVM ioctls. This patch changes this behaviour, by always registering a compat_ioctl method, even if KVM_COMPAT is not selected. In that case, the callback will always return -EINVAL. Fixes: de8e5d744051568c8aad ("KVM: Disable compat ioctl for s390") Reported-by: Mark Rutland Acked-by: Christian Borntraeger Acked-by: Radim Krčmář Signed-off-by: Marc Zyngier --- virt/kvm/kvm_main.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ada21f47f22b..8b47507faab5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -116,6 +116,11 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, #ifdef CONFIG_KVM_COMPAT static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, unsigned long arg); +#define KVM_COMPAT(c) .compat_ioctl = (c) +#else +static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl, + unsigned long arg) { return -EINVAL; } +#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl #endif static int hardware_enable_all(void); static void hardware_disable_all(void); @@ -2396,11 +2401,9 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp) static struct file_operations kvm_vcpu_fops = { .release = kvm_vcpu_release, .unlocked_ioctl = kvm_vcpu_ioctl, -#ifdef CONFIG_KVM_COMPAT - .compat_ioctl = kvm_vcpu_compat_ioctl, -#endif .mmap = kvm_vcpu_mmap, .llseek = noop_llseek, + KVM_COMPAT(kvm_vcpu_compat_ioctl), }; /* @@ -2824,10 +2827,8 @@ static int kvm_device_release(struct inode *inode, struct file *filp) static const struct file_operations kvm_device_fops = { .unlocked_ioctl = kvm_device_ioctl, -#ifdef CONFIG_KVM_COMPAT - .compat_ioctl = kvm_device_ioctl, -#endif .release = kvm_device_release, + KVM_COMPAT(kvm_device_ioctl), }; struct kvm_device *kvm_device_from_filp(struct file *filp) @@ -3165,10 +3166,8 @@ static long kvm_vm_compat_ioctl(struct file *filp, static struct file_operations kvm_vm_fops = { .release = kvm_vm_release, .unlocked_ioctl = kvm_vm_ioctl, -#ifdef CONFIG_KVM_COMPAT - .compat_ioctl = kvm_vm_compat_ioctl, -#endif .llseek = noop_llseek, + KVM_COMPAT(kvm_vm_compat_ioctl), }; static int kvm_dev_ioctl_create_vm(unsigned long type) @@ -3259,8 +3258,8 @@ out: static struct file_operations kvm_chardev_ops = { .unlocked_ioctl = kvm_dev_ioctl, - .compat_ioctl = kvm_dev_ioctl, .llseek = noop_llseek, + KVM_COMPAT(kvm_dev_ioctl), }; static struct miscdevice kvm_dev = { -- cgit v1.2.3 From 37b65db85f9b2fc98267eee4a18d7506492e6e8c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 17 Jun 2018 10:22:57 +0100 Subject: KVM: arm64: Prevent KVM_COMPAT from being selected There is very little point in trying to support the 32bit KVM/arm API on arm64, and this was never an anticipated use case. Let's make it clear by not selecting KVM_COMPAT. Acked-by: Mark Rutland Signed-off-by: Marc Zyngier --- virt/kvm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 72143cfaf6ec..ea434ddc8499 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -47,7 +47,7 @@ config KVM_GENERIC_DIRTYLOG_READ_PROTECT config KVM_COMPAT def_bool y - depends on KVM && COMPAT && !S390 + depends on KVM && COMPAT && !(S390 || ARM64) config HAVE_KVM_IRQ_BYPASS bool -- cgit v1.2.3 From fcc784be837714a9173b372ff9fb9b514590dad9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 4 Apr 2018 14:06:30 -0400 Subject: locking/lockdep: Do not record IRQ state within lockdep code While debugging where things were going wrong with mapping enabling/disabling interrupts with the lockdep state and actual real enabling and disabling interrupts, I had to silent the IRQ disabling/enabling in debug_check_no_locks_freed() because it was always showing up as it was called before the splat was. Use raw_local_irq_save/restore() for not only debug_check_no_locks_freed() but for all internal lockdep functions, as they hide useful information about where interrupts were used incorrectly last. Signed-off-by: Steven Rostedt (VMware) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Link: https://lkml.kernel.org/lkml/20180404140630.3f4f4c7a@gandalf.local.home Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index edcac5de7ebc..5fa4d3138bf1 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1265,11 +1265,11 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class) this.parent = NULL; this.class = class; - local_irq_save(flags); + raw_local_irq_save(flags); arch_spin_lock(&lockdep_lock); ret = __lockdep_count_forward_deps(&this); arch_spin_unlock(&lockdep_lock); - local_irq_restore(flags); + raw_local_irq_restore(flags); return ret; } @@ -1292,11 +1292,11 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) this.parent = NULL; this.class = class; - local_irq_save(flags); + raw_local_irq_save(flags); arch_spin_lock(&lockdep_lock); ret = __lockdep_count_backward_deps(&this); arch_spin_unlock(&lockdep_lock); - local_irq_restore(flags); + raw_local_irq_restore(flags); return ret; } @@ -4411,7 +4411,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) if (unlikely(!debug_locks)) return; - local_irq_save(flags); + raw_local_irq_save(flags); for (i = 0; i < curr->lockdep_depth; i++) { hlock = curr->held_locks + i; @@ -4422,7 +4422,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) print_freed_lock_bug(curr, mem_from, mem_from + mem_len, hlock); break; } - local_irq_restore(flags); + raw_local_irq_restore(flags); } EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); -- cgit v1.2.3 From 943e942e6266f22babee5efeb00f8f672fbff5bd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 21 Jun 2018 09:49:37 -0600 Subject: nvme-pci: limit max IO size and segments to avoid high order allocations nvme requires an sg table allocation for each request. If the request is large, then the allocation can become quite large. For instance, with our default software settings of 1280KB IO size, we'll need 10248 bytes of sg table. That turns into a 2nd order allocation, which we can't always guarantee. If we fail the allocation, blk-mq will retry it later. But there's no guarantee that we'll EVER be able to allocate that much contigious memory. Limit the IO size such that we never need more than a single page of memory. That's a lot faster and more reliable. Then back that allocation with a mempool, so that we know we'll always be able to succeed the allocation at some point. Signed-off-by: Jens Axboe Acked-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 1 + drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/pci.c | 42 +++++++++++++++++++++++++++++++++++++----- 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 21710a7460c8..46df030b2c3f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1808,6 +1808,7 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, u32 max_segments = (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1; + max_segments = min_not_zero(max_segments, ctrl->max_segments); blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 231807cbc849..0c4a33df3b2f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -170,6 +170,7 @@ struct nvme_ctrl { u64 cap; u32 page_size; u32 max_hw_sectors; + u32 max_segments; u16 oncs; u16 oacs; u16 nssa; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 73a97fcea364..ba943f211687 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -38,6 +38,13 @@ #define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) +/* + * These can be higher, but we need to ensure that any command doesn't + * require an sg allocation that needs more than a page of data. + */ +#define NVME_MAX_KB_SZ 4096 +#define NVME_MAX_SEGS 127 + static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0); @@ -100,6 +107,8 @@ struct nvme_dev { struct nvme_ctrl ctrl; struct completion ioq_wait; + mempool_t *iod_mempool; + /* shadow doorbell buffer support: */ u32 *dbbuf_dbs; dma_addr_t dbbuf_dbs_dma_addr; @@ -477,10 +486,7 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) iod->use_sgl = nvme_pci_use_sgls(dev, rq); if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { - size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg, - iod->use_sgl); - - iod->sg = kmalloc(alloc_size, GFP_ATOMIC); + iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC); if (!iod->sg) return BLK_STS_RESOURCE; } else { @@ -526,7 +532,7 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req) } if (iod->sg != iod->inline_sg) - kfree(iod->sg); + mempool_free(iod->sg, dev->iod_mempool); } #ifdef CONFIG_BLK_DEV_INTEGRITY @@ -2280,6 +2286,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) blk_put_queue(dev->ctrl.admin_q); kfree(dev->queues); free_opal_dev(dev->ctrl.opal_dev); + mempool_destroy(dev->iod_mempool); kfree(dev); } @@ -2334,6 +2341,13 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; + /* + * Limit the max command size to prevent iod->sg allocations going + * over a single page. + */ + dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1; + dev->ctrl.max_segments = NVME_MAX_SEGS; + result = nvme_init_identify(&dev->ctrl); if (result) goto out; @@ -2509,6 +2523,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) int node, result = -ENOMEM; struct nvme_dev *dev; unsigned long quirks = id->driver_data; + size_t alloc_size; node = dev_to_node(&pdev->dev); if (node == NUMA_NO_NODE) @@ -2546,6 +2561,23 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto release_pools; + /* + * Double check that our mempool alloc size will cover the biggest + * command we support. + */ + alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ, + NVME_MAX_SEGS, true); + WARN_ON_ONCE(alloc_size > PAGE_SIZE); + + dev->iod_mempool = mempool_create_node(1, mempool_kmalloc, + mempool_kfree, + (void *) alloc_size, + GFP_KERNEL, node); + if (!dev->iod_mempool) { + result = -ENOMEM; + goto release_pools; + } + dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); nvme_get_ctrl(&dev->ctrl); -- cgit v1.2.3 From 70303420b5721c38998cf987e6b7d30cc62d4ff1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 21 Jun 2018 13:20:53 -0400 Subject: tracing: Check for no filter when processing event filters The syzkaller detected a out-of-bounds issue with the events filter code, specifically here: prog[N].pred = NULL; /* #13 */ prog[N].target = 1; /* TRUE */ prog[N+1].pred = NULL; prog[N+1].target = 0; /* FALSE */ -> prog[N-1].target = N; prog[N-1].when_to_branch = false; As that's the first reference to a "N-1" index, it appears that the code got here with N = 0, which means the filter parser found no filter to parse (which shouldn't ever happen, but apparently it did). Add a new error to the parsing code that will check to make sure that N is not zero before going into this part of the code. If N = 0, then -EINVAL is returned, and a error message is added to the filter. Cc: stable@vger.kernel.org Fixes: 80765597bc587 ("tracing: Rewrite filter logic to be simpler and faster") Reported-by: air icy bugzilla url: https://bugzilla.kernel.org/show_bug.cgi?id=200019 Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e1c818dbc0d7..0dceb77d1d42 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -78,7 +78,8 @@ static const char * ops[] = { OPS }; C(TOO_MANY_PREDS, "Too many terms in predicate expression"), \ C(INVALID_FILTER, "Meaningless filter expression"), \ C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ - C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), + C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \ + C(NO_FILTER, "No filter found"), #undef C #define C(a, b) FILT_ERR_##a @@ -550,6 +551,13 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, goto out_free; } + if (!N) { + /* No program? */ + ret = -EINVAL; + parse_error(pe, FILT_ERR_NO_FILTER, ptr - str); + goto out_free; + } + prog[N].pred = NULL; /* #13 */ prog[N].target = 1; /* TRUE */ prog[N+1].pred = NULL; -- cgit v1.2.3 From 1a63dcd8765bc8680481dc2f9acf6ef13cee6d27 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 7 Jun 2018 13:11:43 -0700 Subject: softirq: Reorder trace_softirqs_on to prevent lockdep splat I'm able to reproduce a lockdep splat with config options: CONFIG_PROVE_LOCKING=y, CONFIG_DEBUG_LOCK_ALLOC=y and CONFIG_PREEMPTIRQ_EVENTS=y $ echo 1 > /d/tracing/events/preemptirq/preempt_enable/enable [ 26.112609] DEBUG_LOCKS_WARN_ON(current->softirqs_enabled) [ 26.112636] WARNING: CPU: 0 PID: 118 at kernel/locking/lockdep.c:3854 [...] [ 26.144229] Call Trace: [ 26.144926] [ 26.145506] lock_acquire+0x55/0x1b0 [ 26.146499] ? __do_softirq+0x46f/0x4d9 [ 26.147571] ? __do_softirq+0x46f/0x4d9 [ 26.148646] trace_preempt_on+0x8f/0x240 [ 26.149744] ? trace_preempt_on+0x4d/0x240 [ 26.150862] ? __do_softirq+0x46f/0x4d9 [ 26.151930] preempt_count_sub+0x18a/0x1a0 [ 26.152985] __do_softirq+0x46f/0x4d9 [ 26.153937] irq_exit+0x68/0xe0 [ 26.154755] smp_apic_timer_interrupt+0x271/0x280 [ 26.156056] apic_timer_interrupt+0xf/0x20 [ 26.157105] The issue was this: preempt_count = 1 << SOFTIRQ_SHIFT __local_bh_enable(cnt = 1 << SOFTIRQ_SHIFT) { if (softirq_count() == (cnt && SOFTIRQ_MASK)) { trace_softirqs_on() { current->softirqs_enabled = 1; } } preempt_count_sub(cnt) { trace_preempt_on() { tracepoint() { rcu_read_lock_sched() { // jumps into lockdep Where preempt_count still has softirqs disabled, but current->softirqs_enabled is true, and we get a splat. Link: http://lkml.kernel.org/r/20180607201143.247775-1-joel@joelfernandes.org Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Tom Zanussi Cc: Namhyung Kim Cc: Thomas Glexiner Cc: Boqun Feng Cc: Paul McKenney Cc: Masami Hiramatsu Cc: Todd Kjos Cc: Erick Reyes Cc: Julia Cartwright Cc: Byungchul Park Cc: stable@vger.kernel.org Reviewed-by: Steven Rostedt (VMware) Fixes: d59158162e032 ("tracing: Add support for preempt and irq enable/disable events") Signed-off-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- kernel/softirq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index de2f57fddc04..900dcfee542c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -139,9 +139,13 @@ static void __local_bh_enable(unsigned int cnt) { lockdep_assert_irqs_disabled(); + if (preempt_count() == cnt) + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); + if (softirq_count() == (cnt & SOFTIRQ_MASK)) trace_softirqs_on(_RET_IP_); - preempt_count_sub(cnt); + + __preempt_count_sub(cnt); } /* -- cgit v1.2.3 From 08ae88f8104f486fd4103854119169f3e55dbc4c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 9 Feb 2018 11:53:16 -0600 Subject: tracing: Use swap macro in update_max_tr Make use of the swap macro and remove unnecessary variable _buf_. This makes the code easier to read and maintain. Also, reduces the stack usage. This code was detected with the help of Coccinelle. Link: http://lkml.kernel.org/r/20180209175316.GA18720@embeddedgus Signed-off-by: Gustavo A. R. Silva Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c9336e98ac59..a0079b4c7a49 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1360,8 +1360,6 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct ring_buffer *buf; - if (tr->stop_count) return; @@ -1375,9 +1373,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) arch_spin_lock(&tr->max_lock); - buf = tr->trace_buffer.buffer; - tr->trace_buffer.buffer = tr->max_buffer.buffer; - tr->max_buffer.buffer = buf; + swap(tr->trace_buffer.buffer, tr->max_buffer.buffer); __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&tr->max_lock); -- cgit v1.2.3 From 064f35a952246c60e956717dfc5782c48f174e74 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 14 Jun 2018 15:48:59 -0700 Subject: tracing: Fix some errors in histogram documentation Fix typos, inconsistencies in using quotes, incorrect section number, etc. in the trace histogram documentation. Link: http://lkml.kernel.org/r/20180614224859.55864-1-joel@joelfernandes.org Reviewed-by: Masami Hiramatsu Acked-by: Tom Zanussi Signed-off-by: Joel Fernandes (Google) Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/histogram.txt | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/Documentation/trace/histogram.txt b/Documentation/trace/histogram.txt index e73bcf9cb5f3..7ffea6aa22e3 100644 --- a/Documentation/trace/histogram.txt +++ b/Documentation/trace/histogram.txt @@ -1729,35 +1729,35 @@ If a variable isn't a key variable or prefixed with 'vals=', the associated event field will be saved in a variable but won't be summed as a value: - # echo 'hist:keys=next_pid:ts1=common_timestamp ... >> event/trigger + # echo 'hist:keys=next_pid:ts1=common_timestamp ...' >> event/trigger Multiple variables can be assigned at the same time. The below would result in both ts0 and b being created as variables, with both common_timestamp and field1 additionally being summed as values: - # echo 'hist:keys=pid:vals=$ts0,$b:ts0=common_timestamp,b=field1 ... >> \ + # echo 'hist:keys=pid:vals=$ts0,$b:ts0=common_timestamp,b=field1 ...' >> \ event/trigger Note that variable assignments can appear either preceding or following their use. The command below behaves identically to the command above: - # echo 'hist:keys=pid:ts0=common_timestamp,b=field1:vals=$ts0,$b ... >> \ + # echo 'hist:keys=pid:ts0=common_timestamp,b=field1:vals=$ts0,$b ...' >> \ event/trigger Any number of variables not bound to a 'vals=' prefix can also be assigned by simply separating them with colons. Below is the same thing but without the values being summed in the histogram: - # echo 'hist:keys=pid:ts0=common_timestamp:b=field1 ... >> event/trigger + # echo 'hist:keys=pid:ts0=common_timestamp:b=field1 ...' >> event/trigger Variables set as above can be referenced and used in expressions on another event. For example, here's how a latency can be calculated: - # echo 'hist:keys=pid,prio:ts0=common_timestamp ... >> event1/trigger - # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp-$ts0 ... >> event2/trigger + # echo 'hist:keys=pid,prio:ts0=common_timestamp ...' >> event1/trigger + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp-$ts0 ...' >> event2/trigger In the first line above, the event's timetamp is saved into the variable ts0. In the next line, ts0 is subtracted from the second @@ -1766,7 +1766,7 @@ yet another variable, 'wakeup_lat'. The hist trigger below in turn makes use of the wakeup_lat variable to compute a combined latency using the same key and variable from yet another event: - # echo 'hist:key=pid:wakeupswitch_lat=$wakeup_lat+$switchtime_lat ... >> event3/trigger + # echo 'hist:key=pid:wakeupswitch_lat=$wakeup_lat+$switchtime_lat ...' >> event3/trigger 2.2.2 Synthetic Events ---------------------- @@ -1807,10 +1807,11 @@ the command that defined it with a '!': At this point, there isn't yet an actual 'wakeup_latency' event instantiated in the event subsytem - for this to happen, a 'hist trigger action' needs to be instantiated and bound to actual fields -and variables defined on other events (see Section 6.3.3 below). +and variables defined on other events (see Section 2.2.3 below on +how that is done using hist trigger 'onmatch' action). Once that is +done, the 'wakeup_latency' synthetic event instance is created. -Once that is done, an event instance is created, and a histogram can -be defined using it: +A histogram can now be defined for the new synthetic event: # echo 'hist:keys=pid,prio,lat.log2:sort=pid,lat' >> \ /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger @@ -1960,7 +1961,7 @@ hist trigger specification. back to that pid, the timestamp difference is calculated. If the resulting latency, stored in wakeup_lat, exceeds the current maximum latency, the values specified in the save() fields are - recoreded: + recorded: # echo 'hist:keys=pid:ts0=common_timestamp.usecs \ if comm=="cyclictest"' >> \ -- cgit v1.2.3 From ed7d40bc67b8353c677b38c6cdddcdc310c0f452 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 8 Jun 2018 14:47:46 -0700 Subject: tracing: Fix SKIP_STACK_VALIDATION=1 build due to bad merge with -mrecord-mcount Non gcc-5 builds with CONFIG_STACK_VALIDATION=y and SKIP_STACK_VALIDATION=1 fail. Example output: /bin/sh: init/.tmp_main.o: Permission denied commit 96f60dfa5819 ("trace: Use -mcount-record for dynamic ftrace"), added a mismatched endif. This causes cmd_objtool to get mistakenly set. Relocate endif to balance the newly added -record-mcount check. Link: http://lkml.kernel.org/r/20180608214746.136554-1-gthelen@google.com Fixes: 96f60dfa5819 ("trace: Use -mcount-record for dynamic ftrace") Acked-by: Andi Kleen Tested-by: David Rientjes Signed-off-by: Greg Thelen Signed-off-by: Steven Rostedt (VMware) --- scripts/Makefile.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 34d9e9ce97c2..e7889f486ca1 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -239,6 +239,7 @@ cmd_record_mcount = \ "$(CC_FLAGS_FTRACE)" ]; then \ $(sub_cmd_record_mcount) \ fi; +endif # -record-mcount endif # CONFIG_FTRACE_MCOUNT_RECORD ifdef CONFIG_STACK_VALIDATION @@ -263,7 +264,6 @@ ifneq ($(RETPOLINE_CFLAGS),) objtool_args += --retpoline endif endif -endif ifdef CONFIG_MODVERSIONS -- cgit v1.2.3 From 6cc65be4f6f2a7186af8f3e09900787c7912dad2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 21 Jun 2018 20:35:26 -0400 Subject: locking/qspinlock: Fix build for anonymous union in older GCC compilers One of my tests compiles the kernel with gcc 4.5.3, and I hit the following build error: include/linux/semaphore.h: In function 'sema_init': include/linux/semaphore.h:35:17: error: unknown field 'val' specified in initializer include/linux/semaphore.h:35:17: warning: missing braces around initializer include/linux/semaphore.h:35:17: warning: (near initialization for '(anonymous).raw_lock..val') I bisected it down to: 625e88be1f41 ("locking/qspinlock: Merge 'struct __qspinlock' into 'struct qspinlock'") ... which makes qspinlock have an anonymous union, which makes initializing it special for older compilers. By adding strategic brackets, it makes the build happy again. Signed-off-by: Steven Rostedt (VMware) Acked-by: Waiman Long Cc: Andrew Morton Cc: Boqun Feng Cc: Linus Torvalds Cc: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Fixes: 625e88be1f41 ("locking/qspinlock: Merge 'struct __qspinlock' into 'struct qspinlock'") Link: http://lkml.kernel.org/r/20180621203526.172ab5c4@vmware.local.home Signed-off-by: Ingo Molnar --- include/asm-generic/qspinlock_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/qspinlock_types.h b/include/asm-generic/qspinlock_types.h index 0763f065b975..d10f1e7d6ba8 100644 --- a/include/asm-generic/qspinlock_types.h +++ b/include/asm-generic/qspinlock_types.h @@ -63,7 +63,7 @@ typedef struct qspinlock { /* * Initializier */ -#define __ARCH_SPIN_LOCK_UNLOCKED { .val = ATOMIC_INIT(0) } +#define __ARCH_SPIN_LOCK_UNLOCKED { { .val = ATOMIC_INIT(0) } } /* * Bitfields in the atomic value: -- cgit v1.2.3 From c51b3c639e01f20559531eef3c5919feae23c55a Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 18 Jun 2018 09:36:39 +0200 Subject: xen: add new hypercall buffer mapping device For passing arbitrary data from user land to the Xen hypervisor the Xen tools today are using mlock()ed buffers. Unfortunately the kernel might change access rights of such buffers for brief periods of time e.g. for page migration or compaction, leading to access faults in the hypervisor, as the hypervisor can't use the locks of the kernel. In order to solve this problem add a new device node to the Xen privcmd driver to easily allocate hypercall buffers via mmap(). The memory is allocated in the kernel and just mapped into user space. Marked as VM_IO the user mapping will not be subject to page migration et al. Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Juergen Gross --- drivers/xen/Makefile | 2 +- drivers/xen/privcmd-buf.c | 210 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/xen/privcmd.c | 9 ++ drivers/xen/privcmd.h | 3 + 4 files changed, 223 insertions(+), 1 deletion(-) create mode 100644 drivers/xen/privcmd-buf.c diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 451e833f5931..48b154276179 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -41,4 +41,4 @@ obj-$(CONFIG_XEN_PVCALLS_FRONTEND) += pvcalls-front.o xen-evtchn-y := evtchn.o xen-gntdev-y := gntdev.o xen-gntalloc-y := gntalloc.o -xen-privcmd-y := privcmd.o +xen-privcmd-y := privcmd.o privcmd-buf.o diff --git a/drivers/xen/privcmd-buf.c b/drivers/xen/privcmd-buf.c new file mode 100644 index 000000000000..df1ed37c3269 --- /dev/null +++ b/drivers/xen/privcmd-buf.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT + +/****************************************************************************** + * privcmd-buf.c + * + * Mmap of hypercall buffers. + * + * Copyright (c) 2018 Juergen Gross + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +#include "privcmd.h" + +MODULE_LICENSE("GPL"); + +static unsigned int limit = 64; +module_param(limit, uint, 0644); +MODULE_PARM_DESC(limit, "Maximum number of pages that may be allocated by " + "the privcmd-buf device per open file"); + +struct privcmd_buf_private { + struct mutex lock; + struct list_head list; + unsigned int allocated; +}; + +struct privcmd_buf_vma_private { + struct privcmd_buf_private *file_priv; + struct list_head list; + unsigned int users; + unsigned int n_pages; + struct page *pages[]; +}; + +static int privcmd_buf_open(struct inode *ino, struct file *file) +{ + struct privcmd_buf_private *file_priv; + + file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL); + if (!file_priv) + return -ENOMEM; + + mutex_init(&file_priv->lock); + INIT_LIST_HEAD(&file_priv->list); + + file->private_data = file_priv; + + return 0; +} + +static void privcmd_buf_vmapriv_free(struct privcmd_buf_vma_private *vma_priv) +{ + unsigned int i; + + vma_priv->file_priv->allocated -= vma_priv->n_pages; + + list_del(&vma_priv->list); + + for (i = 0; i < vma_priv->n_pages; i++) + if (vma_priv->pages[i]) + __free_page(vma_priv->pages[i]); + + kfree(vma_priv); +} + +static int privcmd_buf_release(struct inode *ino, struct file *file) +{ + struct privcmd_buf_private *file_priv = file->private_data; + struct privcmd_buf_vma_private *vma_priv; + + mutex_lock(&file_priv->lock); + + while (!list_empty(&file_priv->list)) { + vma_priv = list_first_entry(&file_priv->list, + struct privcmd_buf_vma_private, + list); + privcmd_buf_vmapriv_free(vma_priv); + } + + mutex_unlock(&file_priv->lock); + + kfree(file_priv); + + return 0; +} + +static void privcmd_buf_vma_open(struct vm_area_struct *vma) +{ + struct privcmd_buf_vma_private *vma_priv = vma->vm_private_data; + + if (!vma_priv) + return; + + mutex_lock(&vma_priv->file_priv->lock); + vma_priv->users++; + mutex_unlock(&vma_priv->file_priv->lock); +} + +static void privcmd_buf_vma_close(struct vm_area_struct *vma) +{ + struct privcmd_buf_vma_private *vma_priv = vma->vm_private_data; + struct privcmd_buf_private *file_priv; + + if (!vma_priv) + return; + + file_priv = vma_priv->file_priv; + + mutex_lock(&file_priv->lock); + + vma_priv->users--; + if (!vma_priv->users) + privcmd_buf_vmapriv_free(vma_priv); + + mutex_unlock(&file_priv->lock); +} + +static vm_fault_t privcmd_buf_vma_fault(struct vm_fault *vmf) +{ + pr_debug("fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", + vmf->vma, vmf->vma->vm_start, vmf->vma->vm_end, + vmf->pgoff, (void *)vmf->address); + + return VM_FAULT_SIGBUS; +} + +static const struct vm_operations_struct privcmd_buf_vm_ops = { + .open = privcmd_buf_vma_open, + .close = privcmd_buf_vma_close, + .fault = privcmd_buf_vma_fault, +}; + +static int privcmd_buf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct privcmd_buf_private *file_priv = file->private_data; + struct privcmd_buf_vma_private *vma_priv; + unsigned long count = vma_pages(vma); + unsigned int i; + int ret = 0; + + if (!(vma->vm_flags & VM_SHARED) || count > limit || + file_priv->allocated + count > limit) + return -EINVAL; + + vma_priv = kzalloc(sizeof(*vma_priv) + count * sizeof(void *), + GFP_KERNEL); + if (!vma_priv) + return -ENOMEM; + + vma_priv->n_pages = count; + count = 0; + for (i = 0; i < vma_priv->n_pages; i++) { + vma_priv->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!vma_priv->pages[i]) + break; + count++; + } + + mutex_lock(&file_priv->lock); + + file_priv->allocated += count; + + vma_priv->file_priv = file_priv; + vma_priv->users = 1; + + vma->vm_flags |= VM_IO | VM_DONTEXPAND; + vma->vm_ops = &privcmd_buf_vm_ops; + vma->vm_private_data = vma_priv; + + list_add(&vma_priv->list, &file_priv->list); + + if (vma_priv->n_pages != count) + ret = -ENOMEM; + else + for (i = 0; i < vma_priv->n_pages; i++) { + ret = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE, + vma_priv->pages[i]); + if (ret) + break; + } + + if (ret) + privcmd_buf_vmapriv_free(vma_priv); + + mutex_unlock(&file_priv->lock); + + return ret; +} + +const struct file_operations xen_privcmdbuf_fops = { + .owner = THIS_MODULE, + .open = privcmd_buf_open, + .release = privcmd_buf_release, + .mmap = privcmd_buf_mmap, +}; +EXPORT_SYMBOL_GPL(xen_privcmdbuf_fops); + +struct miscdevice xen_privcmdbuf_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "xen/hypercall", + .fops = &xen_privcmdbuf_fops, +}; diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 8ae0349d9f0a..7e6e682104dc 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -1007,12 +1007,21 @@ static int __init privcmd_init(void) pr_err("Could not register Xen privcmd device\n"); return err; } + + err = misc_register(&xen_privcmdbuf_dev); + if (err != 0) { + pr_err("Could not register Xen hypercall-buf device\n"); + misc_deregister(&privcmd_dev); + return err; + } + return 0; } static void __exit privcmd_exit(void) { misc_deregister(&privcmd_dev); + misc_deregister(&xen_privcmdbuf_dev); } module_init(privcmd_init); diff --git a/drivers/xen/privcmd.h b/drivers/xen/privcmd.h index 14facaeed36f..0dd9f8f67ee3 100644 --- a/drivers/xen/privcmd.h +++ b/drivers/xen/privcmd.h @@ -1,3 +1,6 @@ #include extern const struct file_operations xen_privcmd_fops; +extern const struct file_operations xen_privcmdbuf_fops; + +extern struct miscdevice xen_privcmdbuf_dev; -- cgit v1.2.3 From eef04c7b3786ff0c9cb1019278b6c6c2ea0ad4ff Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 21 Jun 2018 13:29:44 -0400 Subject: xen: Remove unnecessary BUG_ON from __unbind_from_irq() Commit 910f8befdf5b ("xen/pirq: fix error path cleanup when binding MSIs") fixed a couple of errors in error cleanup path of xen_bind_pirq_msi_to_irq(). This cleanup allowed a call to __unbind_from_irq() with an unbound irq, which would result in triggering the BUG_ON there. Since there is really no reason for the BUG_ON (xen_free_irq() can operate on unbound irqs) we can remove it. Reported-by: Ben Hutchings Signed-off-by: Boris Ostrovsky Cc: stable@vger.kernel.org Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/xen/events/events_base.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 762378f1811c..08e4af04d6f2 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -628,8 +628,6 @@ static void __unbind_from_irq(unsigned int irq) xen_irq_info_cleanup(info); } - BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); - xen_free_irq(irq); } -- cgit v1.2.3 From 52e1cf2d19c2e62e6a81b8de3f7320d033917dd5 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Fri, 22 Jun 2018 08:42:22 +0200 Subject: efi/libstub/tpm: Initialize efi_physical_addr_t vars to zero for mixed mode Commit: 79832f0b5f71 ("efi/libstub/tpm: Initialize pointer variables to zero for mixed mode") fixes a problem with the tpm code on mixed mode (64-bit kernel on 32-bit UEFI), where 64-bit pointer variables are not fully initialized by the 32-bit EFI code. A similar problem applies to the efi_physical_addr_t variables which are written by the ->get_event_log() EFI call. Even though efi_physical_addr_t is 64-bit everywhere, it seems that some 32-bit UEFI implementations only fill in the lower 32 bits when passed a pointer to an efi_physical_addr_t to fill. This commit initializes these to 0 to, to ensure the upper 32 bits are 0 in mixed mode. This fixes recent kernels sometimes hanging during early boot on mixed mode UEFI systems. Signed-off-by: Hans de Goede Signed-off-by: Ard Biesheuvel Cc: # v4.16+ Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/20180622064222.11633-2-ard.biesheuvel@linaro.org Signed-off-by: Ingo Molnar --- drivers/firmware/efi/libstub/tpm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/efi/libstub/tpm.c b/drivers/firmware/efi/libstub/tpm.c index caa37a6dd9d4..a90b0b8fc69a 100644 --- a/drivers/firmware/efi/libstub/tpm.c +++ b/drivers/firmware/efi/libstub/tpm.c @@ -64,7 +64,7 @@ static void efi_retrieve_tpm2_eventlog_1_2(efi_system_table_t *sys_table_arg) efi_guid_t tcg2_guid = EFI_TCG2_PROTOCOL_GUID; efi_guid_t linux_eventlog_guid = LINUX_EFI_TPM_EVENT_LOG_GUID; efi_status_t status; - efi_physical_addr_t log_location, log_last_entry; + efi_physical_addr_t log_location = 0, log_last_entry = 0; struct linux_efi_tpm_eventlog *log_tbl = NULL; unsigned long first_entry_addr, last_entry_addr; size_t log_size, last_entry_size; -- cgit v1.2.3 From 57d6a7938a8fc6cee8420b40ca244220b41721f5 Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Thu, 8 Mar 2018 21:28:56 +0100 Subject: perf/core: Move the inline keyword at the beginning of the function declaration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building perf with W=1 the following warning triggers: CC kernel/events/ring_buffer.o kernel/events/ring_buffer.c:105:1: warning: ‘inline’ is not at beginning of declaration [-Wold-style-declaration] static bool __always_inline ^~~~~~ ... Move the inline keyword to the beginning of the function declaration. Signed-off-by: Mathieu Malaterre Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: trival@kernel.org Link: http://lkml.kernel.org/r/20180308202856.9378-1-malat@debian.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 1d8ca9ea9979..d11f60cbe8ca 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -103,7 +103,7 @@ out: preempt_enable(); } -static bool __always_inline +static __always_inline bool ring_buffer_has_space(unsigned long head, unsigned long tail, unsigned long data_size, unsigned int size, bool backward) @@ -114,7 +114,7 @@ ring_buffer_has_space(unsigned long head, unsigned long tail, return CIRC_SPACE(tail, head, data_size) >= size; } -static int __always_inline +static __always_inline int __perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, bool backward) @@ -414,7 +414,7 @@ err: } EXPORT_SYMBOL_GPL(perf_aux_output_begin); -static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb) +static __always_inline bool rb_need_aux_wakeup(struct ring_buffer *rb) { if (rb->aux_overwrite) return false; -- cgit v1.2.3 From 72a8edc2d9134c2895eac2fec5eecf8230a05c96 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 22 Jun 2018 10:52:48 +0100 Subject: genirq/debugfs: Add missing IRQCHIP_SUPPORTS_LEVEL_MSI debug Debug is missing the IRQCHIP_SUPPORTS_LEVEL_MSI debug entry, making debugfs slightly less useful. Take this opportunity to also add a missing comment in the definition of IRQCHIP_SUPPORTS_LEVEL_MSI. Fixes: 6988e0e0d283 ("genirq/msi: Limit level-triggered MSI to platform devices") Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Cc: Jason Cooper Cc: Alexandre Belloni Cc: Yang Yingliang Cc: Sumit Garg Link: https://lkml.kernel.org/r/20180622095254.5906-2-marc.zyngier@arm.com --- include/linux/irq.h | 1 + kernel/irq/debugfs.c | 1 + 2 files changed, 2 insertions(+) diff --git a/include/linux/irq.h b/include/linux/irq.h index 4bd2f34947f4..201de12a9957 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -503,6 +503,7 @@ struct irq_chip { * IRQCHIP_SKIP_SET_WAKE: Skip chip.irq_set_wake(), for this irq chip * IRQCHIP_ONESHOT_SAFE: One shot does not require mask/unmask * IRQCHIP_EOI_THREADED: Chip requires eoi() on unmask in threaded mode + * IRQCHIP_SUPPORTS_LEVEL_MSI Chip can provide two doorbells for Level MSIs */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 4dadeb3d6666..6f636136cccc 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -55,6 +55,7 @@ static const struct irq_bit_descr irqchip_flags[] = { BIT_MASK_DESCR(IRQCHIP_SKIP_SET_WAKE), BIT_MASK_DESCR(IRQCHIP_ONESHOT_SAFE), BIT_MASK_DESCR(IRQCHIP_EOI_THREADED), + BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI), }; static void -- cgit v1.2.3 From 893fbfff976cd069f2e60c3b186dbe3f85504db2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 22 Jun 2018 10:52:49 +0100 Subject: irqchip/ls-scfg-msi: Fix MSI affinity handling The ls-scfs-msi driver is not dealing with the effective affinity as it should. Let's fix that, and make it clear that the effective affinity is restricted to a single CPU. Also prevent the driver from messing with the internals of the affinity setting infrastructure. Reported-by: Alexandre Belloni Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Alexandre Belloni Cc: Jason Cooper Cc: Yang Yingliang Cc: Sumit Garg Link: https://lkml.kernel.org/r/20180622095254.5906-3-marc.zyngier@arm.com --- drivers/irqchip/irq-ls-scfg-msi.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/irqchip/irq-ls-scfg-msi.c b/drivers/irqchip/irq-ls-scfg-msi.c index 1ec3bfe56693..c671b3212010 100644 --- a/drivers/irqchip/irq-ls-scfg-msi.c +++ b/drivers/irqchip/irq-ls-scfg-msi.c @@ -93,8 +93,12 @@ static void ls_scfg_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) msg->address_lo = lower_32_bits(msi_data->msiir_addr); msg->data = data->hwirq; - if (msi_affinity_flag) - msg->data |= cpumask_first(data->common->affinity); + if (msi_affinity_flag) { + const struct cpumask *mask; + + mask = irq_data_get_effective_affinity_mask(data); + msg->data |= cpumask_first(mask); + } iommu_dma_map_msi_msg(data->irq, msg); } @@ -121,7 +125,7 @@ static int ls_scfg_msi_set_affinity(struct irq_data *irq_data, return -EINVAL; } - cpumask_copy(irq_data->common->affinity, mask); + irq_data_update_effective_affinity(irq_data, cpumask_of(cpu)); return IRQ_SET_MASK_OK; } -- cgit v1.2.3 From cbaf45a6be497c272e80500e4fd9bccdf20d5050 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 22 Jun 2018 10:52:50 +0100 Subject: irqchip/gic-v2m: Fix SPI release on error path On failing to allocate the required SPIs, the actual number of interrupts should be freed and not its log2 value. Fixes: de337ee30142 ("irqchip/gic-v2m: Add PCI Multi-MSI support") Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Cc: Jason Cooper Cc: Alexandre Belloni Cc: Yang Yingliang Cc: Sumit Garg Link: https://lkml.kernel.org/r/20180622095254.5906-4-marc.zyngier@arm.com --- drivers/irqchip/irq-gic-v2m.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index 0f52d44b3f69..f5fe0100f9ff 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -199,7 +199,7 @@ static int gicv2m_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, fail: irq_domain_free_irqs_parent(domain, virq, nr_irqs); - gicv2m_unalloc_msi(v2m, hwirq, get_count_order(nr_irqs)); + gicv2m_unalloc_msi(v2m, hwirq, nr_irqs); return err; } -- cgit v1.2.3 From c1797b11a09c8323c92b074fd48b89a936c991d0 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 22 Jun 2018 10:52:51 +0100 Subject: irqchip/gic-v3-its: Don't bind LPI to unavailable NUMA node On a NUMA system, if an ITS is local to an offline node, the ITS driver may pick an offline CPU to bind the LPI. In this case, pick an online CPU (and the first one will do). But on some systems, binding an LPI to non-local node CPU may cause deadlock (see Cavium erratum 23144). In this case, just fail the activate and return an error code. Signed-off-by: Yang Yingliang Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Cc: Jason Cooper Cc: Alexandre Belloni Cc: Sumit Garg Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180622095254.5906-5-marc.zyngier@arm.com --- drivers/irqchip/irq-gic-v3-its.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 5377d7e2afba..cae53937feeb 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -2310,7 +2310,14 @@ static int its_irq_domain_activate(struct irq_domain *domain, cpu_mask = cpumask_of_node(its_dev->its->numa_node); /* Bind the LPI to the first possible CPU */ - cpu = cpumask_first(cpu_mask); + cpu = cpumask_first_and(cpu_mask, cpu_online_mask); + if (cpu >= nr_cpu_ids) { + if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) + return -EINVAL; + + cpu = cpumask_first(cpu_online_mask); + } + its_dev->event_map.col_map[event] = cpu; irq_data_update_effective_affinity(d, cpumask_of(cpu)); -- cgit v1.2.3 From 83559b47cdc4d396fc1187a13b527d01b55e0fe6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 22 Jun 2018 10:52:52 +0100 Subject: irqchip/gic-v3-its: Only emit SYNC if targetting a valid collection It is possible, under obscure circumstances, to convince the ITS driver to emit a SYNC operation that targets a collection that is not bound to any redistributor (and the target_address field is zero) because the corresponding CPU has not been seen yet (the system has been booted with max_cpus="something small"). If the ITS is using the linear CPU number as the target, this is not a big deal, as we just end-up issuing a SYNC to CPU0. But if the ITS requires the physical address of the redistributor (with GITS_TYPER.PTA==1), we end-up asking the ITS to write to the physical address zero, which is not exactly a good idea (there has been report of the ITS locking up). This should of course never happen, but hey, this is SW... In order to avoid the above disaster, let's track which collections have been actually initialized, and let's not generate a SYNC if the collection hasn't been properly bound to a redistributor. Take this opportunity to spit our a warning, in the hope that someone may report the issue if it arrises again. Reported-by: Yang Yingliang Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Cc: Jason Cooper Cc: Alexandre Belloni Cc: Sumit Garg Link: https://lkml.kernel.org/r/20180622095254.5906-6-marc.zyngier@arm.com --- drivers/irqchip/irq-gic-v3-its.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index cae53937feeb..fcfc96f8e0de 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -182,6 +182,14 @@ static struct its_collection *dev_event_to_col(struct its_device *its_dev, return its->collections + its_dev->event_map.col_map[event]; } +static struct its_collection *valid_col(struct its_collection *col) +{ + if (WARN_ON_ONCE(col->target_address & GENMASK_ULL(0, 15))) + return NULL; + + return col; +} + /* * ITS command descriptors - parameters to be encoded in a command * block. @@ -439,7 +447,7 @@ static struct its_collection *its_build_mapti_cmd(struct its_node *its, its_fixup_cmd(cmd); - return col; + return valid_col(col); } static struct its_collection *its_build_movi_cmd(struct its_node *its, @@ -458,7 +466,7 @@ static struct its_collection *its_build_movi_cmd(struct its_node *its, its_fixup_cmd(cmd); - return col; + return valid_col(col); } static struct its_collection *its_build_discard_cmd(struct its_node *its, @@ -476,7 +484,7 @@ static struct its_collection *its_build_discard_cmd(struct its_node *its, its_fixup_cmd(cmd); - return col; + return valid_col(col); } static struct its_collection *its_build_inv_cmd(struct its_node *its, @@ -494,7 +502,7 @@ static struct its_collection *its_build_inv_cmd(struct its_node *its, its_fixup_cmd(cmd); - return col; + return valid_col(col); } static struct its_collection *its_build_int_cmd(struct its_node *its, @@ -512,7 +520,7 @@ static struct its_collection *its_build_int_cmd(struct its_node *its, its_fixup_cmd(cmd); - return col; + return valid_col(col); } static struct its_collection *its_build_clear_cmd(struct its_node *its, @@ -530,7 +538,7 @@ static struct its_collection *its_build_clear_cmd(struct its_node *its, its_fixup_cmd(cmd); - return col; + return valid_col(col); } static struct its_collection *its_build_invall_cmd(struct its_node *its, @@ -1824,11 +1832,16 @@ static int its_alloc_tables(struct its_node *its) static int its_alloc_collections(struct its_node *its) { + int i; + its->collections = kcalloc(nr_cpu_ids, sizeof(*its->collections), GFP_KERNEL); if (!its->collections) return -ENOMEM; + for (i = 0; i < nr_cpu_ids; i++) + its->collections[i].target_address = ~0ULL; + return 0; } -- cgit v1.2.3 From 205e065d91d72e6afad112ea84f0ca60b30bf5ab Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 22 Jun 2018 10:52:53 +0100 Subject: irqchip/gic-v3-its: Only emit VSYNC if targetting a valid collection Similarily to the SYNC operation, it must be verified that the VPE targetted by a VLPI is backed by a valid collection in the GIC driver data structures. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Cc: Jason Cooper Cc: Alexandre Belloni Cc: Yang Yingliang Cc: Sumit Garg Link: https://lkml.kernel.org/r/20180622095254.5906-7-marc.zyngier@arm.com --- drivers/irqchip/irq-gic-v3-its.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index fcfc96f8e0de..0269ffb93f6e 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -190,6 +190,14 @@ static struct its_collection *valid_col(struct its_collection *col) return col; } +static struct its_vpe *valid_vpe(struct its_node *its, struct its_vpe *vpe) +{ + if (valid_col(its->collections + vpe->col_idx)) + return vpe; + + return NULL; +} + /* * ITS command descriptors - parameters to be encoded in a command * block. @@ -562,7 +570,7 @@ static struct its_vpe *its_build_vinvall_cmd(struct its_node *its, its_fixup_cmd(cmd); - return desc->its_vinvall_cmd.vpe; + return valid_vpe(its, desc->its_vinvall_cmd.vpe); } static struct its_vpe *its_build_vmapp_cmd(struct its_node *its, @@ -584,7 +592,7 @@ static struct its_vpe *its_build_vmapp_cmd(struct its_node *its, its_fixup_cmd(cmd); - return desc->its_vmapp_cmd.vpe; + return valid_vpe(its, desc->its_vmapp_cmd.vpe); } static struct its_vpe *its_build_vmapti_cmd(struct its_node *its, @@ -607,7 +615,7 @@ static struct its_vpe *its_build_vmapti_cmd(struct its_node *its, its_fixup_cmd(cmd); - return desc->its_vmapti_cmd.vpe; + return valid_vpe(its, desc->its_vmapti_cmd.vpe); } static struct its_vpe *its_build_vmovi_cmd(struct its_node *its, @@ -630,7 +638,7 @@ static struct its_vpe *its_build_vmovi_cmd(struct its_node *its, its_fixup_cmd(cmd); - return desc->its_vmovi_cmd.vpe; + return valid_vpe(its, desc->its_vmovi_cmd.vpe); } static struct its_vpe *its_build_vmovp_cmd(struct its_node *its, @@ -648,7 +656,7 @@ static struct its_vpe *its_build_vmovp_cmd(struct its_node *its, its_fixup_cmd(cmd); - return desc->its_vmovp_cmd.vpe; + return valid_vpe(its, desc->its_vmovp_cmd.vpe); } static u64 its_cmd_ptr_to_offset(struct its_node *its, -- cgit v1.2.3 From 82f499c8811149069ec958b72a86643a7a289b25 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 22 Jun 2018 10:52:54 +0100 Subject: irqchip/gic-v3-its: Fix reprogramming of redistributors on CPU hotplug Enabling LPIs was made a lot stricter recently, by checking that they are disabled before enabling them. By doing so, the CPU hotplug case was missed altogether, which leaves LPIs enabled on hotplug off (expecting the CPU to eventually come back), and won't write a different value anyway on hotplug on. So skip that check if that particular case is detected Fixes: 6eb486b66a30 ("irqchip/gic-v3: Ensure GICR_CTLR.EnableLPI=0 is observed before enabling") Reported-by: Sumit Garg Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Sumit Garg Cc: Jason Cooper Cc: Alexandre Belloni Cc: Yang Yingliang Link: https://lkml.kernel.org/r/20180622095254.5906-8-marc.zyngier@arm.com --- drivers/irqchip/irq-gic-v3-its.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 0269ffb93f6e..d7842d312d3e 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -3427,6 +3427,16 @@ static int redist_disable_lpis(void) u64 timeout = USEC_PER_SEC; u64 val; + /* + * If coming via a CPU hotplug event, we don't need to disable + * LPIs before trying to re-enable them. They are already + * configured and all is well in the world. Detect this case + * by checking the allocation of the pending table for the + * current CPU. + */ + if (gic_data_rdist()->pend_page) + return 0; + if (!gic_rdists_supports_plpis()) { pr_info("CPU%d: LPIs not supported\n", smp_processor_id()); return -ENXIO; -- cgit v1.2.3 From bed9df97b39e73a4607189f2c4b9fb89cc3f7f59 Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 22 Jun 2018 19:35:33 +0800 Subject: irqdesc: Delete irq_desc_get_msi_desc() Function irq_desc_get_msi_desc() is not referenced in the kernel (and does not seem to have been referenced since e39758e0ea76, 3 years ago), so delete it. Signed-off-by: John Garry Signed-off-by: Thomas Gleixner Cc: Cc: Cc: Cc: Cc: Cc: Link: https://lkml.kernel.org/r/1529667333-92959-1-git-send-email-john.garry@huawei.com --- include/linux/irqdesc.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 25b33b664537..dd1e40ddac7d 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -145,11 +145,6 @@ static inline void *irq_desc_get_handler_data(struct irq_desc *desc) return desc->irq_common_data.handler_data; } -static inline struct msi_desc *irq_desc_get_msi_desc(struct irq_desc *desc) -{ - return desc->irq_common_data.msi_desc; -} - /* * Architectures call this to let the generic IRQ layer * handle an interrupt. -- cgit v1.2.3 From 1f74c8a64798e2c488f86efc97e308b85fb7d7aa Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 22 Jun 2018 11:54:28 +0200 Subject: x86/mce: Do not overwrite MCi_STATUS in mce_no_way_out() mce_no_way_out() does a quick check during #MC to see whether some of the MCEs logged would require the kernel to panic immediately. And it passes a struct mce where MCi_STATUS gets written. However, after having saved a valid status value, the next iteration of the loop which goes over the MCA banks on the CPU, overwrites the valid status value because we're using struct mce as storage instead of a temporary variable. Which leads to MCE records with an empty status value: mce: [Hardware Error]: CPU 0: Machine Check Exception: 6 Bank 0: 0000000000000000 mce: [Hardware Error]: RIP 10: {trigger_mce+0x7/0x10} In order to prevent the loss of the status register value, return immediately when severity is a panic one so that we can panic immediately with the first fatal MCE logged. This is also the intention of this function and not to noodle over the banks while a fatal MCE is already logged. Tony: read the rest of the MCA bank to populate the struct mce fully. Suggested-by: Tony Luck Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Link: https://lkml.kernel.org/r/20180622095428.626-8-bp@alien8.de --- arch/x86/kernel/cpu/mcheck/mce.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index cd76380af79f..7e6f51a9d917 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -772,23 +772,25 @@ EXPORT_SYMBOL_GPL(machine_check_poll); static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, struct pt_regs *regs) { - int i, ret = 0; char *tmp; + int i; for (i = 0; i < mca_cfg.banks; i++) { m->status = mce_rdmsrl(msr_ops.status(i)); - if (m->status & MCI_STATUS_VAL) { - __set_bit(i, validp); - if (quirk_no_way_out) - quirk_no_way_out(i, m, regs); - } + if (!(m->status & MCI_STATUS_VAL)) + continue; + + __set_bit(i, validp); + if (quirk_no_way_out) + quirk_no_way_out(i, m, regs); if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { + mce_read_aux(m, i); *msg = tmp; - ret = 1; + return 1; } } - return ret; + return 0; } /* -- cgit v1.2.3 From 40c36e2741d7fe1e66d6ec55477ba5fd19c9c5d2 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 22 Jun 2018 11:54:23 +0200 Subject: x86/mce: Fix incorrect "Machine check from unknown source" message Some injection testing resulted in the following console log: mce: [Hardware Error]: CPU 22: Machine Check Exception: f Bank 1: bd80000000100134 mce: [Hardware Error]: RIP 10: {pmem_do_bvec+0x11d/0x330 [nd_pmem]} mce: [Hardware Error]: TSC c51a63035d52 ADDR 3234bc4000 MISC 88 mce: [Hardware Error]: PROCESSOR 0:50654 TIME 1526502199 SOCKET 0 APIC 38 microcode 2000043 mce: [Hardware Error]: Run the above through 'mcelog --ascii' Kernel panic - not syncing: Machine check from unknown source This confused everybody because the first line quite clearly shows that we found a logged error in "Bank 1", while the last line says "unknown source". The problem is that the Linux code doesn't do the right thing for a local machine check that results in a fatal error. It turns out that we know very early in the handler whether the machine check is fatal. The call to mce_no_way_out() has checked all the banks for the CPU that took the local machine check. If it says we must crash, we can do so right away with the right messages. We do scan all the banks again. This means that we might initially not see a problem, but during the second scan find something fatal. If this happens we print a slightly different message (so I can see if it actually every happens). [ bp: Remove unneeded severity assignment. ] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Ashok Raj Cc: Dan Williams Cc: Qiuxu Zhuo Cc: linux-edac Cc: stable@vger.kernel.org # 4.2 Link: http://lkml.kernel.org/r/52e049a497e86fd0b71c529651def8871c804df0.1527283897.git.tony.luck@intel.com --- arch/x86/kernel/cpu/mcheck/mce.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7e6f51a9d917..e93670d736a6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1207,13 +1207,18 @@ void do_machine_check(struct pt_regs *regs, long error_code) lmce = m.mcgstatus & MCG_STATUS_LMCES; /* + * Local machine check may already know that we have to panic. + * Broadcast machine check begins rendezvous in mce_start() * Go through all banks in exclusion of the other CPUs. This way we * don't report duplicated events on shared banks because the first one - * to see it will clear it. If this is a Local MCE, then no need to - * perform rendezvous. + * to see it will clear it. */ - if (!lmce) + if (lmce) { + if (no_way_out) + mce_panic("Fatal local machine check", &m, msg); + } else { order = mce_start(&no_way_out); + } for (i = 0; i < cfg->banks; i++) { __clear_bit(i, toclear); @@ -1289,12 +1294,17 @@ void do_machine_check(struct pt_regs *regs, long error_code) no_way_out = worst >= MCE_PANIC_SEVERITY; } else { /* - * Local MCE skipped calling mce_reign() - * If we found a fatal error, we need to panic here. + * If there was a fatal machine check we should have + * already called mce_panic earlier in this function. + * Since we re-read the banks, we might have found + * something new. Check again to see if we found a + * fatal error. We call "mce_severity()" again to + * make sure we have the right "msg". */ - if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) - mce_panic("Machine check from unknown source", - NULL, NULL); + if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { + mce_severity(&m, cfg->tolerant, &msg, true); + mce_panic("Local fatal machine check!", &m, msg); + } } /* -- cgit v1.2.3 From 0218c766263e70795c5eaa17d75ed54bca350950 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Fri, 22 Jun 2018 13:51:26 +0200 Subject: x86/microcode/intel: Fix memleak in save_microcode_patch() Free useless ucode_patch entry when it's replaced. [ bp: Drop the memfree_patch() two-liner. ] Signed-off-by: Zhenzhong Duan Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Srinivas REDDY Eeda Link: http://lkml.kernel.org/r/888102f0-fd22-459d-b090-a1bd8a00cb2b@default --- arch/x86/kernel/cpu/microcode/intel.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 1c2cfa0644aa..97ccf4c3b45b 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -190,8 +190,11 @@ static void save_microcode_patch(void *data, unsigned int size) p = memdup_patch(data, size); if (!p) pr_err("Error allocating buffer %p\n", data); - else + else { list_replace(&iter->plist, &p->plist); + kfree(iter->data); + kfree(iter); + } } } -- cgit v1.2.3 From 0447378a4a793da008451fad50bc0f93e9675ae6 Mon Sep 17 00:00:00 2001 From: Marc Orr Date: Wed, 20 Jun 2018 17:21:29 -0700 Subject: kvm: vmx: Nested VM-entry prereqs for event inj. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch extends the checks done prior to a nested VM entry. Specifically, it extends the check_vmentry_prereqs function with checks for fields relevant to the VM-entry event injection information, as described in the Intel SDM, volume 3. This patch is motivated by a syzkaller bug, where a bad VM-entry interruption information field is generated in the VMCS02, which causes the nested VM launch to fail. Then, KVM fails to resume L1. While KVM should be improved to correctly resume L1 execution after a failed nested launch, this change is justified because the existing code to resume L1 is flaky/ad-hoc and the test coverage for resuming L1 is sparse. Reported-by: syzbot Signed-off-by: Marc Orr [Removed comment whose parts were describing previous revisions and the rest was obvious from function/variable naming. - Radim] Signed-off-by: Radim Krčmář --- arch/x86/include/asm/vmx.h | 3 +++ arch/x86/kvm/vmx.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.h | 9 +++++++ 3 files changed, 79 insertions(+) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 425e6b8b9547..6aa8499e1f62 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -114,6 +114,7 @@ #define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f #define VMX_MISC_SAVE_EFER_LMA 0x00000020 #define VMX_MISC_ACTIVITY_HLT 0x00000040 +#define VMX_MISC_ZERO_LEN_INS 0x40000000 /* VMFUNC functions */ #define VMX_VMFUNC_EPTP_SWITCHING 0x00000001 @@ -351,11 +352,13 @@ enum vmcs_field { #define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ +#define INTR_TYPE_RESERVED (1 << 8) /* reserved */ #define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */ #define INTR_TYPE_HARD_EXCEPTION (3 << 8) /* processor exception */ #define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ #define INTR_TYPE_PRIV_SW_EXCEPTION (5 << 8) /* ICE breakpoint - undocumented */ #define INTR_TYPE_SOFT_EXCEPTION (6 << 8) /* software exception */ +#define INTR_TYPE_OTHER_EVENT (7 << 8) /* other event */ /* GUEST_INTERRUPTIBILITY_INFO flags. */ #define GUEST_INTR_STATE_STI 0x00000001 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 559a12b6184d..1689f433f3a0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1705,6 +1705,17 @@ static inline bool nested_cpu_has_vmwrite_any_field(struct kvm_vcpu *vcpu) MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS; } +static inline bool nested_cpu_has_zero_length_injection(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.misc_low & VMX_MISC_ZERO_LEN_INS; +} + +static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.procbased_ctls_high & + CPU_BASED_MONITOR_TRAP_FLAG; +} + static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit) { return vmcs12->cpu_based_vm_exec_control & bit; @@ -11620,6 +11631,62 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) !nested_cr3_valid(vcpu, vmcs12->host_cr3)) return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD; + /* + * From the Intel SDM, volume 3: + * Fields relevant to VM-entry event injection must be set properly. + * These fields are the VM-entry interruption-information field, the + * VM-entry exception error code, and the VM-entry instruction length. + */ + if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { + u32 intr_info = vmcs12->vm_entry_intr_info_field; + u8 vector = intr_info & INTR_INFO_VECTOR_MASK; + u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; + bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; + bool should_have_error_code; + bool urg = nested_cpu_has2(vmcs12, + SECONDARY_EXEC_UNRESTRICTED_GUEST); + bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; + + /* VM-entry interruption-info field: interruption type */ + if (intr_type == INTR_TYPE_RESERVED || + (intr_type == INTR_TYPE_OTHER_EVENT && + !nested_cpu_supports_monitor_trap_flag(vcpu))) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + /* VM-entry interruption-info field: vector */ + if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || + (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || + (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + /* VM-entry interruption-info field: deliver error code */ + should_have_error_code = + intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && + x86_exception_has_error_code(vector); + if (has_error_code != should_have_error_code) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + /* VM-entry exception error code */ + if (has_error_code && + vmcs12->vm_entry_exception_error_code & GENMASK(31, 15)) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + /* VM-entry interruption-info field: reserved bits */ + if (intr_info & INTR_INFO_RESVD_BITS_MASK) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + + /* VM-entry instruction length */ + switch (intr_type) { + case INTR_TYPE_SOFT_EXCEPTION: + case INTR_TYPE_SOFT_INTR: + case INTR_TYPE_PRIV_SW_EXCEPTION: + if ((vmcs12->vm_entry_instruction_len > 15) || + (vmcs12->vm_entry_instruction_len == 0 && + !nested_cpu_has_zero_length_injection(vcpu))) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + } + } + return 0; } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 331993c49dae..257f27620bc2 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -110,6 +110,15 @@ static inline bool is_la57_mode(struct kvm_vcpu *vcpu) #endif } +static inline bool x86_exception_has_error_code(unsigned int vector) +{ + static u32 exception_has_error_code = BIT(DF_VECTOR) | BIT(TS_VECTOR) | + BIT(NP_VECTOR) | BIT(SS_VECTOR) | BIT(GP_VECTOR) | + BIT(PF_VECTOR) | BIT(AC_VECTOR); + + return (1U << vector) & exception_has_error_code; +} + static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) { return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; -- cgit v1.2.3 From 2ddc649810133fcf8e5282eea898ee7ececf161e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 22 Jun 2018 16:56:14 +0200 Subject: KVM: fix KVM_CAP_HYPERV_TLBFLUSH paragraph number MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM_CAP_HYPERV_TLBFLUSH collided with KVM_CAP_S390_PSW-BPB, its paragraph number should now be 8.18. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Radim Krčmář --- Documentation/virtual/kvm/api.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 495b7742ab58..d10944e619d3 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4610,7 +4610,7 @@ This capability indicates that kvm will implement the interfaces to handle reset, migration and nested KVM for branch prediction blocking. The stfle facility 82 should not be provided to the guest without this capability. -8.14 KVM_CAP_HYPERV_TLBFLUSH +8.18 KVM_CAP_HYPERV_TLBFLUSH Architectures: x86 -- cgit v1.2.3 From abcbcb80cd09cd40f2089d912764e315459b71f7 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 22 Jun 2018 16:33:57 +0200 Subject: time: Make sure jiffies_to_msecs() preserves non-zero time periods MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For the common cases where 1000 is a multiple of HZ, or HZ is a multiple of 1000, jiffies_to_msecs() never returns zero when passed a non-zero time period. However, if HZ > 1000 and not an integer multiple of 1000 (e.g. 1024 or 1200, as used on alpha and DECstation), jiffies_to_msecs() may return zero for small non-zero time periods. This may break code that relies on receiving back a non-zero value. jiffies_to_usecs() does not need such a fix: one jiffy can only be less than one µs if HZ > 1000000, and such large values of HZ are already rejected at build time, twice: - include/linux/jiffies.h does #error if HZ >= 12288, - kernel/time/time.c has BUILD_BUG_ON(HZ > USEC_PER_SEC). Broken since forever. Signed-off-by: Geert Uytterhoeven Signed-off-by: Thomas Gleixner Reviewed-by: Arnd Bergmann Cc: John Stultz Cc: Stephen Boyd Cc: linux-alpha@vger.kernel.org Cc: linux-mips@linux-mips.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180622143357.7495-1-geert@linux-m68k.org --- kernel/time/time.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/time/time.c b/kernel/time/time.c index 6fa99213fc72..2b41e8e2d31d 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -28,6 +28,7 @@ */ #include +#include #include #include #include @@ -314,9 +315,10 @@ unsigned int jiffies_to_msecs(const unsigned long j) return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); #else # if BITS_PER_LONG == 32 - return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; + return (HZ_TO_MSEC_MUL32 * j + (1ULL << HZ_TO_MSEC_SHR32) - 1) >> + HZ_TO_MSEC_SHR32; # else - return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN; + return DIV_ROUND_UP(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN); # endif #endif } -- cgit v1.2.3 From 48e315618dc4dc8904182cd221e3d395d5d97005 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 22 Jun 2018 12:08:20 +0200 Subject: MAINTAINERS: Add file patterns for x86 device tree bindings Submitters of device tree binding documentation may forget to CC the subsystem maintainer if this is missing. Signed-off-by: Geert Uytterhoeven Signed-off-by: Thomas Gleixner Cc: "H . Peter Anvin" Cc: Rob Herring Cc: Mark Rutland Cc: devicetree@vger.kernel.org Link: https://lkml.kernel.org/r/20180622100820.29616-1-geert@linux-m68k.org --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 9c125f705f78..60929873b900 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15366,6 +15366,7 @@ M: x86@kernel.org L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/core S: Maintained +F: Documentation/devicetree/bindings/x86/ F: Documentation/x86/ F: arch/x86/ -- cgit v1.2.3 From b5b7dd647f2d21b93f734ce890671cd908e69b0a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 22 Jun 2018 10:25:25 +0100 Subject: arm64: kpti: Use early_param for kpti= command-line option We inspect __kpti_forced early on as part of the cpufeature enable callback which remaps the swapper page table using non-global entries. Ensure that __kpti_forced has been updated to reflect the kpti= command-line option before we start using it. Fixes: ea1e3de85e94 ("arm64: entry: Add fake CPU feature for unmapping the kernel at EL0") Cc: # 4.16.x- Reported-by: Wei Xu Tested-by: Sudeep Holla Tested-by: Wei Xu Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpufeature.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index d2856b129097..f24892a40d2c 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -937,7 +937,7 @@ static int __init parse_kpti(char *str) __kpti_forced = enabled ? 1 : -1; return 0; } -__setup("kpti=", parse_kpti); +early_param("kpti", parse_kpti); #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ #ifdef CONFIG_ARM64_HW_AFDBM -- cgit v1.2.3 From 71c8fc0c96abf8e53e74ed4d891d671e585f9076 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 22 Jun 2018 16:23:45 +0100 Subject: arm64: mm: Ensure writes to swapper are ordered wrt subsequent cache maintenance When rewriting swapper using nG mappings, we must performance cache maintenance around each page table access in order to avoid coherency problems with the host's cacheable alias under KVM. To ensure correct ordering of the maintenance with respect to Device memory accesses made with the Stage-1 MMU disabled, DMBs need to be added between the maintenance and the corresponding memory access. This patch adds a missing DMB between writing a new page table entry and performing a clean+invalidate on the same line. Fixes: f992b4dfd58b ("arm64: kpti: Add ->enable callback to remap swapper using nG mappings") Cc: # 4.16.x- Acked-by: Mark Rutland Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/mm/proc.S | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 5f9a73a4452c..03646e6a2ef4 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -217,8 +217,9 @@ ENDPROC(idmap_cpu_replace_ttbr1) .macro __idmap_kpti_put_pgtable_ent_ng, type orr \type, \type, #PTE_NG // Same bit for blocks and pages - str \type, [cur_\()\type\()p] // Update the entry and ensure it - dc civac, cur_\()\type\()p // is visible to all CPUs. + str \type, [cur_\()\type\()p] // Update the entry and ensure + dmb sy // that it is visible to all + dc civac, cur_\()\type\()p // CPUs. .endm /* -- cgit v1.2.3 From 784e0300fe9fe4aa81bd7df9d59e138f56bb605b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 22 Jun 2018 11:45:07 +0100 Subject: rseq: Avoid infinite recursion when delivering SIGSEGV When delivering a signal to a task that is using rseq, we call into __rseq_handle_notify_resume() so that the registers pushed in the sigframe are updated to reflect the state of the restartable sequence (for example, ensuring that the signal returns to the abort handler if necessary). However, if the rseq management fails due to an unrecoverable fault when accessing userspace or certain combinations of RSEQ_CS_* flags, then we will attempt to deliver a SIGSEGV. This has the potential for infinite recursion if the rseq code continuously fails on signal delivery. Avoid this problem by using force_sigsegv() instead of force_sig(), which is explicitly designed to reset the SEGV handler to SIG_DFL in the case of a recursive fault. In doing so, remove rseq_signal_deliver() from the internal rseq API and have an optional struct ksignal * parameter to rseq_handle_notify_resume() instead. Signed-off-by: Will Deacon Signed-off-by: Thomas Gleixner Acked-by: Mathieu Desnoyers Cc: peterz@infradead.org Cc: paulmck@linux.vnet.ibm.com Cc: boqun.feng@gmail.com Link: https://lkml.kernel.org/r/1529664307-983-1-git-send-email-will.deacon@arm.com --- arch/arm/kernel/signal.c | 4 ++-- arch/powerpc/kernel/signal.c | 4 ++-- arch/x86/entry/common.c | 2 +- arch/x86/kernel/signal.c | 2 +- include/linux/sched.h | 18 +++++++++++------- kernel/rseq.c | 7 ++++--- 6 files changed, 21 insertions(+), 16 deletions(-) diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index f09e9d66d605..dec130e7078c 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -544,7 +544,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) * Increment event counter and perform fixup for the pre-signal * frame. */ - rseq_signal_deliver(regs); + rseq_signal_deliver(ksig, regs); /* * Set up the stack frame @@ -666,7 +666,7 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) } else { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - rseq_handle_notify_resume(regs); + rseq_handle_notify_resume(NULL, regs); } } local_irq_disable(); diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index 17fe4339ba59..b3e8db376ecd 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -134,7 +134,7 @@ static void do_signal(struct task_struct *tsk) /* Re-enable the breakpoints for the signal stack */ thread_change_pc(tsk, tsk->thread.regs); - rseq_signal_deliver(tsk->thread.regs); + rseq_signal_deliver(&ksig, tsk->thread.regs); if (is32) { if (ksig.ka.sa.sa_flags & SA_SIGINFO) @@ -170,7 +170,7 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - rseq_handle_notify_resume(regs); + rseq_handle_notify_resume(NULL, regs); } user_enter(); diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 92190879b228..3b2490b81918 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -164,7 +164,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) if (cached_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); - rseq_handle_notify_resume(regs); + rseq_handle_notify_resume(NULL, regs); } if (cached_flags & _TIF_USER_RETURN_NOTIFY) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 445ca11ff863..92a3b312a53c 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -692,7 +692,7 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) * Increment event counter and perform fixup for the pre-signal * frame. */ - rseq_signal_deliver(regs); + rseq_signal_deliver(ksig, regs); /* Set up the stack frame */ if (is_ia32_frame(ksig)) { diff --git a/include/linux/sched.h b/include/linux/sched.h index c1882643d455..9256118bd40c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1799,20 +1799,22 @@ static inline void rseq_set_notify_resume(struct task_struct *t) set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); } -void __rseq_handle_notify_resume(struct pt_regs *regs); +void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); -static inline void rseq_handle_notify_resume(struct pt_regs *regs) +static inline void rseq_handle_notify_resume(struct ksignal *ksig, + struct pt_regs *regs) { if (current->rseq) - __rseq_handle_notify_resume(regs); + __rseq_handle_notify_resume(ksig, regs); } -static inline void rseq_signal_deliver(struct pt_regs *regs) +static inline void rseq_signal_deliver(struct ksignal *ksig, + struct pt_regs *regs) { preempt_disable(); __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); preempt_enable(); - rseq_handle_notify_resume(regs); + rseq_handle_notify_resume(ksig, regs); } /* rseq_preempt() requires preemption to be disabled. */ @@ -1861,10 +1863,12 @@ static inline void rseq_execve(struct task_struct *t) static inline void rseq_set_notify_resume(struct task_struct *t) { } -static inline void rseq_handle_notify_resume(struct pt_regs *regs) +static inline void rseq_handle_notify_resume(struct ksignal *ksig, + struct pt_regs *regs) { } -static inline void rseq_signal_deliver(struct pt_regs *regs) +static inline void rseq_signal_deliver(struct ksignal *ksig, + struct pt_regs *regs) { } static inline void rseq_preempt(struct task_struct *t) diff --git a/kernel/rseq.c b/kernel/rseq.c index ae306f90c514..22b6acf1ad63 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -251,10 +251,10 @@ static int rseq_ip_fixup(struct pt_regs *regs) * respect to other threads scheduled on the same CPU, and with respect * to signal handlers. */ -void __rseq_handle_notify_resume(struct pt_regs *regs) +void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { struct task_struct *t = current; - int ret; + int ret, sig; if (unlikely(t->flags & PF_EXITING)) return; @@ -268,7 +268,8 @@ void __rseq_handle_notify_resume(struct pt_regs *regs) return; error: - force_sig(SIGSEGV, t); + sig = ksig ? ksig->sig : 0; + force_sigsegv(sig, t); } #ifdef CONFIG_DEBUG_RSEQ -- cgit v1.2.3 From 0ae52ddf5bd7f685bb43d7687290f6c2eeacfb31 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 22 Jun 2018 13:05:35 +0200 Subject: lightnvm: Remove depends on HAS_DMA in case of platform dependency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove dependencies on HAS_DMA where a Kconfig symbol depends on another symbol that implies HAS_DMA, and, optionally, on "|| COMPILE_TEST". In most cases this other symbol is an architecture or platform specific symbol, or PCI. Generic symbols and drivers without platform dependencies keep their dependencies on HAS_DMA, to prevent compiling subsystems or drivers that cannot work anyway. This simplifies the dependencies, and allows to improve compile-testing. Signed-off-by: Geert Uytterhoeven Reviewed-by: Mark Brown Acked-by: Robin Murphy Reviewed-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index 10c08982185a..9c03f35d9df1 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig @@ -4,7 +4,7 @@ menuconfig NVM bool "Open-Channel SSD target support" - depends on BLOCK && HAS_DMA && PCI + depends on BLOCK && PCI select BLK_DEV_NVME help Say Y here to get to enable Open-channel SSDs. -- cgit v1.2.3 From 3ee7e8697d5860b173132606d80a9cd35e7113ee Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 18 Jun 2018 15:46:58 +0200 Subject: bdi: Fix another oops in wb_workfn() syzbot is reporting NULL pointer dereference at wb_workfn() [1] due to wb->bdi->dev being NULL. And Dmitry confirmed that wb->state was WB_shutting_down after wb->bdi->dev became NULL. This indicates that unregister_bdi() failed to call wb_shutdown() on one of wb objects. The problem is in cgwb_bdi_unregister() which does cgwb_kill() and thus drops bdi's reference to wb structures before going through the list of wbs again and calling wb_shutdown() on each of them. This way the loop iterating through all wbs can easily miss a wb if that wb has already passed through cgwb_remove_from_bdi_list() called from wb_shutdown() from cgwb_release_workfn() and as a result fully shutdown bdi although wb_workfn() for this wb structure is still running. In fact there are also other ways cgwb_bdi_unregister() can race with cgwb_release_workfn() leading e.g. to use-after-free issues: CPU1 CPU2 cgwb_bdi_unregister() cgwb_kill(*slot); cgwb_release() queue_work(cgwb_release_wq, &wb->release_work); cgwb_release_workfn() wb = list_first_entry(&bdi->wb_list, ...) spin_unlock_irq(&cgwb_lock); wb_shutdown(wb); ... kfree_rcu(wb, rcu); wb_shutdown(wb); -> oops use-after-free We solve these issues by synchronizing writeback structure shutdown from cgwb_bdi_unregister() with cgwb_release_workfn() using a new mutex. That way we also no longer need synchronization using WB_shutting_down as the mutex provides it for CONFIG_CGROUP_WRITEBACK case and without CONFIG_CGROUP_WRITEBACK wb_shutdown() can be called only once from bdi_unregister(). Reported-by: syzbot Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 2 +- mm/backing-dev.c | 20 +++++++------------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 0bd432a4d7bd..24251762c20c 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -22,7 +22,6 @@ struct dentry; */ enum wb_state { WB_registered, /* bdi_register() was done */ - WB_shutting_down, /* wb_shutdown() in progress */ WB_writeback_running, /* Writeback is in progress */ WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ WB_start_all, /* nr_pages == 0 (all) work pending */ @@ -189,6 +188,7 @@ struct backing_dev_info { #ifdef CONFIG_CGROUP_WRITEBACK struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ struct rb_root cgwb_congested_tree; /* their congested states */ + struct mutex cgwb_release_mutex; /* protect shutdown of wb structs */ #else struct bdi_writeback_congested *wb_congested; #endif diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 347cc834c04a..2e5d3df0853d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -359,15 +359,8 @@ static void wb_shutdown(struct bdi_writeback *wb) spin_lock_bh(&wb->work_lock); if (!test_and_clear_bit(WB_registered, &wb->state)) { spin_unlock_bh(&wb->work_lock); - /* - * Wait for wb shutdown to finish if someone else is just - * running wb_shutdown(). Otherwise we could proceed to wb / - * bdi destruction before wb_shutdown() is finished. - */ - wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE); return; } - set_bit(WB_shutting_down, &wb->state); spin_unlock_bh(&wb->work_lock); cgwb_remove_from_bdi_list(wb); @@ -379,12 +372,6 @@ static void wb_shutdown(struct bdi_writeback *wb) mod_delayed_work(bdi_wq, &wb->dwork, 0); flush_delayed_work(&wb->dwork); WARN_ON(!list_empty(&wb->work_list)); - /* - * Make sure bit gets cleared after shutdown is finished. Matches with - * the barrier provided by test_and_clear_bit() above. - */ - smp_wmb(); - clear_and_wake_up_bit(WB_shutting_down, &wb->state); } static void wb_exit(struct bdi_writeback *wb) @@ -508,10 +495,12 @@ static void cgwb_release_workfn(struct work_struct *work) struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); + mutex_lock(&wb->bdi->cgwb_release_mutex); wb_shutdown(wb); css_put(wb->memcg_css); css_put(wb->blkcg_css); + mutex_unlock(&wb->bdi->cgwb_release_mutex); fprop_local_destroy_percpu(&wb->memcg_completions); percpu_ref_exit(&wb->refcnt); @@ -697,6 +686,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); bdi->cgwb_congested_tree = RB_ROOT; + mutex_init(&bdi->cgwb_release_mutex); ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (!ret) { @@ -717,7 +707,10 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi) spin_lock_irq(&cgwb_lock); radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) cgwb_kill(*slot); + spin_unlock_irq(&cgwb_lock); + mutex_lock(&bdi->cgwb_release_mutex); + spin_lock_irq(&cgwb_lock); while (!list_empty(&bdi->wb_list)) { wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, bdi_node); @@ -726,6 +719,7 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi) spin_lock_irq(&cgwb_lock); } spin_unlock_irq(&cgwb_lock); + mutex_unlock(&bdi->cgwb_release_mutex); } /** -- cgit v1.2.3 From 964d978433a4b9aa1368ff71227ca0027dd1e32f Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Wed, 13 Jun 2018 13:43:10 -0500 Subject: x86/CPU/AMD: Fix LLC ID bit-shift calculation The current logic incorrectly calculates the LLC ID from the APIC ID. Unless specified otherwise, the LLC ID should be calculated by removing the Core and Thread ID bits from the least significant end of the APIC ID. For more info, see "ApicId Enumeration Requirements" in any Fam17h PPR document. [ bp: Improve commit message. ] Fixes: 68091ee7ac3c ("Calculate last level cache ID from number of sharing threads") Signed-off-by: Suravee Suthikulpanit Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/r/1528915390-30533-1-git-send-email-suravee.suthikulpanit@amd.com --- arch/x86/kernel/cpu/cacheinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 38354c66df81..0c5fcbd998cf 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -671,7 +671,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id) num_sharing_cache = ((eax >> 14) & 0xfff) + 1; if (num_sharing_cache) { - int bits = get_count_order(num_sharing_cache) - 1; + int bits = get_count_order(num_sharing_cache); per_cpu(cpu_llc_id, cpu) = c->apicid >> bits; } -- cgit v1.2.3 From 51be1335151771075dcb19f3464ca9f331134285 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Sat, 23 Jun 2018 01:08:40 +0300 Subject: Revert "x86/mm: Mark __pgtable_l5_enabled __initdata" This reverts commit e4e961e36f063484c48bed919013c106d178995d. We need to use early version of pgtable_l5_enabled() in early_identify_cpu() as this code runs before cpu_feature_enabled() is usable. But it leads to section mismatch: cpu_init() load_mm_ldt() ldt_slot_va() LDT_BASE_ADDR LDT_PGD_ENTRY pgtable_l5_enabled() __pgtable_l5_enabled __pgtable_l5_enabled marked as __initdata, but cpu_init() is not __init. It's fixable: early code can be isolated into a separate translation unit, but such change collides with other work in the area. That's too much hassle to save 4 bytes of memory. Return __pgtable_l5_enabled back to be __ro_after_init. Signed-off-by: Kirill A. Shutemov Signed-off-by: Thomas Gleixner Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/20180622220841.54135-2-kirill.shutemov@linux.intel.com --- arch/x86/kernel/head64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index a21d6ace648e..8047379e575a 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -44,7 +44,7 @@ static unsigned int __initdata next_early_pgt; pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); #ifdef CONFIG_X86_5LEVEL -unsigned int __pgtable_l5_enabled __initdata; +unsigned int __pgtable_l5_enabled __ro_after_init; unsigned int pgdir_shift __ro_after_init = 39; EXPORT_SYMBOL(pgdir_shift); unsigned int ptrs_per_p4d __ro_after_init = 1; -- cgit v1.2.3 From 2458e53ff74cd1063ed3e00459da1d35c559d369 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Sat, 23 Jun 2018 01:08:41 +0300 Subject: x86/mm: Fix 'no5lvl' handling early_identify_cpu() has to use early version of pgtable_l5_enabled() that doesn't rely on cpu_feature_enabled(). Defining USE_EARLY_PGTABLE_L5 before all includes does the trick. I lost the define in one of reworks of the original patch. Fixes: 372fddf70904 ("x86/mm: Introduce the 'no5lvl' kernel parameter") Signed-off-by: Kirill A. Shutemov Signed-off-by: Thomas Gleixner Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/20180622220841.54135-3-kirill.shutemov@linux.intel.com --- arch/x86/kernel/cpu/common.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0df7151cfef4..eb4cb3efd20e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1,3 +1,6 @@ +/* cpu_feature_enabled() cannot be used this early */ +#define USE_EARLY_PGTABLE_L5 + #include #include #include -- cgit v1.2.3 From f5e350f021e04ea41d2e5d58487c33b05ba3d25b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 22 Jun 2018 13:18:09 -0700 Subject: blk-mq: Fix timeout handling in case the timeout handler returns BLK_EH_DONE Make sure that RQF_TIMED_OUT is cleared when a request is reused after a block driver timeout handler has returned BLK_EH_DONE. Fixes: da6612673988 ("blk-mq: don't time out requests again that are in the timeout handler") Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Jianchao Wang Cc: Andrew Randrianasulu Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 - block/blk-timeout.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 8e57b84e50e9..b6888ff556cf 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -781,7 +781,6 @@ static void blk_mq_rq_timed_out(struct request *req, bool reserved) WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER); } - req->rq_flags &= ~RQF_TIMED_OUT; blk_add_timer(req); } diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 4b8a48d48ba1..f2cfd56e1606 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -210,6 +210,7 @@ void blk_add_timer(struct request *req) if (!req->timeout) req->timeout = q->rq_timeout; + req->rq_flags &= ~RQF_TIMED_OUT; blk_rq_set_deadline(req, jiffies + req->timeout); /* -- cgit v1.2.3 From 2e6eb40ca5eb53836d18f3b9ac61ff2e0b417038 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 23 Jun 2018 23:19:03 +0200 Subject: efi/x86: Fix incorrect invocation of PciIo->Attributes() The following commit: 2c3625cb9fa2 ("efi/x86: Fold __setup_efi_pci32() and __setup_efi_pci64() into one function") ... merged the two versions of __setup_efi_pciXX(), without taking into account that the 32-bit version used a rather dodgy trick to pass an immediate 0 constant as argument for a uint64_t parameter. The issue is caused by the fact that on x86, UEFI protocol method calls are redirected via struct efi_config::call(), which is a variadic function, and so the compiler has to infer the types of the parameters from the arguments rather than from the prototype. As the 32-bit x86 calling convention passes arguments via the stack, passing the unqualified constant 0 twice is the same as passing 0ULL, which is why the 32-bit code in __setup_efi_pci32() contained the following call: status = efi_early->call(pci->attributes, pci, EfiPciIoAttributeOperationGet, 0, 0, &attributes); to invoke this UEFI protocol method: typedef EFI_STATUS (EFIAPI *EFI_PCI_IO_PROTOCOL_ATTRIBUTES) ( IN EFI_PCI_IO_PROTOCOL *This, IN EFI_PCI_IO_PROTOCOL_ATTRIBUTE_OPERATION Operation, IN UINT64 Attributes, OUT UINT64 *Result OPTIONAL ); After the merge, we inadvertently ended up with this version for both 32-bit and 64-bit builds, breaking the latter. So replace the two zeroes with the explicitly typed constant 0ULL, which works as expected on both 32-bit and 64-bit builds. Wilfried tested the 64-bit build, and I checked the generated assembly of a 32-bit build with and without this patch, and they are identical. Reported-by: Wilfried Klaebe Tested-by: Wilfried Klaebe Signed-off-by: Ard Biesheuvel Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: hdegoede@redhat.com Cc: linux-efi@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/eboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index a8a8642d2b0b..e57665b4ba1c 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -118,7 +118,7 @@ __setup_efi_pci(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom) void *romimage; status = efi_call_proto(efi_pci_io_protocol, attributes, pci, - EfiPciIoAttributeOperationGet, 0, 0, + EfiPciIoAttributeOperationGet, 0ULL, &attributes); if (status != EFI_SUCCESS) return status; -- cgit v1.2.3 From 7daf201d7fe8334e2d2364d4e8ed3394ec9af819 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 24 Jun 2018 20:54:29 +0800 Subject: Linux 4.18-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ca2af1ab91eb..c9132594860b 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 4 PATCHLEVEL = 18 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Merciless Moray # *DOCUMENTATION* -- cgit v1.2.3 From d48de54a9dab5370edd2e991f78cc7996cf5483e Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 28 Jun 2018 15:20:27 +0200 Subject: printk: Export is_console_locked This is a preparation patch for adding a number of WARN_CONSOLE_UNLOCKED() calls to the fbcon code, which may be built as a module (event though usually it is not). Acked-by: Steven Rostedt (VMware) Acked-by: Sergey Senozhatsky Acked-by: Petr Mladek Reviewed-by: Daniel Vetter Signed-off-by: Hans de Goede Signed-off-by: Bartlomiej Zolnierkiewicz --- kernel/printk/printk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 247808333ba4..3f041e7cbfc9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2243,6 +2243,7 @@ int is_console_locked(void) { return console_locked; } +EXPORT_SYMBOL(is_console_locked); /* * Check if we have any console that is capable of printing while cpu is -- cgit v1.2.3 From 3bd3a0e330aae4fffa8028aba2407ef615ab040b Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 28 Jun 2018 15:20:28 +0200 Subject: fbcon: Call WARN_CONSOLE_UNLOCKED() where applicable Replace comments about places where the console lock should be held with calls to WARN_CONSOLE_UNLOCKED() to assert that it is actually held. Acked-by: Steven Rostedt (VMware) Reviewed-by: Daniel Vetter Reviewed-by: Sergey Senozhatsky Signed-off-by: Hans de Goede Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/video/fbdev/core/fbcon.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index c910e74d46ff..cd8d52a967aa 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -828,6 +828,8 @@ static int set_con2fb_map(int unit, int newidx, int user) struct fb_info *oldinfo = NULL; int found, err = 0; + WARN_CONSOLE_UNLOCKED(); + if (oldidx == newidx) return 0; @@ -3044,6 +3046,8 @@ static int fbcon_fb_unbind(int idx) { int i, new_idx = -1, ret = 0; + WARN_CONSOLE_UNLOCKED(); + if (!fbcon_has_console_bind) return 0; @@ -3094,6 +3098,8 @@ static int fbcon_fb_unregistered(struct fb_info *info) { int i, idx; + WARN_CONSOLE_UNLOCKED(); + idx = info->node; for (i = first_fb_vc; i <= last_fb_vc; i++) { if (con2fb_map[i] == idx) @@ -3131,6 +3137,9 @@ static int fbcon_fb_unregistered(struct fb_info *info) static void fbcon_remap_all(int idx) { int i; + + WARN_CONSOLE_UNLOCKED(); + for (i = first_fb_vc; i <= last_fb_vc; i++) set_con2fb_map(i, idx, 0); @@ -3177,6 +3186,8 @@ static int fbcon_fb_registered(struct fb_info *info) { int ret = 0, i, idx; + WARN_CONSOLE_UNLOCKED(); + idx = info->node; fbcon_select_primary(info); -- cgit v1.2.3 From 83d83bebf40132e2d55ec58af666713cc76f9764 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 28 Jun 2018 15:20:30 +0200 Subject: console/fbcon: Add support for deferred console takeover Currently fbcon claims fbdevs as soon as they are registered and takes over the console as soon as the first fbdev gets registered. This behavior is undesirable in cases where a smooth graphical bootup is desired, in such cases we typically want the contents of the framebuffer (typically a vendor logo) to stay in place as is. The current solution for this problem (on embedded systems) is to not enable fbcon. This commit adds a new FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER config option, which when enabled defers fbcon taking over the console from the dummy console until the first text is displayed on the console. Together with the "quiet" kernel commandline option, this allows fbcon to still be used together with a smooth graphical bootup, having it take over the console as soon as e.g. an error message is logged. Note the choice to detect the first console output in the dummycon driver, rather then handling this entirely inside the fbcon code, was made after 2 failed attempts to handle this entirely inside the fbcon code. The fbcon code is woven quite tightly into the console code, making this to only feasible option. Reviewed-by: Daniel Vetter Signed-off-by: Hans de Goede Signed-off-by: Bartlomiej Zolnierkiewicz --- Documentation/fb/fbcon.txt | 7 ++++ drivers/video/console/Kconfig | 11 ++++++ drivers/video/console/dummycon.c | 67 ++++++++++++++++++++++++++++++++----- drivers/video/fbdev/core/fbcon.c | 72 ++++++++++++++++++++++++++++++++++++++++ include/linux/console.h | 5 +++ 5 files changed, 154 insertions(+), 8 deletions(-) diff --git a/Documentation/fb/fbcon.txt b/Documentation/fb/fbcon.txt index 79c22d096bbc..d4d642e1ce9c 100644 --- a/Documentation/fb/fbcon.txt +++ b/Documentation/fb/fbcon.txt @@ -155,6 +155,13 @@ C. Boot options used by text. By default, this area will be black. The 'color' value is an integer number that depends on the framebuffer driver being used. +6. fbcon=nodefer + + If the kernel is compiled with deferred fbcon takeover support, normally + the framebuffer contents, left in place by the firmware/bootloader, will + be preserved until there actually is some text is output to the console. + This option causes fbcon to bind immediately to the fbdev device. + C. Attaching, Detaching and Unloading Before going on how to attach, detach and unload the framebuffer console, an diff --git a/drivers/video/console/Kconfig b/drivers/video/console/Kconfig index 4110ba7d7ca9..e91edef98633 100644 --- a/drivers/video/console/Kconfig +++ b/drivers/video/console/Kconfig @@ -150,6 +150,17 @@ config FRAMEBUFFER_CONSOLE_ROTATION such that other users of the framebuffer will remain normally oriented. +config FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER + bool "Framebuffer Console Deferred Takeover" + depends on FRAMEBUFFER_CONSOLE=y && DUMMY_CONSOLE=y + help + If enabled this defers the framebuffer console taking over the + console from the dummy console until the first text is displayed on + the console. This is useful in combination with the "quiet" kernel + commandline option to keep the framebuffer contents initially put up + by the firmware in place, rather then replacing the contents with a + black screen as soon as fbcon loads. + config STI_CONSOLE bool "STI text console" depends on PARISC && HAS_IOMEM diff --git a/drivers/video/console/dummycon.c b/drivers/video/console/dummycon.c index f2eafe2ed980..45ad925ad5f8 100644 --- a/drivers/video/console/dummycon.c +++ b/drivers/video/console/dummycon.c @@ -26,6 +26,65 @@ #define DUMMY_ROWS CONFIG_DUMMY_CONSOLE_ROWS #endif +#ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER +/* These are both protected by the console_lock */ +static RAW_NOTIFIER_HEAD(dummycon_output_nh); +static bool dummycon_putc_called; + +void dummycon_register_output_notifier(struct notifier_block *nb) +{ + raw_notifier_chain_register(&dummycon_output_nh, nb); + + if (dummycon_putc_called) + nb->notifier_call(nb, 0, NULL); +} + +void dummycon_unregister_output_notifier(struct notifier_block *nb) +{ + raw_notifier_chain_unregister(&dummycon_output_nh, nb); +} + +static void dummycon_putc(struct vc_data *vc, int c, int ypos, int xpos) +{ + dummycon_putc_called = true; + raw_notifier_call_chain(&dummycon_output_nh, 0, NULL); +} + +static void dummycon_putcs(struct vc_data *vc, const unsigned short *s, + int count, int ypos, int xpos) +{ + int i; + + if (!dummycon_putc_called) { + /* Ignore erases */ + for (i = 0 ; i < count; i++) { + if (s[i] != vc->vc_video_erase_char) + break; + } + if (i == count) + return; + + dummycon_putc_called = true; + } + + raw_notifier_call_chain(&dummycon_output_nh, 0, NULL); +} + +static int dummycon_blank(struct vc_data *vc, int blank, int mode_switch) +{ + /* Redraw, so that we get putc(s) for output done while blanked */ + return 1; +} +#else +static void dummycon_putc(struct vc_data *vc, int c, int ypos, int xpos) { } +static void dummycon_putcs(struct vc_data *vc, const unsigned short *s, + int count, int ypos, int xpos) { } +static int dummycon_blank(struct vc_data *vc, int blank, int mode_switch) +{ + return 0; +} +#endif + static const char *dummycon_startup(void) { return "dummy device"; @@ -44,9 +103,6 @@ static void dummycon_init(struct vc_data *vc, int init) static void dummycon_deinit(struct vc_data *vc) { } static void dummycon_clear(struct vc_data *vc, int sy, int sx, int height, int width) { } -static void dummycon_putc(struct vc_data *vc, int c, int ypos, int xpos) { } -static void dummycon_putcs(struct vc_data *vc, const unsigned short *s, - int count, int ypos, int xpos) { } static void dummycon_cursor(struct vc_data *vc, int mode) { } static bool dummycon_scroll(struct vc_data *vc, unsigned int top, @@ -61,11 +117,6 @@ static int dummycon_switch(struct vc_data *vc) return 0; } -static int dummycon_blank(struct vc_data *vc, int blank, int mode_switch) -{ - return 0; -} - static int dummycon_font_set(struct vc_data *vc, struct console_font *font, unsigned int flags) { diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index cd8d52a967aa..5fb156bdcf4e 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -129,6 +129,12 @@ static inline void fbcon_map_override(void) } #endif /* CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY */ +#ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER +static bool deferred_takeover = true; +#else +#define deferred_takeover false +#endif + /* font data */ static char fontname[40]; @@ -499,6 +505,12 @@ static int __init fb_console_setup(char *this_opt) margin_color = simple_strtoul(options, &options, 0); continue; } +#ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER + if (!strcmp(options, "nodefer")) { + deferred_takeover = false; + continue; + } +#endif } return 1; } @@ -3100,6 +3112,9 @@ static int fbcon_fb_unregistered(struct fb_info *info) WARN_CONSOLE_UNLOCKED(); + if (deferred_takeover) + return 0; + idx = info->node; for (i = first_fb_vc; i <= last_fb_vc; i++) { if (con2fb_map[i] == idx) @@ -3140,6 +3155,13 @@ static void fbcon_remap_all(int idx) WARN_CONSOLE_UNLOCKED(); + if (deferred_takeover) { + for (i = first_fb_vc; i <= last_fb_vc; i++) + con2fb_map_boot[i] = idx; + fbcon_map_override(); + return; + } + for (i = first_fb_vc; i <= last_fb_vc; i++) set_con2fb_map(i, idx, 0); @@ -3191,6 +3213,11 @@ static int fbcon_fb_registered(struct fb_info *info) idx = info->node; fbcon_select_primary(info); + if (deferred_takeover) { + pr_info("fbcon: Deferring console take-over\n"); + return 0; + } + if (info_idx == -1) { for (i = first_fb_vc; i <= last_fb_vc; i++) { if (con2fb_map_boot[i] == idx) { @@ -3566,8 +3593,46 @@ static int fbcon_init_device(void) return 0; } +#ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER +static struct notifier_block fbcon_output_nb; + +static int fbcon_output_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + int i; + + WARN_CONSOLE_UNLOCKED(); + + pr_info("fbcon: Taking over console\n"); + + dummycon_unregister_output_notifier(&fbcon_output_nb); + deferred_takeover = false; + logo_shown = FBCON_LOGO_DONTSHOW; + + for (i = 0; i < FB_MAX; i++) { + if (registered_fb[i]) + fbcon_fb_registered(registered_fb[i]); + } + + return NOTIFY_OK; +} + +static void fbcon_register_output_notifier(void) +{ + fbcon_output_nb.notifier_call = fbcon_output_notifier; + dummycon_register_output_notifier(&fbcon_output_nb); +} +#else +static inline void fbcon_register_output_notifier(void) {} +#endif + static void fbcon_start(void) { + if (deferred_takeover) { + fbcon_register_output_notifier(); + return; + } + if (num_registered_fb) { int i; @@ -3594,6 +3659,13 @@ static void fbcon_exit(void) if (fbcon_has_exited) return; +#ifdef CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER + if (deferred_takeover) { + dummycon_unregister_output_notifier(&fbcon_output_nb); + deferred_takeover = false; + } +#endif + kfree((void *)softback_buf); softback_buf = 0UL; diff --git a/include/linux/console.h b/include/linux/console.h index dfd6b0e97855..f59f3dbca65c 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -21,6 +21,7 @@ struct console_font_op; struct console_font; struct module; struct tty_struct; +struct notifier_block; /* * this is what the terminal answers to a ESC-Z or csi0c query. @@ -220,4 +221,8 @@ static inline bool vgacon_text_force(void) { return false; } extern void console_init(void); +/* For deferred console takeover */ +void dummycon_register_output_notifier(struct notifier_block *nb); +void dummycon_unregister_output_notifier(struct notifier_block *nb); + #endif /* _LINUX_CONSOLE_H */ -- cgit v1.2.3