From ae1ff3d623905947158fd3394854c23026337810 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 13 Jul 2015 14:31:28 +0300 Subject: iommu: iova: Move iova cache management to the iova library This is necessary to separate intel-iommu from the iova library. Signed-off-by: Sakari Ailus Signed-off-by: David Woodhouse --- include/linux/iova.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iova.h b/include/linux/iova.h index 3920a19d8194..92f7177db2ce 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -68,8 +68,8 @@ static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova) return iova >> iova_shift(iovad); } -int iommu_iova_cache_init(void); -void iommu_iova_cache_destroy(void); +int iova_cache_get(void); +void iova_cache_put(void); struct iova *alloc_iova_mem(void); void free_iova_mem(struct iova *iova); -- cgit v1.2.3 From 30035e45753b708e7d47a98398500ca005e02b86 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Wed, 29 Apr 2015 12:52:04 -0400 Subject: string: provide strscpy() The strscpy() API is intended to be used instead of strlcpy(), and instead of most uses of strncpy(). - Unlike strlcpy(), it doesn't read from memory beyond (src + size). - Unlike strlcpy() or strncpy(), the API provides an easy way to check for destination buffer overflow: an -E2BIG error return value. - The provided implementation is robust in the face of the source buffer being asynchronously changed during the copy, unlike the current implementation of strlcpy(). - Unlike strncpy(), the destination buffer will be NUL-terminated if the string in the source buffer is too long. - Also unlike strncpy(), the destination buffer will not be updated beyond the NUL termination, avoiding strncpy's behavior of zeroing the entire tail end of the destination buffer. (A memset() after the strscpy() can be used if this behavior is desired.) 
- The implementation should be reasonably performant on all platforms since it uses the asm/word-at-a-time.h API rather than simple byte copy. Kernel-to-kernel string copy is not considered to be performance critical in any case. Signed-off-by: Chris Metcalf --- include/linux/string.h | 3 ++ lib/string.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index a8d90db9c4b0..9ef7795e65e4 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -25,6 +25,9 @@ extern char * strncpy(char *,const char *, __kernel_size_t); #ifndef __HAVE_ARCH_STRLCPY size_t strlcpy(char *, const char *, size_t); #endif +#ifndef __HAVE_ARCH_STRSCPY +ssize_t __must_check strscpy(char *, const char *, size_t); +#endif #ifndef __HAVE_ARCH_STRCAT extern char * strcat(char *, const char *); #endif diff --git a/lib/string.c b/lib/string.c index 13d1e84ddb80..8dbb7b1eab50 100644 --- a/lib/string.c +++ b/lib/string.c @@ -27,6 +27,10 @@ #include #include +#include +#include +#include + #ifndef __HAVE_ARCH_STRNCASECMP /** * strncasecmp - Case insensitive, length-limited string comparison @@ -146,6 +150,90 @@ size_t strlcpy(char *dest, const char *src, size_t size) EXPORT_SYMBOL(strlcpy); #endif +#ifndef __HAVE_ARCH_STRSCPY +/** + * strscpy - Copy a C-string into a sized buffer + * @dest: Where to copy the string to + * @src: Where to copy the string from + * @count: Size of destination buffer + * + * Copy the string, or as much of it as fits, into the dest buffer. + * The routine returns the number of characters copied (not including + * the trailing NUL) or -E2BIG if the destination buffer wasn't big enough. + * The behavior is undefined if the string buffers overlap. + * The destination buffer is always NUL terminated, unless it's zero-sized. 
+ * + * Preferred to strlcpy() since the API doesn't require reading memory + * from the src string beyond the specified "count" bytes, and since + * the return value is easier to error-check than strlcpy()'s. + * In addition, the implementation is robust to the string changing out + * from underneath it, unlike the current strlcpy() implementation. + * + * Preferred to strncpy() since it always returns a valid string, and + * doesn't unnecessarily force the tail of the destination buffer to be + * zeroed. If the zeroing is desired, it's likely cleaner to use strscpy() + * with an overflow test, then just memset() the tail of the dest buffer. + */ +ssize_t strscpy(char *dest, const char *src, size_t count) +{ + const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; + size_t max = count; + long res = 0; + + if (count == 0) + return -E2BIG; + +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + /* + * If src is unaligned, don't cross a page boundary, + * since we don't know if the next page is mapped. + */ + if ((long)src & (sizeof(long) - 1)) { + size_t limit = PAGE_SIZE - ((long)src & (PAGE_SIZE - 1)); + if (limit < max) + max = limit; + } +#else + /* If src or dest is unaligned, don't do word-at-a-time. */ + if (((long) dest | (long) src) & (sizeof(long) - 1)) + max = 0; +#endif + + while (max >= sizeof(unsigned long)) { + unsigned long c, data; + + c = *(unsigned long *)(src+res); + *(unsigned long *)(dest+res) = c; + if (has_zero(c, &data, &constants)) { + data = prep_zero_mask(c, data, &constants); + data = create_zero_mask(data); + return res + find_zero(data); + } + res += sizeof(unsigned long); + count -= sizeof(unsigned long); + max -= sizeof(unsigned long); + } + + while (count) { + char c; + + c = src[res]; + dest[res] = c; + if (!c) + return res; + res++; + count--; + } + + /* Hit buffer length without finding a NUL; force NUL-termination. 
*/ + if (res) + dest[res-1] = '\0'; + + return -E2BIG; +} +EXPORT_SYMBOL(strscpy); +#endif + #ifndef __HAVE_ARCH_STRCAT /** * strcat - Append one %NUL-terminated string to another -- cgit v1.2.3 From 66e8c57da6bf6b847a48a5a6fda59512f733ed78 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 25 Aug 2015 20:45:18 +0200 Subject: rcu: Change _wait_rcu_gp() to work around GCC bug 67055 Code like this in inline functions confuses some recent versions of gcc: const int n = const-expr; whatever_t array[n]; For more details, see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67055#c13 This compiler bug results in the following failure after 114b7fd4b (rcu: Create rcu_sync infrastructure): In file included from include/linux/rcupdate.h:429:0, from include/linux/rcu_sync.h:5, from kernel/rcu/sync.c:1: include/linux/rcutiny.h: In function 'rcu_barrier_sched': include/linux/rcutiny.h:55:20: internal compiler error: Segmentation fault static inline void rcu_barrier_sched(void) This commit therefore eliminates the constant local variable in favor of direct use of the expression. Reported-and-tested-by: Mark Salter Reported-by: Guenter Roeck Signed-off-by: Oleg Nesterov Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index ff476515f716..581abf848566 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -230,12 +230,11 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, struct rcu_synchronize *rs_array); #define _wait_rcu_gp(checktiny, ...) 
\ -do { \ - call_rcu_func_t __crcu_array[] = { __VA_ARGS__ }; \ - const int __n = ARRAY_SIZE(__crcu_array); \ - struct rcu_synchronize __rs_array[__n]; \ - \ - __wait_rcu_gp(checktiny, __n, __crcu_array, __rs_array); \ +do { \ + call_rcu_func_t __crcu_array[] = { __VA_ARGS__ }; \ + struct rcu_synchronize __rs_array[ARRAY_SIZE(__crcu_array)]; \ + __wait_rcu_gp(checktiny, ARRAY_SIZE(__crcu_array), \ + __crcu_array, __rs_array); \ } while (0) #define wait_rcu_gp(...) _wait_rcu_gp(false, __VA_ARGS__) -- cgit v1.2.3 From c6790aa9f4fdc26b1246ba36da2fd749663beb65 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 24 Sep 2015 10:34:23 +0300 Subject: IB/mlx5: Remove support for IB_DEVICE_LOCAL_DMA_LKEY Commit 96249d70dd70 ("IB/core: Guarantee that a local_dma_lkey is available") allows ULPs that make use of the local dma key to keep working as before by allocating a DMA MR with local permissions and converted these consumers to use the MR associated with the PD rather than device->local_dma_lkey. ConnectIB has some known issues with memory registration using the local_dma_lkey (SEND, RDMA, RECV seem to work ok). Thus don't expose support for it (remove device->local_dma_lkey setting), and take advantage of the above commit such that no regression is introduced to working systems. The local_dma_lkey support will be restored in CX4 depending on FW capability query.
Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 10 +--------- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 22 ---------------------- include/linux/mlx5/device.h | 11 ----------- include/linux/mlx5/driver.h | 1 - 4 files changed, 1 insertion(+), 43 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 41d6911e244e..0ab9625911a1 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -245,7 +245,6 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; if (MLX5_CAP_GEN(mdev, apm)) props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; - props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; if (MLX5_CAP_GEN(mdev, xrc)) props->device_cap_flags |= IB_DEVICE_XRC; props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; @@ -1245,18 +1244,10 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) struct ib_srq_init_attr attr; struct mlx5_ib_dev *dev; struct ib_cq_init_attr cq_attr = {.cqe = 1}; - u32 rsvd_lkey; int ret = 0; dev = container_of(devr, struct mlx5_ib_dev, devr); - ret = mlx5_core_query_special_context(dev->mdev, &rsvd_lkey); - if (ret) { - pr_err("Failed to query special context %d\n", ret); - return ret; - } - dev->ib_dev.local_dma_lkey = rsvd_lkey; - devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL); if (IS_ERR(devr->p0)) { ret = PTR_ERR(devr->p0); @@ -1418,6 +1409,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX); dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.node_type = RDMA_NODE_IB_CA; + dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; dev->num_ports = MLX5_CAP_GEN(mdev, num_ports); dev->ib_dev.phys_port_cnt = dev->num_ports; dev->ib_dev.num_comp_vectors = diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c 
b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index aa0d5ffe92d8..9335e5ae18cc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -200,25 +200,3 @@ int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev) return err; } - -int mlx5_core_query_special_context(struct mlx5_core_dev *dev, u32 *rsvd_lkey) -{ - struct mlx5_cmd_query_special_contexts_mbox_in in; - struct mlx5_cmd_query_special_contexts_mbox_out out; - int err; - - memset(&in, 0, sizeof(in)); - memset(&out, 0, sizeof(out)); - in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); - err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); - if (err) - return err; - - if (out.hdr.status) - err = mlx5_cmd_status_to_err(&out.hdr); - - *rsvd_lkey = be32_to_cpu(out.resd_lkey); - - return err; -} -EXPORT_SYMBOL(mlx5_core_query_special_context); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 8eb3b19af2a4..250b1ff8b48d 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -402,17 +402,6 @@ struct mlx5_cmd_teardown_hca_mbox_out { u8 rsvd[8]; }; -struct mlx5_cmd_query_special_contexts_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_cmd_query_special_contexts_mbox_out { - struct mlx5_outbox_hdr hdr; - __be32 dump_fill_mkey; - __be32 resd_lkey; -}; - struct mlx5_cmd_layout { u8 type; u8 rsvd0[3]; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 27b53f9a24ad..8b6d6f2154a4 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -845,7 +845,6 @@ void *mlx5_get_protocol_dev(struct mlx5_core_dev *mdev, int protocol); int mlx5_register_interface(struct mlx5_interface *intf); void mlx5_unregister_interface(struct mlx5_interface *intf); int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id); -int mlx5_core_query_special_context(struct mlx5_core_dev *dev, u32 *rsvd_lkey); struct mlx5_profile { u64 mask; -- cgit 
v1.2.3 From 5ebc76035303016ec41bb752bec156ea9fde7c34 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 17 Sep 2015 14:02:45 +0800 Subject: ACPI, PCI, irq: Do not share PCI IRQ with ISA IRQ Avoid IRQs occupied by ISA IRQs when allocating IRQs for PCI link devices, otherwise it may cause an interrupt storm due to incompatible pin attributes. This issue was triggered on a KVM virtual machine, which 1) uses IRQ9 for SCI in high level mode. 2) defines a PCI interrupt link device (LNKS) with IRQ9 as the only possible irq. 3) has a PCI device referring to link device LNKS. So it causes an interrupt storm when enabling the PCI device because PCI IRQ works in low level mode. Signed-off-by: Jiang Liu Acked-by: Bjorn Helgaas Signed-off-by: Rafael J. Wysocki --- drivers/acpi/pci_irq.c | 1 + drivers/acpi/pci_link.c | 13 +++++++++++++ include/linux/acpi.h | 1 + 3 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index 6da0f9beab19..c9336751e5e3 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -372,6 +372,7 @@ static int acpi_isa_register_gsi(struct pci_dev *dev) /* Interrupt Line values above 0xF are forbidden */ if (dev->irq > 0 && (dev->irq <= 0xF) && + acpi_isa_irq_available(dev->irq) && (acpi_isa_irq_to_gsi(dev->irq, &dev_gsi) == 0)) { dev_warn(&dev->dev, "PCI INT %c: no GSI - using ISA IRQ %d\n", pin_name(dev->pin), dev->irq); diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c index 3b4ea98e3ea0..246e50d22120 100644 --- a/drivers/acpi/pci_link.c +++ b/drivers/acpi/pci_link.c @@ -553,6 +553,13 @@ static int acpi_pci_link_allocate(struct acpi_pci_link *link) irq = link->irq.possible[i]; } } + if (acpi_irq_penalty[irq] >= PIRQ_PENALTY_ISA_ALWAYS) { + printk(KERN_ERR PREFIX "No IRQ available for %s [%s]. " + "Try pci=noacpi or acpi=off\n", + acpi_device_name(link->device), + acpi_device_bid(link->device)); + return -ENODEV; + } /* Attempt to enable the link device at this IRQ.
*/ if (acpi_pci_link_set(link, irq)) { @@ -821,6 +828,12 @@ void acpi_penalize_isa_irq(int irq, int active) } } +bool acpi_isa_irq_available(int irq) +{ + return irq >= 0 && (irq >= ARRAY_SIZE(acpi_irq_penalty) || + acpi_irq_penalty[irq] < PIRQ_PENALTY_ISA_ALWAYS); +} + /* * Penalize IRQ used by ACPI SCI. If ACPI SCI pin attributes conflict with * PCI IRQ attributes, mark ACPI SCI as ISA_ALWAYS so it won't be use for diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 7235c4851460..43856d19cf4d 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -217,6 +217,7 @@ struct pci_dev; int acpi_pci_irq_enable (struct pci_dev *dev); void acpi_penalize_isa_irq(int irq, int active); +bool acpi_isa_irq_available(int irq); void acpi_penalize_sci_irq(int irq, int trigger, int polarity); void acpi_pci_irq_disable (struct pci_dev *dev); -- cgit v1.2.3 From 4593fdbe7a2f44d5e64c627c715dd0bcec9bdf14 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 27 Sep 2015 02:09:20 +0900 Subject: blk-mq: fix sysfs registration/unregistration race There is a race between cpu hotplug handling and adding/deleting gendisk for blk-mq, where both are trying to register and unregister the same sysfs entries. null_add_dev --> blk_mq_init_queue --> blk_mq_init_allocated_queue --> add to 'all_q_list' (*) --> add_disk --> blk_register_queue --> blk_mq_register_disk (++) null_del_dev --> del_gendisk --> blk_unregister_queue --> blk_mq_unregister_disk (--) --> blk_cleanup_queue --> blk_mq_free_queue --> del from 'all_q_list' (*) blk_mq_queue_reinit --> blk_mq_sysfs_unregister (-) --> blk_mq_sysfs_register (+) While the request queue is on 'all_q_list' (*), blk_mq_queue_reinit() can be called for the queue anytime by CPU hotplug callback. But blk_mq_sysfs_unregister (-) and blk_mq_sysfs_register (+) in blk_mq_queue_reinit must not be called before blk_mq_register_disk (++) is finished or after blk_mq_unregister_disk (--), because '/sys/block/*/mq/' does not exist at those times.
There has already been BLK_MQ_F_SYSFS_UP flag in hctx->flags which can be used to track these sysfs stuff, but it is only fixing this issue partially. In order to fix it completely, we just need per-queue flag instead of per-hctx flag with appropriate locking. So this introduces q->mq_sysfs_init_done which is properly protected with all_q_mutex. Also, we need to ensure that blk_mq_map_swqueue() is called with all_q_mutex is held. Since hctx->nr_ctx is reset temporarily and updated in blk_mq_map_swqueue(), so we should avoid blk_mq_register_hctx() seeing the temporary hctx->nr_ctx value in CPU hotplug handling or adding/deleting gendisk . Signed-off-by: Akinobu Mita Reviewed-by: Ming Lei Cc: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 30 ++++++++++++++++++++++-------- block/blk-mq.c | 6 +++--- include/linux/blk-mq.h | 1 - include/linux/blkdev.h | 2 ++ 4 files changed, 27 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 279c5d674edf..189f5ae6522a 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -343,7 +343,7 @@ static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx) struct blk_mq_ctx *ctx; int i; - if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP)) + if (!hctx->nr_ctx) return; hctx_for_each_ctx(hctx, ctx, i) @@ -358,7 +358,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) struct blk_mq_ctx *ctx; int i, ret; - if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP)) + if (!hctx->nr_ctx) return 0; ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num); @@ -381,6 +381,8 @@ void blk_mq_unregister_disk(struct gendisk *disk) struct blk_mq_ctx *ctx; int i, j; + blk_mq_disable_hotplug(); + queue_for_each_hw_ctx(q, hctx, i) { blk_mq_unregister_hctx(hctx); @@ -395,6 +397,9 @@ void blk_mq_unregister_disk(struct gendisk *disk) kobject_put(&q->mq_kobj); kobject_put(&disk_to_dev(disk)->kobj); + + 
q->mq_sysfs_init_done = false; + blk_mq_enable_hotplug(); } static void blk_mq_sysfs_init(struct request_queue *q) @@ -425,27 +430,30 @@ int blk_mq_register_disk(struct gendisk *disk) struct blk_mq_hw_ctx *hctx; int ret, i; + blk_mq_disable_hotplug(); + blk_mq_sysfs_init(q); ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); if (ret < 0) - return ret; + goto out; kobject_uevent(&q->mq_kobj, KOBJ_ADD); queue_for_each_hw_ctx(q, hctx, i) { - hctx->flags |= BLK_MQ_F_SYSFS_UP; ret = blk_mq_register_hctx(hctx); if (ret) break; } - if (ret) { + if (ret) blk_mq_unregister_disk(disk); - return ret; - } + else + q->mq_sysfs_init_done = true; +out: + blk_mq_enable_hotplug(); - return 0; + return ret; } EXPORT_SYMBOL_GPL(blk_mq_register_disk); @@ -454,6 +462,9 @@ void blk_mq_sysfs_unregister(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; + if (!q->mq_sysfs_init_done) + return; + queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); } @@ -463,6 +474,9 @@ int blk_mq_sysfs_register(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i, ret = 0; + if (!q->mq_sysfs_init_done) + return ret; + queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) diff --git a/block/blk-mq.c b/block/blk-mq.c index 2fd7283ec62b..0262131ac5f2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2035,13 +2035,13 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, goto err_hctxs; mutex_lock(&all_q_mutex); - list_add_tail(&q->all_q_node, &all_q_list); - mutex_unlock(&all_q_mutex); + list_add_tail(&q->all_q_node, &all_q_list); blk_mq_add_queue_tag_set(set, q); - blk_mq_map_swqueue(q); + mutex_unlock(&all_q_mutex); + return q; err_hctxs: diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 37d1602c4f7a..b80ba4572a31 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -145,7 +145,6 @@ enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_SHARED = 1 << 1, BLK_MQ_F_SG_MERGE = 1 << 2, - 
BLK_MQ_F_SYSFS_UP = 1 << 3, BLK_MQ_F_DEFER_ISSUE = 1 << 4, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, BLK_MQ_F_ALLOC_POLICY_BITS = 1, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99da9ebc7377..19c2e947d4d1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -456,6 +456,8 @@ struct request_queue { struct blk_mq_tag_set *tag_set; struct list_head tag_set_list; struct bio_set *bio_split; + + bool mq_sysfs_init_done; }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ -- cgit v1.2.3 From 31b33dfb0a144469dd805514c9e63f4993729a48 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 28 Sep 2015 17:24:25 -0700 Subject: skbuff: Fix skb checksum partial check. Earlier patch 6ae459bda tried to detect void checksum partial skb by comparing pull length to checksum offset. But it does not work for all cases since checksum-offset depends on updates to skb->data. Following patch fixes it by validating checksum start offset after skb->data pointer is updated. Negative value of checksum offset start means there is no need to checksum. Fixes: 6ae459bda ("skbuff: Fix skb checksum flag on skb pull") Reported-by: Andrew Vagin Signed-off-by: Pravin B Shelar Signed-off-by: David S.
Miller --- include/linux/skbuff.h | 2 +- net/core/skbuff.c | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2b0a30a6e31c..4398411236f1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2708,7 +2708,7 @@ static inline void skb_postpull_rcsum(struct sk_buff *skb, if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_sub(skb->csum, csum_partial(start, len, 0)); else if (skb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_start_offset(skb) <= len) + skb_checksum_start_offset(skb) < 0) skb->ip_summed = CHECKSUM_NONE; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index dad4dd37e2aa..fab4599ba8b2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2958,11 +2958,12 @@ EXPORT_SYMBOL_GPL(skb_append_pagefrags); */ unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) { + unsigned char *data = skb->data; + BUG_ON(len > skb->len); - skb->len -= len; - BUG_ON(skb->len < skb->data_len); - skb_postpull_rcsum(skb, skb->data, len); - return skb->data += len; + __skb_pull(skb, len); + skb_postpull_rcsum(skb, data, len); + return skb->data; } EXPORT_SYMBOL_GPL(skb_pull_rcsum); -- cgit v1.2.3 From 9ae7ce00cc1353155b1914bfc40e8362efef7d1c Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Tue, 29 Sep 2015 18:21:18 +0900 Subject: usb: renesas_usbhs: fix build warning if 64-bit architecture This patch fixes the following warning if 64-bit architecture environment: ./drivers/usb/renesas_usbhs/common.c:496:25: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] dparam->type = of_id ? 
(u32)of_id->data : 0; Acked-by: Geert Uytterhoeven Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi --- drivers/usb/renesas_usbhs/common.c | 2 +- include/linux/usb/renesas_usbhs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/renesas_usbhs/common.c b/drivers/usb/renesas_usbhs/common.c index 7b98e1d9194c..0ce398c5482e 100644 --- a/drivers/usb/renesas_usbhs/common.c +++ b/drivers/usb/renesas_usbhs/common.c @@ -493,7 +493,7 @@ static struct renesas_usbhs_platform_info *usbhs_parse_dt(struct device *dev) return NULL; dparam = &info->driver_param; - dparam->type = of_id ? (u32)of_id->data : 0; + dparam->type = of_id ? (uintptr_t)of_id->data : 0; if (!of_property_read_u32(dev->of_node, "renesas,buswait", &tmp)) dparam->buswait_bwait = tmp; gpio = of_get_named_gpio_flags(dev->of_node, "renesas,enable-gpio", 0, diff --git a/include/linux/usb/renesas_usbhs.h b/include/linux/usb/renesas_usbhs.h index 3dd5a781da99..bfb74723f151 100644 --- a/include/linux/usb/renesas_usbhs.h +++ b/include/linux/usb/renesas_usbhs.h @@ -157,7 +157,7 @@ struct renesas_usbhs_driver_param { */ int pio_dma_border; /* default is 64byte */ - u32 type; + uintptr_t type; u32 enable_gpio; /* -- cgit v1.2.3 From f4829a9b7a61e159367350008a608b062c4f6840 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 27 Sep 2015 21:01:50 +0200 Subject: blk-mq: fix racy updates of rq->errors blk_mq_complete_request may be a no-op if the request has already been completed by other means (e.g. a timeout or cancellation), but currently drivers have to set rq->errors before calling blk_mq_complete_request, which might leave us with the wrong error value. Add an error parameter to blk_mq_complete_request so that we can defer setting rq->errors until we know we won the race to complete the request.
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 ++++++------ drivers/block/loop.c | 11 +++++------ drivers/block/null_blk.c | 2 +- drivers/block/nvme-core.c | 16 +++++++--------- drivers/block/virtio_blk.c | 2 +- drivers/block/xen-blkfront.c | 19 ++++++++++--------- drivers/scsi/scsi_lib.c | 2 +- include/linux/blk-mq.h | 2 +- 8 files changed, 32 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index 31c0c6259c4c..2306330530e8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -393,14 +393,16 @@ void __blk_mq_complete_request(struct request *rq) * Ends all I/O on a request. It does not handle partial completions. * The actual completion happens out-of-order, through a IPI handler. **/ -void blk_mq_complete_request(struct request *rq) +void blk_mq_complete_request(struct request *rq, int error) { struct request_queue *q = rq->q; if (unlikely(blk_should_fake_timeout(q))) return; - if (!blk_mark_rq_complete(rq)) + if (!blk_mark_rq_complete(rq)) { + rq->errors = error; __blk_mq_complete_request(rq); + } } EXPORT_SYMBOL(blk_mq_complete_request); @@ -616,10 +618,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * If a request wasn't started before the queue was * marked dying, kill it here or it'll go unnoticed. 
*/ - if (unlikely(blk_queue_dying(rq->q))) { - rq->errors = -EIO; - blk_mq_complete_request(rq); - } + if (unlikely(blk_queue_dying(rq->q))) + blk_mq_complete_request(rq, -EIO); return; } if (rq->cmd_flags & REQ_NO_TIMEOUT) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f9889b6bc02c..674f800a3b57 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1486,17 +1486,16 @@ static void loop_handle_cmd(struct loop_cmd *cmd) { const bool write = cmd->rq->cmd_flags & REQ_WRITE; struct loop_device *lo = cmd->rq->q->queuedata; - int ret = -EIO; + int ret = 0; - if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) + if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) { + ret = -EIO; goto failed; + } ret = do_req_filebacked(lo, cmd->rq); - failed: - if (ret) - cmd->rq->errors = -EIO; - blk_mq_complete_request(cmd->rq); + blk_mq_complete_request(cmd->rq, ret ? -EIO : 0); } static void loop_queue_write_work(struct work_struct *work) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index a295b98c6bae..1c9e4fe5aa44 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -289,7 +289,7 @@ static inline void null_handle_cmd(struct nullb_cmd *cmd) case NULL_IRQ_SOFTIRQ: switch (queue_mode) { case NULL_Q_MQ: - blk_mq_complete_request(cmd->rq); + blk_mq_complete_request(cmd->rq, cmd->rq->errors); break; case NULL_Q_RQ: blk_complete_request(cmd->rq); diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 30758bdf69ea..6f04771f1019 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -618,16 +618,15 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, spin_unlock_irqrestore(req->q->queue_lock, flags); return; } + if (req->cmd_type == REQ_TYPE_DRV_PRIV) { if (cmd_rq->ctx == CMD_CTX_CANCELLED) - req->errors = -EINTR; - else - req->errors = status; + status = -EINTR; } else { - req->errors = nvme_error_status(status); + status = nvme_error_status(status); } - } else - req->errors = 0; + 
} + if (req->cmd_type == REQ_TYPE_DRV_PRIV) { u32 result = le32_to_cpup(&cqe->result); req->special = (void *)(uintptr_t)result; @@ -650,7 +649,7 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, } nvme_free_iod(nvmeq->dev, iod); - blk_mq_complete_request(req); + blk_mq_complete_request(req, status); } /* length is in bytes. gfp flags indicates whether we may sleep. */ @@ -863,8 +862,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (ns && ns->ms && !blk_integrity_rq(req)) { if (!(ns->pi_type && ns->ms == 8) && req->cmd_type != REQ_TYPE_DRV_PRIV) { - req->errors = -EFAULT; - blk_mq_complete_request(req); + blk_mq_complete_request(req, -EFAULT); return BLK_MQ_RQ_QUEUE_OK; } } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index e93899cc6f60..6ca35495a5be 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -144,7 +144,7 @@ static void virtblk_done(struct virtqueue *vq) do { virtqueue_disable_cb(vq); while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) { - blk_mq_complete_request(vbr->req); + blk_mq_complete_request(vbr->req, vbr->req->errors); req_done = true; } if (unlikely(virtqueue_is_broken(vq))) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 0823a96902f8..611170896b8c 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1142,6 +1142,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) RING_IDX i, rp; unsigned long flags; struct blkfront_info *info = (struct blkfront_info *)dev_id; + int error; spin_lock_irqsave(&info->io_lock, flags); @@ -1182,37 +1183,37 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) continue; } - req->errors = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO; + error = (bret->status == BLKIF_RSP_OKAY) ? 
0 : -EIO; switch (bret->operation) { case BLKIF_OP_DISCARD: if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { struct request_queue *rq = info->rq; printk(KERN_WARNING "blkfront: %s: %s op failed\n", info->gd->disk_name, op_name(bret->operation)); - req->errors = -EOPNOTSUPP; + error = -EOPNOTSUPP; info->feature_discard = 0; info->feature_secdiscard = 0; queue_flag_clear(QUEUE_FLAG_DISCARD, rq); queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq); } - blk_mq_complete_request(req); + blk_mq_complete_request(req, error); break; case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_WRITE_BARRIER: if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { printk(KERN_WARNING "blkfront: %s: %s op failed\n", info->gd->disk_name, op_name(bret->operation)); - req->errors = -EOPNOTSUPP; + error = -EOPNOTSUPP; } if (unlikely(bret->status == BLKIF_RSP_ERROR && info->shadow[id].req.u.rw.nr_segments == 0)) { printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", info->gd->disk_name, op_name(bret->operation)); - req->errors = -EOPNOTSUPP; + error = -EOPNOTSUPP; } - if (unlikely(req->errors)) { - if (req->errors == -EOPNOTSUPP) - req->errors = 0; + if (unlikely(error)) { + if (error == -EOPNOTSUPP) + error = 0; info->feature_flush = 0; xlvbd_flush(info); } @@ -1223,7 +1224,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) dev_dbg(&info->xbdev->dev, "Bad return from blkdev data " "request: %x\n", bret->status); - blk_mq_complete_request(req); + blk_mq_complete_request(req, error); break; default: BUG(); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index cbfc5990052b..126a48c6431e 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1957,7 +1957,7 @@ static int scsi_mq_prep_fn(struct request *req) static void scsi_mq_done(struct scsi_cmnd *cmd) { trace_scsi_dispatch_cmd_done(cmd); - blk_mq_complete_request(cmd->request); + blk_mq_complete_request(cmd->request, cmd->request->errors); } static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, diff --git 
a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b80ba4572a31..c1b5c867ff07 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -214,7 +214,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head); void blk_mq_cancel_requeue_work(struct request_queue *q); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_abort_requeue_list(struct request_queue *q); -void blk_mq_complete_request(struct request *rq); +void blk_mq_complete_request(struct request *rq, int error); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); -- cgit v1.2.3 From 0bf6cd5b9531bcc29c0a5e504b6ce2984c6fd8d8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 27 Sep 2015 21:01:51 +0200 Subject: blk-mq: factor out a helper to iterate all tags for a request_queue And replace the blk_mq_tag_busy_iter with it - the driver use has been replaced with a new helper a while ago, and internal to the block we only need the new version. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 27 ++++++++++++++++++++------- block/blk-mq-tag.h | 2 ++ block/blk-mq.c | 14 +++----------- include/linux/blk-mq.h | 2 -- 4 files changed, 25 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 9115c6d59948..ed96474d75cb 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -471,17 +471,30 @@ void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, } EXPORT_SYMBOL(blk_mq_all_tag_busy_iter); -void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv) { - struct blk_mq_tags *tags = hctx->tags; + struct blk_mq_hw_ctx *hctx; + int i; + + + queue_for_each_hw_ctx(q, hctx, i) { + struct blk_mq_tags *tags = hctx->tags; + + /* + * If not software queues are currently mapped to this + * hardware queue, there's nothing to check + */ + if (!blk_mq_hw_queue_mapped(hctx)) + continue; + + if (tags->nr_reserved_tags) + bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true); + bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, + false); + } - if (tags->nr_reserved_tags) - bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true); - bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, - false); } -EXPORT_SYMBOL(blk_mq_tag_busy_iter); static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) { diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 9eb2cf4f01cb..d468a79f2c4a 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -58,6 +58,8 @@ extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, 
bool); +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, + void *priv); enum { BLK_MQ_TAG_CACHE_MIN = 1, diff --git a/block/blk-mq.c b/block/blk-mq.c index 2306330530e8..7785ae96267a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -641,24 +641,16 @@ static void blk_mq_rq_timer(unsigned long priv) .next = 0, .next_set = 0, }; - struct blk_mq_hw_ctx *hctx; int i; - queue_for_each_hw_ctx(q, hctx, i) { - /* - * If not software queues are currently mapped to this - * hardware queue, there's nothing to check - */ - if (!blk_mq_hw_queue_mapped(hctx)) - continue; - - blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data); - } + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); if (data.next_set) { data.next = blk_rq_timeout(round_jiffies_up(data.next)); mod_timer(&q->timeout, data.next); } else { + struct blk_mq_hw_ctx *hctx; + queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c1b5c867ff07..5e7d43ab61c0 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -223,8 +223,6 @@ void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); -void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, - void *priv); void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv); void blk_mq_freeze_queue(struct request_queue *q); -- cgit v1.2.3 From 0610c25daa3e76e38ad5a8fae683a89ff9f71798 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 1 Oct 2015 15:37:02 -0700 Subject: memcg: fix dirty page migration The problem starts with a file backed dirty page which is charged to a memcg. Then page migration is used to move oldpage to newpage. 
Migration: - copies the oldpage's data to newpage - clears oldpage.PG_dirty - sets newpage.PG_dirty - uncharges oldpage from memcg - charges newpage to memcg Clearing oldpage.PG_dirty decrements the charged memcg's dirty page count. However, because newpage is not yet charged, setting newpage.PG_dirty does not increment the memcg's dirty page count. After migration completes, newpage.PG_dirty is eventually cleared, often in account_page_cleaned(). At this time newpage is charged to a memcg, so the memcg's dirty page count is decremented, which causes underflow because the count was not previously incremented by migration. This underflow causes balance_dirty_pages() to see a very large unsigned number of dirty memcg pages, which leads to aggressive throttling of buffered writes by processes in non-root memcgs. This issue: - can harm performance of non-root memcg buffered writes. - can report too small (even negative) values in memory.stat[(total_)dirty] counters of all memcgs, including the root. To avoid polluting migrate.c with #ifdef CONFIG_MEMCG checks, introduce page_memcg() and set_page_memcg() helpers.
Test: 0) setup and enter limited memcg mkdir /sys/fs/cgroup/test echo 1G > /sys/fs/cgroup/test/memory.limit_in_bytes echo $$ > /sys/fs/cgroup/test/cgroup.procs 1) buffered writes baseline dd if=/dev/zero of=/data/tmp/foo bs=1M count=1k sync grep ^dirty /sys/fs/cgroup/test/memory.stat 2) buffered writes with compaction antagonist to induce migration yes 1 > /proc/sys/vm/compact_memory & rm -rf /data/tmp/foo dd if=/dev/zero of=/data/tmp/foo bs=1M count=1k kill % sync grep ^dirty /sys/fs/cgroup/test/memory.stat 3) buffered writes without antagonist, should match baseline rm -rf /data/tmp/foo dd if=/dev/zero of=/data/tmp/foo bs=1M count=1k sync grep ^dirty /sys/fs/cgroup/test/memory.stat (speed, dirty residue) unpatched patched 1) 841 MB/s 0 dirty pages 886 MB/s 0 dirty pages 2) 611 MB/s -33427456 dirty pages 793 MB/s 0 dirty pages 3) 114 MB/s -33427456 dirty pages 891 MB/s 0 dirty pages Notice that unpatched baseline performance (1) fell after migration (3): 841 -> 114 MB/s. In the patched kernel, post migration performance matches baseline. 
Fixes: c4843a7593a9 ("memcg: add per cgroup dirty page accounting") Signed-off-by: Greg Thelen Reported-by: Dave Hansen Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: [4.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 21 +++++++++++++++++++++ mm/migrate.c | 12 +++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 91c08f6f0dc9..80001de019ba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -905,6 +905,27 @@ static inline void set_page_links(struct page *page, enum zone_type zone, #endif } +#ifdef CONFIG_MEMCG +static inline struct mem_cgroup *page_memcg(struct page *page) +{ + return page->mem_cgroup; +} + +static inline void set_page_memcg(struct page *page, struct mem_cgroup *memcg) +{ + page->mem_cgroup = memcg; +} +#else +static inline struct mem_cgroup *page_memcg(struct page *page) +{ + return NULL; +} + +static inline void set_page_memcg(struct page *page, struct mem_cgroup *memcg) +{ +} +#endif + /* * Some inline functions in vmstat.h depend on page_zone() */ diff --git a/mm/migrate.c b/mm/migrate.c index 7452a00bbb50..842ecd7aaf7f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -740,6 +740,15 @@ static int move_to_new_page(struct page *newpage, struct page *page, if (PageSwapBacked(page)) SetPageSwapBacked(newpage); + /* + * Indirectly called below, migrate_page_copy() copies PG_dirty and thus + * needs newpage's memcg set to transfer memcg dirty page accounting. + * So perform memcg migration in two steps: + * 1. set newpage->mem_cgroup (here) + * 2. 
clear page->mem_cgroup (below) + */ + set_page_memcg(newpage, page_memcg(page)); + mapping = page_mapping(page); if (!mapping) rc = migrate_page(mapping, newpage, page, mode); @@ -756,9 +765,10 @@ static int move_to_new_page(struct page *newpage, struct page *page, rc = fallback_migrate_page(mapping, newpage, page, mode); if (rc != MIGRATEPAGE_SUCCESS) { + set_page_memcg(newpage, NULL); newpage->mapping = NULL; } else { - mem_cgroup_migrate(page, newpage, false); + set_page_memcg(page, NULL); if (page_was_mapped) remove_migration_ptes(page, newpage); page->mapping = NULL; -- cgit v1.2.3 From ef510194cefe0cd369ef73419cd65b0a5bb4fb5b Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 1 Oct 2015 15:37:13 -0700 Subject: memcg: remove pcp_counter_lock Commit 733a572e66d2 ("memcg: make mem_cgroup_read_{stat|event}() iterate possible cpus instead of online") removed the last use of the per memcg pcp_counter_lock but forgot to remove the variable. Kill the vestigial variable. Signed-off-by: Greg Thelen Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 1 - mm/memcontrol.c | 1 - 2 files changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ad800e62cb7a..6452ff4c463f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -242,7 +242,6 @@ struct mem_cgroup { * percpu counter. 
*/ struct mem_cgroup_stat_cpu __percpu *stat; - spinlock_t pcp_counter_lock; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct cg_proto tcp_mem; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 03cc0a742ff1..1fedbde68f59 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4185,7 +4185,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (memcg_wb_domain_init(memcg, GFP_KERNEL)) goto out_free_stat; - spin_lock_init(&memcg->pcp_counter_lock); return memcg; out_free_stat: -- cgit v1.2.3 From 10abc7df9277a81971924a6c03f74e86d799daf1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 9 Oct 2015 15:50:11 +0100 Subject: irqdomain: Add an accessor for the of_node field As we're about to remove the of_node field from the irqdomain structure, introduce an accessor for it. Subsequent patches will take care of the actual repainting. Signed-off-by: Marc Zyngier Cc: Jiang Liu Cc: Jason Cooper Link: http://lkml.kernel.org/r/1444402211-1141-1-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- include/linux/irqdomain.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index d3ca79236fb0..f644fdb06dd6 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -161,6 +161,11 @@ enum { IRQ_DOMAIN_FLAG_NONCORE = (1 << 16), }; +static inline struct device_node *irq_domain_get_of_node(struct irq_domain *d) +{ + return d->of_node; +} + #ifdef CONFIG_IRQ_DOMAIN struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, irq_hw_number_t hwirq_max, int direct_max, -- cgit v1.2.3