From 698eff6355f735d46d1b7113df8b422874cd7988 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Mar 2017 12:48:18 +0100 Subject: sched/clock, x86/perf: Fix "perf test tsc" People reported that commit: 5680d8094ffa ("sched/clock: Provide better clock continuity") broke "perf test tsc". That commit added another offset to the reported clock value; so take that into account when computing the provided offset values. Reported-by: Adrian Hunter Reported-by: Arnaldo Carvalho de Melo Tested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 5680d8094ffa ("sched/clock: Provide better clock continuity") Signed-off-by: Ingo Molnar --- include/linux/sched/clock.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h index 4a68c6791207..34fe92ce1ebd 100644 --- a/include/linux/sched/clock.h +++ b/include/linux/sched/clock.h @@ -54,15 +54,16 @@ static inline u64 local_clock(void) } #else extern void sched_clock_init_late(void); -/* - * Architectures can set this to 1 if they have specified - * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, - * but then during bootup it turns out that sched_clock() - * is reliable after all: - */ extern int sched_clock_stable(void); extern void clear_sched_clock_stable(void); +/* + * When sched_clock_stable(), __sched_clock_offset provides the offset + * between local_clock() and sched_clock(). + */ +extern u64 __sched_clock_offset; + + extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); -- cgit v1.2.3 From 90db10434b163e46da413d34db8d0e77404cc645 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 23 Mar 2017 18:24:19 +0100 Subject: KVM: kvm_io_bus_unregister_dev() should never fail No caller currently checks the return value of kvm_io_bus_unregister_dev(). This is evil, as all callers silently go on freeing their device. A stale reference will remain in the io_bus, getting at least used again, when the iobus gets teared down on kvm_destroy_vm() - leading to use after free errors. There is nothing the callers could do, except retrying over and over again. So let's simply remove the bus altogether, print an error and make sure no one can access this broken bus again (returning -ENOMEM on any attempt to access it). Fixes: e93f8a0f821e ("KVM: convert io_bus to SRCU") Cc: stable@vger.kernel.org # 3.4+ Reported-by: Dmitry Vyukov Reviewed-by: Cornelia Huck Signed-off-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 4 ++-- virt/kvm/eventfd.c | 3 ++- virt/kvm/kvm_main.c | 42 +++++++++++++++++++++++++----------------- 3 files changed, 29 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c14ad9809da..d0250744507a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -162,8 +162,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, int len, void *val); int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, struct kvm_io_device *dev); -int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, - struct kvm_io_device *dev); +void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev); struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index a29786dd9522..4d28a9ddbee0 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -870,7 +870,8 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx, continue; kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev); - kvm->buses[bus_idx]->ioeventfd_count--; + if (kvm->buses[bus_idx]) + kvm->buses[bus_idx]->ioeventfd_count--; ioeventfd_release(p); ret = 0; break; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7445566fadc1..ef1aa7f1ed7a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -728,7 +728,8 @@ static void kvm_destroy_vm(struct kvm *kvm) spin_unlock(&kvm_lock); kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) { - kvm_io_bus_destroy(kvm->buses[i]); + if (kvm->buses[i]) + kvm_io_bus_destroy(kvm->buses[i]); kvm->buses[i] = NULL; } kvm_coalesced_mmio_free(kvm); @@ -3476,6 +3477,8 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + if (!bus) + return -ENOMEM; r = __kvm_io_bus_write(vcpu, bus, &range, val); return r < 0 ? r : 0; } @@ -3493,6 +3496,8 @@ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + if (!bus) + return -ENOMEM; /* First try the device referenced by cookie. */ if ((cookie >= 0) && (cookie < bus->dev_count) && @@ -3543,6 +3548,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, }; bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); + if (!bus) + return -ENOMEM; r = __kvm_io_bus_read(vcpu, bus, &range, val); return r < 0 ? r : 0; } @@ -3555,6 +3562,9 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, struct kvm_io_bus *new_bus, *bus; bus = kvm->buses[bus_idx]; + if (!bus) + return -ENOMEM; + /* exclude ioeventfd which is limited by maximum fd */ if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) return -ENOSPC; @@ -3574,45 +3584,41 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, } /* Caller must hold slots_lock. */ -int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, - struct kvm_io_device *dev) +void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, + struct kvm_io_device *dev) { - int i, r; + int i; struct kvm_io_bus *new_bus, *bus; bus = kvm->buses[bus_idx]; - - /* - * It's possible the bus being released before hand. If so, - * we're done here. - */ if (!bus) - return 0; + return; - r = -ENOENT; for (i = 0; i < bus->dev_count; i++) if (bus->range[i].dev == dev) { - r = 0; break; } - if (r) - return r; + if (i == bus->dev_count) + return; new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) * sizeof(struct kvm_io_range)), GFP_KERNEL); - if (!new_bus) - return -ENOMEM; + if (!new_bus) { + pr_err("kvm: failed to shrink bus, removing it completely\n"); + goto broken; + } memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); new_bus->dev_count--; memcpy(new_bus->range + i, bus->range + i + 1, (new_bus->dev_count - i) * sizeof(struct kvm_io_range)); +broken: rcu_assign_pointer(kvm->buses[bus_idx], new_bus); synchronize_srcu_expedited(&kvm->srcu); kfree(bus); - return r; + return; } struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, @@ -3625,6 +3631,8 @@ struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, srcu_idx = srcu_read_lock(&kvm->srcu); bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); + if (!bus) + goto out_unlock; dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1); if (dev_idx < 0) -- cgit v1.2.3 From 07de36b378a58f1d1426829acf0ab7cf86f651f3 Mon Sep 17 00:00:00 2001 From: Alexander Kochetkov Date: Wed, 22 Mar 2017 17:32:49 +0300 Subject: clockevents: Fix syntax error in clkevt-of macro The patch fix syntax errors introduced by commit 0c8893c9095d ("clockevents: Add a clkevt-of mechanism like clksrc-of"). Fixes: 0c8893c9095d ("clockevents: Add a clkevt-of mechanism like clksrc-of") Signed-off-by: Alexander Kochetkov Signed-off-by: Daniel Lezcano --- drivers/clocksource/clkevt-probe.c | 2 +- include/linux/clockchips.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/clocksource/clkevt-probe.c b/drivers/clocksource/clkevt-probe.c index 8c30fec86094..eb89b502acbd 100644 --- a/drivers/clocksource/clkevt-probe.c +++ b/drivers/clocksource/clkevt-probe.c @@ -17,7 +17,7 @@ #include #include -#include +#include extern struct of_device_id __clkevt_of_table[]; diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 5d3053c34fb3..6d7edc3082f9 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -229,7 +229,7 @@ static inline void tick_setup_hrtimer_broadcast(void) { } #ifdef CONFIG_CLKEVT_PROBE extern int clockevent_probe(void); -#els +#else static inline int clockevent_probe(void) { return 0; } #endif -- cgit v1.2.3 From 8d09617b076fd03ee9ae124abce94dda17bf3723 Mon Sep 17 00:00:00 2001 From: Alexander Kochetkov Date: Wed, 22 Mar 2017 17:29:06 +0300 Subject: vmlinux.lds: Add __clkevt_of_table to kernel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code introduced by commit 0c8893c9095d ("clockevents: Add a clkevt-of mechanism like clksrc-of") refer to __clkevt_of_table what doesn't exist in the vmlinux. As a result kernel build failed with error: "clkevt-probe.c:63: undefined reference to `__clkevt_of_table’" Fixes: 0c8893c9095d ("clockevents: Add a clkevt-of mechanism like clksrc-of") Signed-off-by: Alexander Kochetkov Signed-off-by: Daniel Lezcano --- include/asm-generic/vmlinux.lds.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0968d13b3885..8c6b525eb0fa 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -173,6 +173,7 @@ KEEP(*(__##name##_of_table_end)) #define CLKSRC_OF_TABLES() OF_TABLE(CONFIG_CLKSRC_OF, clksrc) +#define CLKEVT_OF_TABLES() OF_TABLE(CONFIG_CLKEVT_OF, clkevt) #define IRQCHIP_OF_MATCH_TABLE() OF_TABLE(CONFIG_IRQCHIP, irqchip) #define CLK_OF_TABLES() OF_TABLE(CONFIG_COMMON_CLK, clk) #define IOMMU_OF_TABLES() OF_TABLE(CONFIG_OF_IOMMU, iommu) @@ -559,6 +560,7 @@ CLK_OF_TABLES() \ RESERVEDMEM_OF_TABLES() \ CLKSRC_OF_TABLES() \ + CLKEVT_OF_TABLES() \ IOMMU_OF_TABLES() \ CPU_METHOD_OF_TABLES() \ CPUIDLE_METHOD_OF_TABLES() \ -- cgit v1.2.3 From f9ba3501d50317697811ff3c48f623f08d616fc8 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 27 Mar 2017 00:21:15 +0800 Subject: sctp: change to save MSG_MORE flag into assoc David Laight noticed the support for MSG_MORE with datamsg->force_delay didn't really work as we expected, as the first msg with MSG_MORE set would always block the following chunks' dequeuing. This Patch is to rewrite it by saving the MSG_MORE flag into assoc as David Laight suggested. asoc->force_delay is used to save MSG_MORE flag before a msg is sent. All chunks in queue would not be sent out if asoc->force_delay is set by the msg with MSG_MORE flag, until a new msg without MSG_MORE flag clears asoc->force_delay. Note that this change would not affect the flush is generated by other triggers, like asoc->state != ESTABLISHED, queue size > pmtu etc. v1->v2: Not clear asoc->force_delay after sending the msg with MSG_MORE flag. Fixes: 4ea0c32f5f42 ("sctp: add support for MSG_MORE") Signed-off-by: Xin Long Acked-by: David Laight Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 2 +- net/sctp/output.c | 2 +- net/sctp/socket.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 592decebac75..8caa5ee9e290 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -499,7 +499,6 @@ struct sctp_datamsg { /* Did the messenge fail to send? */ int send_error; u8 send_failed:1, - force_delay:1, can_delay; /* should this message be Nagle delayed */ }; @@ -1878,6 +1877,7 @@ struct sctp_association { __u8 need_ecne:1, /* Need to send an ECNE Chunk? */ temp:1, /* Is it a temporary association? */ + force_delay:1, prsctp_enable:1, reconf_enable:1; diff --git a/net/sctp/output.c b/net/sctp/output.c index 1224421036b3..73fd178007a3 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -704,7 +704,7 @@ static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet, */ if ((sctp_sk(asoc->base.sk)->nodelay || inflight == 0) && - !chunk->msg->force_delay) + !asoc->force_delay) /* Nothing unacked */ return SCTP_XMIT_OK; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 0f378ea2ae38..baa269a0d52e 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1965,7 +1965,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) err = PTR_ERR(datamsg); goto out_free; } - datamsg->force_delay = !!(msg->msg_flags & MSG_MORE); + asoc->force_delay = !!(msg->msg_flags & MSG_MORE); /* Now send the (possibly) fragmented message. */ list_for_each_entry(chunk, &datamsg->chunks, frag_list) { -- cgit v1.2.3 From fe25deb7737ce6c0879ccf79c99fa1221d428bf2 Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Mon, 27 Mar 2017 11:21:25 +0200 Subject: drm/ttm, drm/vmwgfx: Relax permission checking when opening surfaces Previously, when a surface was opened using a legacy (non prime) handle, it was verified to have been created by a client in the same master realm. Relax this so that opening is also allowed recursively if the client already has the surface open. This works around a regression in svga mesa where opening of a shared surface is used recursively to obtain surface information. Cc: Signed-off-by: Thomas Hellstrom Reviewed-by: Sinclair Yeh --- drivers/gpu/drm/ttm/ttm_object.c | 10 +++++++--- drivers/gpu/drm/vmwgfx/vmwgfx_fence.c | 6 ++---- drivers/gpu/drm/vmwgfx/vmwgfx_resource.c | 4 ++-- drivers/gpu/drm/vmwgfx/vmwgfx_surface.c | 22 +++++++++------------- include/drm/ttm/ttm_object.h | 5 ++++- 5 files changed, 24 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/ttm/ttm_object.c b/drivers/gpu/drm/ttm/ttm_object.c index fdb451e3ec01..d750140bafbc 100644 --- a/drivers/gpu/drm/ttm/ttm_object.c +++ b/drivers/gpu/drm/ttm/ttm_object.c @@ -179,7 +179,7 @@ int ttm_base_object_init(struct ttm_object_file *tfile, if (unlikely(ret != 0)) goto out_err0; - ret = ttm_ref_object_add(tfile, base, TTM_REF_USAGE, NULL); + ret = ttm_ref_object_add(tfile, base, TTM_REF_USAGE, NULL, false); if (unlikely(ret != 0)) goto out_err1; @@ -318,7 +318,8 @@ EXPORT_SYMBOL(ttm_ref_object_exists); int ttm_ref_object_add(struct ttm_object_file *tfile, struct ttm_base_object *base, - enum ttm_ref_type ref_type, bool *existed) + enum ttm_ref_type ref_type, bool *existed, + bool require_existed) { struct drm_open_hash *ht = &tfile->ref_hash[ref_type]; struct ttm_ref_object *ref; @@ -345,6 +346,9 @@ int ttm_ref_object_add(struct ttm_object_file *tfile, } rcu_read_unlock(); + if (require_existed) + return -EPERM; + ret = ttm_mem_global_alloc(mem_glob, sizeof(*ref), false, false); if (unlikely(ret != 0)) @@ -635,7 +639,7 @@ int ttm_prime_fd_to_handle(struct ttm_object_file *tfile, prime = (struct ttm_prime_object *) dma_buf->priv; base = &prime->base; *handle = base->hash.key; - ret = ttm_ref_object_add(tfile, base, TTM_REF_USAGE, NULL); + ret = ttm_ref_object_add(tfile, base, TTM_REF_USAGE, NULL, false); dma_buf_put(dma_buf); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c index 4076063e0fdd..6b2708b4eafe 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fence.c @@ -1075,10 +1075,8 @@ int vmw_fence_event_ioctl(struct drm_device *dev, void *data, (void) vmw_fence_obj_reference(fence); if (user_fence_rep != NULL) { - bool existed; - - ret = ttm_ref_object_add(tfile, base, - TTM_REF_USAGE, &existed); + ret = ttm_ref_object_add(vmw_fp->tfile, base, + TTM_REF_USAGE, NULL, false); if (unlikely(ret != 0)) { DRM_ERROR("Failed to reference a fence " "object.\n"); diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c index 65b3f0369636..bf23153d4f55 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c @@ -589,7 +589,7 @@ static int vmw_user_dmabuf_synccpu_grab(struct vmw_user_dma_buffer *user_bo, return ret; ret = ttm_ref_object_add(tfile, &user_bo->prime.base, - TTM_REF_SYNCCPU_WRITE, &existed); + TTM_REF_SYNCCPU_WRITE, &existed, false); if (ret != 0 || existed) ttm_bo_synccpu_write_release(&user_bo->dma.base); @@ -773,7 +773,7 @@ int vmw_user_dmabuf_reference(struct ttm_object_file *tfile, *handle = user_bo->prime.base.hash.key; return ttm_ref_object_add(tfile, &user_bo->prime.base, - TTM_REF_USAGE, NULL); + TTM_REF_USAGE, NULL, false); } /* diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c index f410502cb075..adc023fe67f3 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c @@ -891,17 +891,16 @@ vmw_surface_handle_reference(struct vmw_private *dev_priv, uint32_t handle; struct ttm_base_object *base; int ret; + bool require_exist = false; if (handle_type == DRM_VMW_HANDLE_PRIME) { ret = ttm_prime_fd_to_handle(tfile, u_handle, &handle); if (unlikely(ret != 0)) return ret; } else { - if (unlikely(drm_is_render_client(file_priv))) { - DRM_ERROR("Render client refused legacy " - "surface reference.\n"); - return -EACCES; - } + if (unlikely(drm_is_render_client(file_priv))) + require_exist = true; + if (ACCESS_ONCE(vmw_fpriv(file_priv)->locked_master)) { DRM_ERROR("Locked master refused legacy " "surface reference.\n"); @@ -929,17 +928,14 @@ vmw_surface_handle_reference(struct vmw_private *dev_priv, /* * Make sure the surface creator has the same - * authenticating master. + * authenticating master, or is already registered with us. */ if (drm_is_primary_client(file_priv) && - user_srf->master != file_priv->master) { - DRM_ERROR("Trying to reference surface outside of" - " master domain.\n"); - ret = -EACCES; - goto out_bad_resource; - } + user_srf->master != file_priv->master) + require_exist = true; - ret = ttm_ref_object_add(tfile, base, TTM_REF_USAGE, NULL); + ret = ttm_ref_object_add(tfile, base, TTM_REF_USAGE, NULL, + require_exist); if (unlikely(ret != 0)) { DRM_ERROR("Could not add a reference to a surface.\n"); goto out_bad_resource; diff --git a/include/drm/ttm/ttm_object.h b/include/drm/ttm/ttm_object.h index ed953f98f0e1..1487011fe057 100644 --- a/include/drm/ttm/ttm_object.h +++ b/include/drm/ttm/ttm_object.h @@ -229,6 +229,8 @@ extern void ttm_base_object_unref(struct ttm_base_object **p_base); * @ref_type: The type of reference. * @existed: Upon completion, indicates that an identical reference object * already existed, and the refcount was upped on that object instead. + * @require_existed: Fail with -EPERM if an identical ref object didn't + * already exist. * * Checks that the base object is shareable and adds a ref object to it. * @@ -243,7 +245,8 @@ extern void ttm_base_object_unref(struct ttm_base_object **p_base); */ extern int ttm_ref_object_add(struct ttm_object_file *tfile, struct ttm_base_object *base, - enum ttm_ref_type ref_type, bool *existed); + enum ttm_ref_type ref_type, bool *existed, + bool require_existed); extern bool ttm_ref_object_exists(struct ttm_object_file *tfile, struct ttm_base_object *base); -- cgit v1.2.3 From 3dbcc105d5561e18ccd0842c7baab1c835562a37 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 30 Mar 2017 01:00:53 +0800 Subject: sctp: alloc stream info when initializing asoc When sending a msg without asoc established, sctp will send INIT packet first and then enqueue chunks. Before receiving INIT_ACK, stream info is not yet alloced. But enqueuing chunks needs to access stream info, like out stream state and out stream cnt. This patch is to fix it by allocing out stream info when initializing an asoc, allocing in stream and re-allocing out stream when processing init. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 3 ++- net/sctp/associola.c | 7 ++++++- net/sctp/sm_make_chunk.c | 9 ++------- net/sctp/stream.c | 43 +++++++++++++++++++++++++++++++++++-------- 4 files changed, 45 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 8caa5ee9e290..a127b7c2c3c9 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -377,7 +377,8 @@ typedef struct sctp_sender_hb_info { __u64 hb_nonce; } sctp_sender_hb_info_t; -struct sctp_stream *sctp_stream_new(__u16 incnt, __u16 outcnt, gfp_t gfp); +int sctp_stream_new(struct sctp_association *asoc, gfp_t gfp); +int sctp_stream_init(struct sctp_association *asoc, gfp_t gfp); void sctp_stream_free(struct sctp_stream *stream); void sctp_stream_clear(struct sctp_stream *stream); diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 0439a1a68367..0b26df5f6188 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -246,6 +246,9 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a if (!sctp_ulpq_init(&asoc->ulpq, asoc)) goto fail_init; + if (sctp_stream_new(asoc, gfp)) + goto fail_init; + /* Assume that peer would support both address types unless we are * told otherwise. */ @@ -264,7 +267,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a /* AUTH related initializations */ INIT_LIST_HEAD(&asoc->endpoint_shared_keys); if (sctp_auth_asoc_copy_shkeys(ep, asoc, gfp)) - goto fail_init; + goto stream_free; asoc->active_key_id = ep->active_key_id; asoc->prsctp_enable = ep->prsctp_enable; @@ -287,6 +290,8 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a return asoc; +stream_free: + sctp_stream_free(asoc->stream); fail_init: sock_put(asoc->base.sk); sctp_endpoint_put(asoc->ep); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 969a30c7bb54..118faff6a332 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2460,15 +2460,10 @@ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, * association. */ if (!asoc->temp) { - int error; - - asoc->stream = sctp_stream_new(asoc->c.sinit_max_instreams, - asoc->c.sinit_num_ostreams, gfp); - if (!asoc->stream) + if (sctp_stream_init(asoc, gfp)) goto clean_up; - error = sctp_assoc_set_id(asoc, gfp); - if (error) + if (sctp_assoc_set_id(asoc, gfp)) goto clean_up; } diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 1c6cc04fa3a4..bbed997e1c5f 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -35,33 +35,60 @@ #include #include -struct sctp_stream *sctp_stream_new(__u16 incnt, __u16 outcnt, gfp_t gfp) +int sctp_stream_new(struct sctp_association *asoc, gfp_t gfp) { struct sctp_stream *stream; int i; stream = kzalloc(sizeof(*stream), gfp); if (!stream) - return NULL; + return -ENOMEM; - stream->outcnt = outcnt; + stream->outcnt = asoc->c.sinit_num_ostreams; stream->out = kcalloc(stream->outcnt, sizeof(*stream->out), gfp); if (!stream->out) { kfree(stream); - return NULL; + return -ENOMEM; } for (i = 0; i < stream->outcnt; i++) stream->out[i].state = SCTP_STREAM_OPEN; - stream->incnt = incnt; + asoc->stream = stream; + + return 0; +} + +int sctp_stream_init(struct sctp_association *asoc, gfp_t gfp) +{ + struct sctp_stream *stream = asoc->stream; + int i; + + /* Initial stream->out size may be very big, so free it and alloc + * a new one with new outcnt to save memory. + */ + kfree(stream->out); + stream->outcnt = asoc->c.sinit_num_ostreams; + stream->out = kcalloc(stream->outcnt, sizeof(*stream->out), gfp); + if (!stream->out) + goto nomem; + + for (i = 0; i < stream->outcnt; i++) + stream->out[i].state = SCTP_STREAM_OPEN; + + stream->incnt = asoc->c.sinit_max_instreams; stream->in = kcalloc(stream->incnt, sizeof(*stream->in), gfp); if (!stream->in) { kfree(stream->out); - kfree(stream); - return NULL; + goto nomem; } - return stream; + return 0; + +nomem: + asoc->stream = NULL; + kfree(stream); + + return -ENOMEM; } void sctp_stream_free(struct sctp_stream *stream) -- cgit v1.2.3 From 597b7305dd8bafdb3aef4957d97128bc90af8e9f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 31 Mar 2017 15:11:47 -0700 Subject: mm: move mm_percpu_wq initialization earlier Yang Li has reported that drain_all_pages triggers a WARN_ON which means that this function is called earlier than the mm_percpu_wq is initialized on arm64 with CMA configured: WARNING: CPU: 2 PID: 1 at mm/page_alloc.c:2423 drain_all_pages+0x244/0x25c Modules linked in: CPU: 2 PID: 1 Comm: swapper/0 Not tainted 4.11.0-rc1-next-20170310-00027-g64dfbc5 #127 Hardware name: Freescale Layerscape 2088A RDB Board (DT) task: ffffffc07c4a6d00 task.stack: ffffffc07c4a8000 PC is at drain_all_pages+0x244/0x25c LR is at start_isolate_page_range+0x14c/0x1f0 [...] drain_all_pages+0x244/0x25c start_isolate_page_range+0x14c/0x1f0 alloc_contig_range+0xec/0x354 cma_alloc+0x100/0x1fc dma_alloc_from_contiguous+0x3c/0x44 atomic_pool_init+0x7c/0x208 arm64_dma_init+0x44/0x4c do_one_initcall+0x38/0x128 kernel_init_freeable+0x1a0/0x240 kernel_init+0x10/0xfc ret_from_fork+0x10/0x20 Fix this by moving the whole setup_vmstat which is an initcall right now to init_mm_internals which will be called right after the WQ subsystem is initialized. Link: http://lkml.kernel.org/r/20170315164021.28532-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Yang Li Tested-by: Yang Li Tested-by: Xiaolong Ye Cc: Mel Gorman Cc: Vlastimil Babka Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ init/main.c | 2 ++ mm/vmstat.c | 4 +--- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5f01c88f0800..00a8fa7e366a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -32,6 +32,8 @@ struct user_struct; struct writeback_control; struct bdi_writeback; +void init_mm_internals(void); + #ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; diff --git a/init/main.c b/init/main.c index f9c9d9948203..b0c11cbf5ddf 100644 --- a/init/main.c +++ b/init/main.c @@ -1022,6 +1022,8 @@ static noinline void __init kernel_init_freeable(void) workqueue_init(); + init_mm_internals(); + do_pre_smp_initcalls(); lockup_detector_init(); diff --git a/mm/vmstat.c b/mm/vmstat.c index b1947f0cbee2..89f95396ec46 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1764,7 +1764,7 @@ static int vmstat_cpu_dead(unsigned int cpu) #endif -static int __init setup_vmstat(void) +void __init init_mm_internals(void) { #ifdef CONFIG_SMP int ret; @@ -1792,9 +1792,7 @@ static int __init setup_vmstat(void) proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); #endif - return 0; } -module_init(setup_vmstat) #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) -- cgit v1.2.3 From 553af430e7c981e6e8fa5007c5b7b5773acc63dd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 31 Mar 2017 15:11:50 -0700 Subject: mm: rmap: fix huge file mmap accounting in the memcg stats Huge pages are accounted as single units in the memcg's "file_mapped" counter. Account the correct number of base pages, like we do in the corresponding node counter. Link: http://lkml.kernel.org/r/20170322005111.3156-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: [4.8+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++++ mm/rmap.c | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5af377303880..bb7250c45cb8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -740,6 +740,12 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) return false; } +static inline void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_stat_index idx, + int nr) +{ +} + static inline void mem_cgroup_inc_page_stat(struct page *page, enum mem_cgroup_stat_index idx) { diff --git a/mm/rmap.c b/mm/rmap.c index 49ed681ccc7b..f6838015810f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1159,7 +1159,7 @@ void page_add_file_rmap(struct page *page, bool compound) goto out; } __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, nr); out: unlock_page_memcg(page); } @@ -1199,7 +1199,7 @@ static void page_remove_file_rmap(struct page *page, bool compound) * pte lock(a spinlock) is held, which implies preemption disabled. */ __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, -nr); if (unlikely(PageMlocked(page))) clear_page_mlock(page); -- cgit v1.2.3 From 906f2a51c941e251ca196d5128953d9899a608ef Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 31 Mar 2017 15:11:58 -0700 Subject: mm: fix section name for .data..ro_after_init A section name for .data..ro_after_init was added by both: commit d07a980c1b8d ("s390: add proper __ro_after_init support") and commit d7c19b066dcf ("mm: kmemleak: scan .data.ro_after_init") The latter adds incorrect wrapping around the existing s390 section, and came later. I'd prefer the s390 naming, so this moves the s390-specific name up to the asm-generic/sections.h and renames the section as used by kmemleak (and in the future, kernel/extable.c). Link: http://lkml.kernel.org/r/20170327192213.GA129375@beast Signed-off-by: Kees Cook Acked-by: Heiko Carstens [s390 parts] Acked-by: Jakub Kicinski Cc: Eddie Kovsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/sections.h | 1 - arch/s390/kernel/vmlinux.lds.S | 2 -- include/asm-generic/sections.h | 6 +++--- include/asm-generic/vmlinux.lds.h | 4 ++-- mm/kmemleak.c | 2 +- 5 files changed, 6 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/arch/s390/include/asm/sections.h b/arch/s390/include/asm/sections.h index 5ce29fe100ba..fbd9116eb17b 100644 --- a/arch/s390/include/asm/sections.h +++ b/arch/s390/include/asm/sections.h @@ -4,6 +4,5 @@ #include extern char _eshared[], _ehead[]; -extern char __start_ro_after_init[], __end_ro_after_init[]; #endif diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 5ccf95396251..72307f108c40 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -63,11 +63,9 @@ SECTIONS . = ALIGN(PAGE_SIZE); __start_ro_after_init = .; - __start_data_ro_after_init = .; .data..ro_after_init : { *(.data..ro_after_init) } - __end_data_ro_after_init = .; EXCEPTION_TABLE(16) . = ALIGN(PAGE_SIZE); __end_ro_after_init = .; diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index 4df64a1fc09e..532372c6cf15 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -14,8 +14,8 @@ * [_sdata, _edata]: contains .data.* sections, may also contain .rodata.* * and/or .init.* sections. * [__start_rodata, __end_rodata]: contains .rodata.* sections - * [__start_data_ro_after_init, __end_data_ro_after_init]: - * contains data.ro_after_init section + * [__start_ro_after_init, __end_ro_after_init]: + * contains .data..ro_after_init section * [__init_begin, __init_end]: contains .init.* sections, but .init.text.* * may be out of this range on some architectures. * [_sinittext, _einittext]: contains .init.text.* sections @@ -33,7 +33,7 @@ extern char _data[], _sdata[], _edata[]; extern char __bss_start[], __bss_stop[]; extern char __init_begin[], __init_end[]; extern char _sinittext[], _einittext[]; -extern char __start_data_ro_after_init[], __end_data_ro_after_init[]; +extern char __start_ro_after_init[], __end_ro_after_init[]; extern char _end[]; extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[]; extern char __kprobes_text_start[], __kprobes_text_end[]; diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0968d13b3885..f9f21e2c59f3 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -260,9 +260,9 @@ */ #ifndef RO_AFTER_INIT_DATA #define RO_AFTER_INIT_DATA \ - __start_data_ro_after_init = .; \ + __start_ro_after_init = .; \ *(.data..ro_after_init) \ - __end_data_ro_after_init = .; + __end_ro_after_init = .; #endif /* diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 26c874e90b12..20036d4f9f13 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1416,7 +1416,7 @@ static void kmemleak_scan(void) /* data/bss scanning */ scan_large_block(_sdata, _edata); scan_large_block(__bss_start, __bss_stop); - scan_large_block(__start_data_ro_after_init, __end_data_ro_after_init); + scan_large_block(__start_ro_after_init, __end_ro_after_init); #ifdef CONFIG_SMP /* per-cpu sections scanning */ -- cgit v1.2.3 From b0845ce58379d11dcad4cdb6824a6410de260216 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 31 Mar 2017 15:12:04 -0700 Subject: kasan: report only the first error by default Disable kasan after the first report. There are several reasons for this: - Single bug quite often has multiple invalid memory accesses causing storm in the dmesg. - Write OOB access might corrupt metadata so the next report will print bogus alloc/free stacktraces. - Reports after the first easily could be not bugs by itself but just side effects of the first one. Given that multiple reports usually only do harm, it makes sense to disable kasan after the first one. If user wants to see all the reports, the boot-time parameter kasan_multi_shot must be used. [aryabinin@virtuozzo.com: wrote changelog and doc, added missing include] Link: http://lkml.kernel.org/r/20170323154416.30257-1-aryabinin@virtuozzo.com Signed-off-by: Mark Rutland Signed-off-by: Andrey Ryabinin Cc: Andrey Konovalov Cc: Alexander Potapenko Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 6 +++++ include/linux/kasan.h | 3 +++ lib/test_kasan.c | 10 +++++++ mm/kasan/kasan.h | 5 ---- mm/kasan/report.c | 36 +++++++++++++++++++++++++ 5 files changed, 55 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2ba45caabada..facc20a3f962 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1725,6 +1725,12 @@ kernel and module base offset ASLR (Address Space Layout Randomization). + kasan_multi_shot + [KNL] Enforce KASAN (Kernel Address Sanitizer) to print + report on every invalid memory access. Without this + parameter KASAN will print report only for the first + invalid access. + keepinitrd [HW,ARM] kernelcore= [KNL,X86,IA-64,PPC] diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 5734480c9590..a5c7046f26b4 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -76,6 +76,9 @@ size_t ksize(const void *); static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); } size_t kasan_metadata_size(struct kmem_cache *cache); +bool kasan_save_enable_multi_shot(void); +void kasan_restore_multi_shot(bool enabled); + #else /* CONFIG_KASAN */ static inline void kasan_unpoison_shadow(const void *address, size_t size) {} diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 0b1d3140fbb8..a25c9763fce1 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -20,6 +20,7 @@ #include #include #include +#include /* * Note: test functions are marked noinline so that their names appear in @@ -474,6 +475,12 @@ static noinline void __init use_after_scope_test(void) static int __init kmalloc_tests_init(void) { + /* + * Temporarily enable multi-shot mode. Otherwise, we'd only get a + * report for the first case. + */ + bool multishot = kasan_save_enable_multi_shot(); + kmalloc_oob_right(); kmalloc_oob_left(); kmalloc_node_oob_right(); @@ -499,6 +506,9 @@ static int __init kmalloc_tests_init(void) ksize_unpoisons_memory(); copy_user_test(); use_after_scope_test(); + + kasan_restore_multi_shot(multishot); + return -EAGAIN; } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 1c260e6b3b3c..dd2dea8eb077 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -96,11 +96,6 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) << KASAN_SHADOW_SCALE_SHIFT); } -static inline bool kasan_report_enabled(void) -{ - return !current->kasan_depth; -} - void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_double_free(struct kmem_cache *cache, void *object, diff --git a/mm/kasan/report.c b/mm/kasan/report.c index f479365530b6..ab42a0803f16 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -13,7 +13,9 @@ * */ +#include #include +#include #include #include #include @@ -293,6 +295,40 @@ static void kasan_report_error(struct kasan_access_info *info) kasan_end_report(&flags); } +static unsigned long kasan_flags; + +#define KASAN_BIT_REPORTED 0 +#define KASAN_BIT_MULTI_SHOT 1 + +bool kasan_save_enable_multi_shot(void) +{ + return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); +} +EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot); + +void kasan_restore_multi_shot(bool enabled) +{ + if (!enabled) + clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); +} +EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); + +static int __init kasan_set_multi_shot(char *str) +{ + set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); + return 1; +} +__setup("kasan_multi_shot", kasan_set_multi_shot); + +static inline bool kasan_report_enabled(void) +{ + if (current->kasan_depth) + return false; + if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) + return true; + return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); +} + void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) { -- cgit v1.2.3 From bf17aa36c0f199f5b254262e77eaefda7da0f50b Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 1 Mar 2017 18:22:01 -0800 Subject: nvme: Correct NVMF enum values to match NVMe-oF rev 1.0 The enum values for QPTYPE, PRTYPE and CMS are off by 1 from the values defined in figure 42 of the NVM Express over Fabrics 1.0: http://www.nvmexpress.org/wp-content/uploads/NVMe_over_Fabrics_1_0_Gold_20160605-1.pdf Fix our enums to match the final spec. Signed-off-by: Roland Dreier Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- include/linux/nvme.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index c43d435d4225..9061780b141f 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -64,26 +64,26 @@ enum { * RDMA_QPTYPE field */ enum { - NVMF_RDMA_QPTYPE_CONNECTED = 0, /* Reliable Connected */ - NVMF_RDMA_QPTYPE_DATAGRAM = 1, /* Reliable Datagram */ + NVMF_RDMA_QPTYPE_CONNECTED = 1, /* Reliable Connected */ + NVMF_RDMA_QPTYPE_DATAGRAM = 2, /* Reliable Datagram */ }; /* RDMA QP Service Type codes for Discovery Log Page entry TSAS * RDMA_QPTYPE field */ enum { - NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 0, /* No Provider Specified */ - NVMF_RDMA_PRTYPE_IB = 1, /* InfiniBand */ - NVMF_RDMA_PRTYPE_ROCE = 2, /* InfiniBand RoCE */ - NVMF_RDMA_PRTYPE_ROCEV2 = 3, /* InfiniBand RoCEV2 */ - NVMF_RDMA_PRTYPE_IWARP = 4, /* IWARP */ + NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 1, /* No Provider Specified */ + NVMF_RDMA_PRTYPE_IB = 2, /* InfiniBand */ + NVMF_RDMA_PRTYPE_ROCE = 3, /* InfiniBand RoCE */ + NVMF_RDMA_PRTYPE_ROCEV2 = 4, /* InfiniBand RoCEV2 */ + NVMF_RDMA_PRTYPE_IWARP = 5, /* IWARP */ }; /* RDMA Connection Management Service Type codes for Discovery Log Page * entry TSAS RDMA_CMS field */ enum { - NVMF_RDMA_CMS_RDMA_CM = 0, /* Sockets based enpoint addressing */ + NVMF_RDMA_CMS_RDMA_CM = 1, /* Sockets based endpoint addressing */ }; #define NVMF_AQ_DEPTH 32 -- cgit v1.2.3 From 47071aee6a1956524b9929b3b821f6d2f8cae23c Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 31 Mar 2017 18:32:10 +0100 Subject: statx: Reserve the top bit of the mask for future struct expansion Reserve the top bit of the mask for future expansion of the statx struct and give an error if statx() sees it set. All the other bits are ignored if we see them set but don't support the bit; we just clear the bit in the returned mask. Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/stat.c | 2 ++ include/uapi/linux/stat.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/fs/stat.c b/fs/stat.c index ab27f2868588..0c7e6cdc435c 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -562,6 +562,8 @@ SYSCALL_DEFINE5(statx, struct kstat stat; int error; + if (mask & STATX__RESERVED) + return -EINVAL; if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) return -EINVAL; diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 51a6b86e3700..0869b9eaa8ce 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -152,6 +152,7 @@ struct statx { #define STATX_BASIC_STATS 0x000007ffU /* The stuff in the normal stat struct */ #define STATX_BTIME 0x00000800U /* Want/got stx_btime */ #define STATX_ALL 0x00000fffU /* All currently supported flags */ +#define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ /* * Attributes to be found in stx_attributes -- cgit v1.2.3 From 3209f68b3ca4667069923a325c88b21131bfdf9f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 31 Mar 2017 18:32:17 +0100 Subject: statx: Include a mask for stx_attributes in struct statx Include a mask in struct stat to indicate which bits of stx_attributes the filesystem actually supports. This would also be useful if we add another system call that allows you to do a 'bulk attribute set' and pass in a statx struct with the masks appropriately set to say what you want to set. Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/ext4/inode.c | 6 ++++++ fs/stat.c | 1 + include/linux/stat.h | 1 + include/uapi/linux/stat.h | 4 ++-- samples/statx/test-statx.c | 12 ++++++++---- 5 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5d02b922afa3..b9ffa9f4191f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5413,6 +5413,12 @@ int ext4_getattr(const struct path *path, struct kstat *stat, if (flags & EXT4_NODUMP_FL) stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_COMPRESSED | + STATX_ATTR_ENCRYPTED | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); + generic_fillattr(inode, stat); return 0; } diff --git a/fs/stat.c b/fs/stat.c index 0c7e6cdc435c..c6c963b2546b 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -527,6 +527,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_ino = stat->ino; tmp.stx_size = stat->size; tmp.stx_blocks = stat->blocks; + tmp.stx_attributes_mask = stat->attributes_mask; tmp.stx_atime.tv_sec = stat->atime.tv_sec; tmp.stx_atime.tv_nsec = stat->atime.tv_nsec; tmp.stx_btime.tv_sec = stat->btime.tv_sec; diff --git a/include/linux/stat.h b/include/linux/stat.h index c76e524fb34b..64b6b3aece21 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -26,6 +26,7 @@ struct kstat { unsigned int nlink; uint32_t blksize; /* Preferred I/O size */ u64 attributes; + u64 attributes_mask; #define KSTAT_ATTR_FS_IOC_FLAGS \ (STATX_ATTR_COMPRESSED | \ STATX_ATTR_IMMUTABLE | \ diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 0869b9eaa8ce..d538897b8e08 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -114,7 +114,7 @@ struct statx { __u64 stx_ino; /* Inode number */ __u64 stx_size; /* File size */ __u64 stx_blocks; /* Number of 512-byte blocks allocated */ - __u64 __spare1[1]; + __u64 stx_attributes_mask; /* Mask to show what's supported in stx_attributes */ /* 0x40 */ struct statx_timestamp stx_atime; /* Last access time */ struct statx_timestamp stx_btime; /* File creation time */ @@ -155,7 +155,7 @@ struct statx { #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ /* - * Attributes to be found in stx_attributes + * Attributes to be found in stx_attributes and masked in stx_attributes_mask. * * These give information about the features or the state of a file that might * be of use to ordinary userspace programs such as GUIs or ls rather than diff --git a/samples/statx/test-statx.c b/samples/statx/test-statx.c index 8571d766331d..d4d77b09412c 100644 --- a/samples/statx/test-statx.c +++ b/samples/statx/test-statx.c @@ -141,8 +141,8 @@ static void dump_statx(struct statx *stx) if (stx->stx_mask & STATX_BTIME) print_time(" Birth: ", &stx->stx_btime); - if (stx->stx_attributes) { - unsigned char bits; + if (stx->stx_attributes_mask) { + unsigned char bits, mbits; int loop, byte; static char attr_representation[64 + 1] = @@ -160,14 +160,18 @@ static void dump_statx(struct statx *stx) printf("Attributes: %016llx (", stx->stx_attributes); for (byte = 64 - 8; byte >= 0; byte -= 8) { bits = stx->stx_attributes >> byte; + mbits = stx->stx_attributes_mask >> byte; for (loop = 7; loop >= 0; loop--) { int bit = byte + loop; - if (bits & 0x80) + if (!(mbits & 0x80)) + putchar('.'); /* Not supported */ + else if (bits & 0x80) putchar(attr_representation[63 - bit]); else - putchar('-'); + putchar('-'); /* Not set */ bits <<= 1; + mbits <<= 1; } if (byte) putchar(' '); -- cgit v1.2.3 From df2729c3238ed89fb8ccf850d38c732858a5bade Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 1 Apr 2017 17:15:59 +0800 Subject: sctp: check for dst and pathmtu update in sctp_packet_config This patch is to move sctp_transport_dst_check into sctp_packet_config from sctp_packet_transmit and add pathmtu check in sctp_packet_config. With this fix, sctp can update dst or pathmtu before appending chunks, which can void dropping packets in sctp_packet_transmit when dst is obsolete or dst's mtu is changed. This patch is also to improve some other codes in sctp_packet_config. It updates packet max_size with gso_max_size, checks for dst and pathmtu, and appends ecne chunk only when packet is empty and asoc is not NULL. It makes sctp flush work better, as we only need to set up them once for one flush schedule. It's also safe, since asoc is NULL only when the packet is created by sctp_ootb_pkt_new in which it just gets the new dst, no need to do more things for it other than set packet with transport's pathmtu. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 17 ++++++++++--- net/sctp/output.c | 67 ++++++++++++++++++++++++++----------------------- 2 files changed, 50 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 1f71ee5ab518..d75caa7a629b 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -596,12 +596,23 @@ static inline void sctp_v4_map_v6(union sctp_addr *addr) */ static inline struct dst_entry *sctp_transport_dst_check(struct sctp_transport *t) { - if (t->dst && (!dst_check(t->dst, t->dst_cookie) || - t->pathmtu != max_t(size_t, SCTP_TRUNC4(dst_mtu(t->dst)), - SCTP_DEFAULT_MINSEGMENT))) + if (t->dst && !dst_check(t->dst, t->dst_cookie)) sctp_transport_dst_release(t); return t->dst; } +static inline bool sctp_transport_pmtu_check(struct sctp_transport *t) +{ + __u32 pmtu = max_t(size_t, SCTP_TRUNC4(dst_mtu(t->dst)), + SCTP_DEFAULT_MINSEGMENT); + + if (t->pathmtu == pmtu) + return true; + + t->pathmtu = pmtu; + + return false; +} + #endif /* __net_sctp_h__ */ diff --git a/net/sctp/output.c b/net/sctp/output.c index 73fd178007a3..ec4d50a38713 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -86,43 +86,53 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, { struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; + struct sock *sk; pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag); - packet->vtag = vtag; - if (asoc && tp->dst) { - struct sock *sk = asoc->base.sk; - - rcu_read_lock(); - if (__sk_dst_get(sk) != tp->dst) { - dst_hold(tp->dst); - sk_setup_caps(sk, tp->dst); - } - - if (sk_can_gso(sk)) { - struct net_device *dev = tp->dst->dev; + /* do the following jobs only once for a flush schedule */ + if (!sctp_packet_empty(packet)) + return; - packet->max_size = dev->gso_max_size; - } else { - packet->max_size = asoc->pathmtu; - } - rcu_read_unlock(); + /* set packet max_size with pathmtu */ + packet->max_size = tp->pathmtu; + if (!asoc) + return; - } else { - packet->max_size = tp->pathmtu; + /* update dst or transport pathmtu if in need */ + sk = asoc->base.sk; + if (!sctp_transport_dst_check(tp)) { + sctp_transport_route(tp, NULL, sctp_sk(sk)); + if (asoc->param_flags & SPP_PMTUD_ENABLE) + sctp_assoc_sync_pmtu(sk, asoc); + } else if (!sctp_transport_pmtu_check(tp)) { + if (asoc->param_flags & SPP_PMTUD_ENABLE) + sctp_assoc_sync_pmtu(sk, asoc); } - if (ecn_capable && sctp_packet_empty(packet)) { - struct sctp_chunk *chunk; + /* If there a is a prepend chunk stick it on the list before + * any other chunks get appended. + */ + if (ecn_capable) { + struct sctp_chunk *chunk = sctp_get_ecne_prepend(asoc); - /* If there a is a prepend chunk stick it on the list before - * any other chunks get appended. - */ - chunk = sctp_get_ecne_prepend(asoc); if (chunk) sctp_packet_append_chunk(packet, chunk); } + + if (!tp->dst) + return; + + /* set packet max_size with gso_max_size if gso is enabled*/ + rcu_read_lock(); + if (__sk_dst_get(sk) != tp->dst) { + dst_hold(tp->dst); + sk_setup_caps(sk, tp->dst); + } + packet->max_size = sk_can_gso(sk) ? tp->dst->dev->gso_max_size + : asoc->pathmtu; + rcu_read_unlock(); } /* Initialize the packet structure. */ @@ -582,12 +592,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) sh->vtag = htonl(packet->vtag); sh->checksum = 0; - /* update dst if in need */ - if (!sctp_transport_dst_check(tp)) { - sctp_transport_route(tp, NULL, sctp_sk(sk)); - if (asoc && asoc->param_flags & SPP_PMTUD_ENABLE) - sctp_assoc_sync_pmtu(sk, asoc); - } + /* drop packet if no dst */ dst = dst_clone(tp->dst); if (!dst) { IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); -- cgit v1.2.3 From 5b0d2cc2805897c14257f6dbb949639c499c3c25 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Sat, 18 Mar 2017 13:56:56 +0100 Subject: KVM: arm64: Ensure LRs are clear when they should be We currently have some code to clear the list registers on GICv3, but we never call this code, because the caller got nuked when removing the old vgic. We also used to have a similar GICv2 part, but that got lost in the process too. Let's reintroduce the logic for GICv2 and call the logic when we initialize the use of hypervisors on the CPU, for example when first loading KVM or when exiting a low power state. Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm/kvm/arm.c | 3 +++ include/kvm/arm_vgic.h | 1 + virt/kvm/arm/vgic/vgic-init.c | 19 +++++++++++++++++++ virt/kvm/arm/vgic/vgic-v2.c | 15 +++++++++++++++ virt/kvm/arm/vgic/vgic.h | 2 ++ 5 files changed, 40 insertions(+) (limited to 'include') diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 96dba7cd8be7..314eb6abe1ff 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -1124,6 +1124,9 @@ static void cpu_hyp_reinit(void) if (__hyp_get_vectors() == hyp_default_vectors) cpu_init_hyp_mode(NULL); } + + if (vgic_present) + kvm_vgic_init_cpu_hardware(); } static void cpu_hyp_reset(void) diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index b72dd2ad5f44..c0b3d999c266 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -295,6 +295,7 @@ void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu); void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu); int kvm_vgic_map_resources(struct kvm *kvm); int kvm_vgic_hyp_init(void); +void kvm_vgic_init_cpu_hardware(void); int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, bool level); diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index 276139a24e6f..702f8108608d 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -391,6 +391,25 @@ static irqreturn_t vgic_maintenance_handler(int irq, void *data) return IRQ_HANDLED; } +/** + * kvm_vgic_init_cpu_hardware - initialize the GIC VE hardware + * + * For a specific CPU, initialize the GIC VE hardware. + */ +void kvm_vgic_init_cpu_hardware(void) +{ + BUG_ON(preemptible()); + + /* + * We want to make sure the list registers start out clear so that we + * only have the program the used registers. + */ + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_init_lrs(); + else + kvm_call_hyp(__vgic_v3_init_lrs); +} + /** * kvm_vgic_hyp_init: populates the kvm_vgic_global_state variable * according to the host GIC model. Accordingly calls either diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index b834ecdf3225..94cf4b9b6471 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -36,6 +36,21 @@ static unsigned long *u64_to_bitmask(u64 *val) return (unsigned long *)val; } +static inline void vgic_v2_write_lr(int lr, u32 val) +{ + void __iomem *base = kvm_vgic_global_state.vctrl_base; + + writel_relaxed(val, base + GICH_LR0 + (lr * 4)); +} + +void vgic_v2_init_lrs(void) +{ + int i; + + for (i = 0; i < kvm_vgic_global_state.nr_lr; i++) + vgic_v2_write_lr(i, 0); +} + void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu) { struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index db28f7cadab2..91566f5aac9b 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -130,6 +130,8 @@ int vgic_v2_map_resources(struct kvm *kvm); int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, enum vgic_type); +void vgic_v2_init_lrs(void); + static inline void vgic_get_irq_kref(struct vgic_irq *irq) { if (irq->intid < VGIC_MIN_LPI) -- cgit v1.2.3 From 6d56111c92d247bb64301029fe88365aa4caf16e Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 21 Mar 2017 22:05:22 +0100 Subject: KVM: arm/arm64: vgic: Fix GICC_PMR uaccess on GICv3 and clarify ABI As an oversight, for GICv2, we accidentally export the GICC_PMR register in the format of the GICH_VMCR.VMPriMask field in the lower 5 bits of a word, meaning that userspace must always use the lower 5 bits to communicate with the KVM device and must shift the value left by 3 places to obtain the actual priority mask level. Since GICv3 supports the full 8 bits of priority masking in the ICH_VMCR, we have to fix the value we export when emulating a GICv2 on top of a hardware GICv3 and exporting the emulated GICv2 state to userspace. Take the chance to clarify this aspect of the ABI. Reviewed-by: Marc Zyngier Signed-off-by: Christoffer Dall --- Documentation/virtual/kvm/devices/arm-vgic.txt | 6 ++++++ include/linux/irqchip/arm-gic.h | 3 +++ virt/kvm/arm/vgic/vgic-mmio-v2.c | 20 ++++++++++++++++++-- virt/kvm/arm/vgic/vgic-v2.c | 8 ++++---- virt/kvm/arm/vgic/vgic.h | 9 ++++++++- 5 files changed, 39 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt index 76e61c883347..b2f60ca8b60c 100644 --- a/Documentation/virtual/kvm/devices/arm-vgic.txt +++ b/Documentation/virtual/kvm/devices/arm-vgic.txt @@ -83,6 +83,12 @@ Groups: Bits for undefined preemption levels are RAZ/WI. + For historical reasons and to provide ABI compatibility with userspace we + export the GICC_PMR register in the format of the GICH_VMCR.VMPriMask + field in the lower 5 bits of a word, meaning that userspace must always + use the lower 5 bits to communicate with the KVM device and must shift the + value left by 3 places to obtain the actual priority mask level. + Limitations: - Priorities are not implemented, and registers are RAZ/WI - Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2. diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index eafc965b3eb8..dc30f3d057eb 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -96,6 +96,9 @@ #define GICH_MISR_EOI (1 << 0) #define GICH_MISR_U (1 << 1) +#define GICV_PMR_PRIORITY_SHIFT 3 +#define GICV_PMR_PRIORITY_MASK (0x1f << GICV_PMR_PRIORITY_SHIFT) + #ifndef __ASSEMBLY__ #include diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c index a3ad7ff95c9b..0a4283ed9aa7 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c @@ -229,7 +229,15 @@ static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu, val = vmcr.ctlr; break; case GIC_CPU_PRIMASK: - val = vmcr.pmr; + /* + * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the + * the PMR field as GICH_VMCR.VMPriMask rather than + * GICC_PMR.Priority, so we expose the upper five bits of + * priority mask to userspace using the lower bits in the + * unsigned long. + */ + val = (vmcr.pmr & GICV_PMR_PRIORITY_MASK) >> + GICV_PMR_PRIORITY_SHIFT; break; case GIC_CPU_BINPOINT: val = vmcr.bpr; @@ -262,7 +270,15 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu, vmcr.ctlr = val; break; case GIC_CPU_PRIMASK: - vmcr.pmr = val; + /* + * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the + * the PMR field as GICH_VMCR.VMPriMask rather than + * GICC_PMR.Priority, so we expose the upper five bits of + * priority mask to userspace using the lower bits in the + * unsigned long. + */ + vmcr.pmr = (val << GICV_PMR_PRIORITY_SHIFT) & + GICV_PMR_PRIORITY_MASK; break; case GIC_CPU_BINPOINT: vmcr.bpr = val; diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index 94cf4b9b6471..b637d9c7afe3 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -206,8 +206,8 @@ void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) GICH_VMCR_ALIAS_BINPOINT_MASK; vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) & GICH_VMCR_BINPOINT_MASK; - vmcr |= (vmcrp->pmr << GICH_VMCR_PRIMASK_SHIFT) & - GICH_VMCR_PRIMASK_MASK; + vmcr |= ((vmcrp->pmr >> GICV_PMR_PRIORITY_SHIFT) << + GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK; vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = vmcr; } @@ -222,8 +222,8 @@ void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) GICH_VMCR_ALIAS_BINPOINT_SHIFT; vmcrp->bpr = (vmcr & GICH_VMCR_BINPOINT_MASK) >> GICH_VMCR_BINPOINT_SHIFT; - vmcrp->pmr = (vmcr & GICH_VMCR_PRIMASK_MASK) >> - GICH_VMCR_PRIMASK_SHIFT; + vmcrp->pmr = ((vmcr & GICH_VMCR_PRIMASK_MASK) >> + GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT; } void vgic_v2_enable(struct kvm_vcpu *vcpu) diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 91566f5aac9b..6cf557e9f718 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -81,11 +81,18 @@ static inline bool irq_is_pending(struct vgic_irq *irq) return irq->pending_latch || irq->line_level; } +/* + * This struct provides an intermediate representation of the fields contained + * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC + * state to userspace can generate either GICv2 or GICv3 CPU interface + * registers regardless of the hardware backed GIC used. + */ struct vgic_vmcr { u32 ctlr; u32 abpr; u32 bpr; - u32 pmr; + u32 pmr; /* Priority mask field in the GICC_PMR and + * ICC_PMR_EL1 priority field format */ /* Below member variable are valid only for GICv3 */ u32 grpen0; u32 grpen1; -- cgit v1.2.3 From b2376407f98920c9b0c411948675f58a9640be35 Mon Sep 17 00:00:00 2001 From: Vic Yang Date: Fri, 24 Mar 2017 18:44:01 +0100 Subject: mfd: cros-ec: Fix host command buffer size For SPI, we can get up to 32 additional bytes for response preamble. The current overhead (2 bytes) may cause problems when we try to receive a big response. Update it to 32 bytes. Without this fix we could see a kernel BUG when we receive a big response from the Chrome EC when is connected via SPI. Signed-off-by: Vic Yang Tested-by: Enric Balletbo i Serra Signed-off-by: Lee Jones --- include/linux/mfd/cros_ec.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h index 7a01c94496f1..3eef9fb9968a 100644 --- a/include/linux/mfd/cros_ec.h +++ b/include/linux/mfd/cros_ec.h @@ -35,10 +35,11 @@ * Max bus-specific overhead incurred by request/responses. * I2C requires 1 additional byte for requests. * I2C requires 2 additional bytes for responses. + * SPI requires up to 32 additional bytes for responses. * */ #define EC_PROTO_VERSION_UNKNOWN 0 #define EC_MAX_REQUEST_OVERHEAD 1 -#define EC_MAX_RESPONSE_OVERHEAD 2 +#define EC_MAX_RESPONSE_OVERHEAD 32 /* * Command interface between EC and AP, for LPC, I2C and SPI interfaces. -- cgit v1.2.3 From 3ebfdf082184d04f6e73b30cd9446613dc7f8c02 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 4 Apr 2017 13:39:55 +0800 Subject: sctp: get sock from transport in sctp_transport_update_pmtu This patch is almost to revert commit 02f3d4ce9e81 ("sctp: Adjust PMTU updates to accomodate route invalidation."). As t->asoc can't be NULL in sctp_transport_update_pmtu, it could get sk from asoc, and no need to pass sk into that function. It is also to remove some duplicated codes from that function. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 5 ++--- include/net/sctp/structs.h | 6 +++--- net/sctp/associola.c | 6 +++--- net/sctp/input.c | 4 ++-- net/sctp/output.c | 4 ++-- net/sctp/socket.c | 6 +++--- net/sctp/transport.c | 19 +++++++------------ 7 files changed, 22 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index d75caa7a629b..069582ee5d7f 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -448,10 +448,9 @@ static inline int sctp_frag_point(const struct sctp_association *asoc, int pmtu) return frag; } -static inline void sctp_assoc_pending_pmtu(struct sock *sk, struct sctp_association *asoc) +static inline void sctp_assoc_pending_pmtu(struct sctp_association *asoc) { - - sctp_assoc_sync_pmtu(sk, asoc); + sctp_assoc_sync_pmtu(asoc); asoc->pmtu_pending = 0; } diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index a127b7c2c3c9..138f8615acf0 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -952,8 +952,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *, sctp_lower_cwnd_t); void sctp_transport_burst_limited(struct sctp_transport *); void sctp_transport_burst_reset(struct sctp_transport *); unsigned long sctp_transport_timeout(struct sctp_transport *); -void sctp_transport_reset(struct sctp_transport *); -void sctp_transport_update_pmtu(struct sock *, struct sctp_transport *, u32); +void sctp_transport_reset(struct sctp_transport *t); +void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu); void sctp_transport_immediate_rtx(struct sctp_transport *); void sctp_transport_dst_release(struct sctp_transport *t); void sctp_transport_dst_confirm(struct sctp_transport *t); @@ -1954,7 +1954,7 @@ void sctp_assoc_update(struct sctp_association *old, __u32 sctp_association_get_next_tsn(struct sctp_association *); -void sctp_assoc_sync_pmtu(struct sock *, struct sctp_association *); +void sctp_assoc_sync_pmtu(struct sctp_association *asoc); void sctp_assoc_rwnd_increase(struct sctp_association *, unsigned int); void sctp_assoc_rwnd_decrease(struct sctp_association *, unsigned int); void sctp_assoc_set_primary(struct sctp_association *, diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 0b26df5f6188..a9708da28eb5 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1412,7 +1412,7 @@ sctp_assoc_choose_alter_transport(struct sctp_association *asoc, /* Update the association's pmtu and frag_point by going through all the * transports. This routine is called when a transport's PMTU has changed. */ -void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc) +void sctp_assoc_sync_pmtu(struct sctp_association *asoc) { struct sctp_transport *t; __u32 pmtu = 0; @@ -1424,8 +1424,8 @@ void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc) list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { if (t->pmtu_pending && t->dst) { - sctp_transport_update_pmtu(sk, t, - SCTP_TRUNC4(dst_mtu(t->dst))); + sctp_transport_update_pmtu( + t, SCTP_TRUNC4(dst_mtu(t->dst))); t->pmtu_pending = 0; } if (!pmtu || (t->pathmtu < pmtu)) diff --git a/net/sctp/input.c b/net/sctp/input.c index 2a28ab20487f..0e06a278d2a9 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -401,10 +401,10 @@ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, if (t->param_flags & SPP_PMTUD_ENABLE) { /* Update transports view of the MTU */ - sctp_transport_update_pmtu(sk, t, pmtu); + sctp_transport_update_pmtu(t, pmtu); /* Update association pmtu. */ - sctp_assoc_sync_pmtu(sk, asoc); + sctp_assoc_sync_pmtu(asoc); } /* Retransmit with the new pmtu setting. diff --git a/net/sctp/output.c b/net/sctp/output.c index ec4d50a38713..1409a875ad8e 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -105,10 +105,10 @@ void sctp_packet_config(struct sctp_packet *packet, __u32 vtag, if (!sctp_transport_dst_check(tp)) { sctp_transport_route(tp, NULL, sctp_sk(sk)); if (asoc->param_flags & SPP_PMTUD_ENABLE) - sctp_assoc_sync_pmtu(sk, asoc); + sctp_assoc_sync_pmtu(asoc); } else if (!sctp_transport_pmtu_check(tp)) { if (asoc->param_flags & SPP_PMTUD_ENABLE) - sctp_assoc_sync_pmtu(sk, asoc); + sctp_assoc_sync_pmtu(asoc); } /* If there a is a prepend chunk stick it on the list before diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 12fbae2c1002..c1401f43d40f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1907,7 +1907,7 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len) } if (asoc->pmtu_pending) - sctp_assoc_pending_pmtu(sk, asoc); + sctp_assoc_pending_pmtu(asoc); /* If fragmentation is disabled and the message length exceeds the * association fragmentation point, return EMSGSIZE. The I-D @@ -2435,7 +2435,7 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, if ((params->spp_flags & SPP_PMTUD_DISABLE) && params->spp_pathmtu) { if (trans) { trans->pathmtu = params->spp_pathmtu; - sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc); + sctp_assoc_sync_pmtu(asoc); } else if (asoc) { asoc->pathmtu = params->spp_pathmtu; } else { @@ -2451,7 +2451,7 @@ static int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, (trans->param_flags & ~SPP_PMTUD) | pmtud_change; if (update) { sctp_transport_pmtu(trans, sctp_opt2sk(sp)); - sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc); + sctp_assoc_sync_pmtu(asoc); } } else if (asoc) { asoc->param_flags = diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 3379668af368..721eeebfcd8a 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -251,14 +251,13 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } -void sctp_transport_update_pmtu(struct sock *sk, struct sctp_transport *t, u32 pmtu) +void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) { - struct dst_entry *dst; + struct dst_entry *dst = sctp_transport_dst_check(t); if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n", - __func__, pmtu, - SCTP_DEFAULT_MINSEGMENT); + __func__, pmtu, SCTP_DEFAULT_MINSEGMENT); /* Use default minimum segment size and disable * pmtu discovery on this transport. */ @@ -267,17 +266,13 @@ void sctp_transport_update_pmtu(struct sock *sk, struct sctp_transport *t, u32 p t->pathmtu = pmtu; } - dst = sctp_transport_dst_check(t); - if (!dst) - t->af_specific->get_dst(t, &t->saddr, &t->fl, sk); - if (dst) { - dst->ops->update_pmtu(dst, sk, NULL, pmtu); - + dst->ops->update_pmtu(dst, t->asoc->base.sk, NULL, pmtu); dst = sctp_transport_dst_check(t); - if (!dst) - t->af_specific->get_dst(t, &t->saddr, &t->fl, sk); } + + if (!dst) + t->af_specific->get_dst(t, &t->saddr, &t->fl, t->asoc->base.sk); } /* Caches the dst entry and source address for a transport's destination -- cgit v1.2.3 From 6118714275f0a313ecc296a87ed1af32d9691bed Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Thu, 30 Mar 2017 09:16:39 -0700 Subject: pinctrl: core: Fix pinctrl_register_and_init() with pinctrl_enable() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent pinctrl changes to allow dynamic allocation of pins exposed one more issue with the pinctrl pins claimed early by the controller itself. This caused a regression for IMX6 pinctrl hogs. Before enabling the pin controller driver we need to wait until it has been properly initialized, then claim the hogs, and only then enable it. To fix the regression, split the code into pinctrl_claim_hogs() and pinctrl_enable(). And then let's require that pinctrl_enable() is always called by the pin controller driver when ready after calling pinctrl_register_and_init(). Depends-on: 950b0d91dc10 ("pinctrl: core: Fix regression caused by delayed work for hogs") Fixes: df61b366af26 ("pinctrl: core: Use delayed work for hogs") Fixes: e566fc11ea76 ("pinctrl: imx: use generic pinctrl helpers for managing groups") Cc: Haojian Zhuang Cc: Masahiro Yamada Cc: Mika Penttilä Cc: Mika Westerberg Cc: Nishanth Menon Cc: Shawn Guo Cc: Stefan Agner Tested-by: Geert Uytterhoeven Tested-by: Gary Bisson Tested-by: Fabio Estevam Signed-off-by: Tony Lindgren Signed-off-by: Linus Walleij --- Documentation/pinctrl.txt | 8 ++- drivers/pinctrl/core.c | 97 +++++++++++++++++++++------------ drivers/pinctrl/freescale/pinctrl-imx.c | 2 +- drivers/pinctrl/pinctrl-single.c | 2 +- drivers/pinctrl/sh-pfc/pinctrl.c | 11 +++- drivers/pinctrl/ti/pinctrl-ti-iodelay.c | 2 + include/linux/pinctrl/pinctrl.h | 3 +- 7 files changed, 83 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/Documentation/pinctrl.txt b/Documentation/pinctrl.txt index 54bd5faa8782..f2af35f6d6b2 100644 --- a/Documentation/pinctrl.txt +++ b/Documentation/pinctrl.txt @@ -77,9 +77,15 @@ static struct pinctrl_desc foo_desc = { int __init foo_probe(void) { + int error; + struct pinctrl_dev *pctl; - return pinctrl_register_and_init(&foo_desc, , NULL, &pctl); + error = pinctrl_register_and_init(&foo_desc, , NULL, &pctl); + if (error) + return error; + + return pinctrl_enable(pctl); } To enable the pinctrl subsystem and the subgroups for PINMUX and PINCONF and diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c index d69046537b75..32822b0d9cd0 100644 --- a/drivers/pinctrl/core.c +++ b/drivers/pinctrl/core.c @@ -2010,29 +2010,57 @@ out_err: return ERR_PTR(ret); } -static int pinctrl_create_and_start(struct pinctrl_dev *pctldev) +static int pinctrl_claim_hogs(struct pinctrl_dev *pctldev) { pctldev->p = create_pinctrl(pctldev->dev, pctldev); - if (!IS_ERR(pctldev->p)) { - kref_get(&pctldev->p->users); - pctldev->hog_default = - pinctrl_lookup_state(pctldev->p, PINCTRL_STATE_DEFAULT); - if (IS_ERR(pctldev->hog_default)) { - dev_dbg(pctldev->dev, - "failed to lookup the default state\n"); - } else { - if (pinctrl_select_state(pctldev->p, - pctldev->hog_default)) - dev_err(pctldev->dev, - "failed to select default state\n"); - } + if (PTR_ERR(pctldev->p) == -ENODEV) { + dev_dbg(pctldev->dev, "no hogs found\n"); - pctldev->hog_sleep = - pinctrl_lookup_state(pctldev->p, - PINCTRL_STATE_SLEEP); - if (IS_ERR(pctldev->hog_sleep)) - dev_dbg(pctldev->dev, - "failed to lookup the sleep state\n"); + return 0; + } + + if (IS_ERR(pctldev->p)) { + dev_err(pctldev->dev, "error claiming hogs: %li\n", + PTR_ERR(pctldev->p)); + + return PTR_ERR(pctldev->p); + } + + kref_get(&pctldev->p->users); + pctldev->hog_default = + pinctrl_lookup_state(pctldev->p, PINCTRL_STATE_DEFAULT); + if (IS_ERR(pctldev->hog_default)) { + dev_dbg(pctldev->dev, + "failed to lookup the default state\n"); + } else { + if (pinctrl_select_state(pctldev->p, + pctldev->hog_default)) + dev_err(pctldev->dev, + "failed to select default state\n"); + } + + pctldev->hog_sleep = + pinctrl_lookup_state(pctldev->p, + PINCTRL_STATE_SLEEP); + if (IS_ERR(pctldev->hog_sleep)) + dev_dbg(pctldev->dev, + "failed to lookup the sleep state\n"); + + return 0; +} + +int pinctrl_enable(struct pinctrl_dev *pctldev) +{ + int error; + + error = pinctrl_claim_hogs(pctldev); + if (error) { + dev_err(pctldev->dev, "could not claim hogs: %i\n", + error); + mutex_destroy(&pctldev->mutex); + kfree(pctldev); + + return error; } mutex_lock(&pinctrldev_list_mutex); @@ -2043,6 +2071,7 @@ static int pinctrl_create_and_start(struct pinctrl_dev *pctldev) return 0; } +EXPORT_SYMBOL_GPL(pinctrl_enable); /** * pinctrl_register() - register a pin controller device @@ -2065,25 +2094,30 @@ struct pinctrl_dev *pinctrl_register(struct pinctrl_desc *pctldesc, if (IS_ERR(pctldev)) return pctldev; - error = pinctrl_create_and_start(pctldev); - if (error) { - mutex_destroy(&pctldev->mutex); - kfree(pctldev); - + error = pinctrl_enable(pctldev); + if (error) return ERR_PTR(error); - } return pctldev; } EXPORT_SYMBOL_GPL(pinctrl_register); +/** + * pinctrl_register_and_init() - register and init pin controller device + * @pctldesc: descriptor for this pin controller + * @dev: parent device for this pin controller + * @driver_data: private pin controller data for this pin controller + * @pctldev: pin controller device + * + * Note that pinctrl_enable() still needs to be manually called after + * this once the driver is ready. + */ int pinctrl_register_and_init(struct pinctrl_desc *pctldesc, struct device *dev, void *driver_data, struct pinctrl_dev **pctldev) { struct pinctrl_dev *p; - int error; p = pinctrl_init_controller(pctldesc, dev, driver_data); if (IS_ERR(p)) @@ -2097,15 +2131,6 @@ int pinctrl_register_and_init(struct pinctrl_desc *pctldesc, */ *pctldev = p; - error = pinctrl_create_and_start(p); - if (error) { - mutex_destroy(&p->mutex); - kfree(p); - *pctldev = NULL; - - return error; - } - return 0; } EXPORT_SYMBOL_GPL(pinctrl_register_and_init); diff --git a/drivers/pinctrl/freescale/pinctrl-imx.c b/drivers/pinctrl/freescale/pinctrl-imx.c index a7ace9e1ad81..74bd90dfd7b1 100644 --- a/drivers/pinctrl/freescale/pinctrl-imx.c +++ b/drivers/pinctrl/freescale/pinctrl-imx.c @@ -790,7 +790,7 @@ int imx_pinctrl_probe(struct platform_device *pdev, dev_info(&pdev->dev, "initialized IMX pinctrl driver\n"); - return 0; + return pinctrl_enable(ipctl->pctl); free: imx_free_resources(ipctl); diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c index 8b2d45e85bae..9c267dcda094 100644 --- a/drivers/pinctrl/pinctrl-single.c +++ b/drivers/pinctrl/pinctrl-single.c @@ -1781,7 +1781,7 @@ static int pcs_probe(struct platform_device *pdev) dev_info(pcs->dev, "%i pins at pa %p size %u\n", pcs->desc.npins, pcs->base, pcs->size); - return 0; + return pinctrl_enable(pcs->pctl); free: pcs_free_resources(pcs); diff --git a/drivers/pinctrl/sh-pfc/pinctrl.c b/drivers/pinctrl/sh-pfc/pinctrl.c index 08150a321be6..a70157f0acf4 100644 --- a/drivers/pinctrl/sh-pfc/pinctrl.c +++ b/drivers/pinctrl/sh-pfc/pinctrl.c @@ -816,6 +816,13 @@ int sh_pfc_register_pinctrl(struct sh_pfc *pfc) pmx->pctl_desc.pins = pmx->pins; pmx->pctl_desc.npins = pfc->info->nr_pins; - return devm_pinctrl_register_and_init(pfc->dev, &pmx->pctl_desc, pmx, - &pmx->pctl); + ret = devm_pinctrl_register_and_init(pfc->dev, &pmx->pctl_desc, pmx, + &pmx->pctl); + if (ret) { + dev_err(pfc->dev, "could not register: %i\n", ret); + + return ret; + } + + return pinctrl_enable(pmx->pctl); } diff --git a/drivers/pinctrl/ti/pinctrl-ti-iodelay.c b/drivers/pinctrl/ti/pinctrl-ti-iodelay.c index 717e3404900c..362c50918c13 100644 --- a/drivers/pinctrl/ti/pinctrl-ti-iodelay.c +++ b/drivers/pinctrl/ti/pinctrl-ti-iodelay.c @@ -893,6 +893,8 @@ static int ti_iodelay_probe(struct platform_device *pdev) platform_set_drvdata(pdev, iod); + return pinctrl_enable(iod->pctl); + exit_out: of_node_put(np); return ret; diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index 8ce2d87a238b..5e45385c5bdc 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -145,8 +145,9 @@ struct pinctrl_desc { extern int pinctrl_register_and_init(struct pinctrl_desc *pctldesc, struct device *dev, void *driver_data, struct pinctrl_dev **pctldev); +extern int pinctrl_enable(struct pinctrl_dev *pctldev); -/* Please use pinctrl_register_and_init() instead */ +/* Please use pinctrl_register_and_init() and pinctrl_enable() instead */ extern struct pinctrl_dev *pinctrl_register(struct pinctrl_desc *pctldesc, struct device *dev, void *driver_data); -- cgit v1.2.3 From 54d5329d425650fafaf90660a139c771d2d49cae Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 7 Apr 2017 08:52:27 -0600 Subject: blk-mq-sched: fix crash in switch error path In elevator_switch(), if blk_mq_init_sched() fails, we attempt to fall back to the original scheduler. However, at this point, we've already torn down the original scheduler's tags, so this causes a crash. Doing the fallback like the legacy elevator path is much harder for mq, so fix it by just falling back to none, instead. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 13 +++++-- block/blk-mq-sched.h | 2 +- block/blk-mq.c | 2 -- block/blk-sysfs.c | 2 +- block/elevator.c | 94 +++++++++++++++++++++++++++--------------------- include/linux/elevator.h | 2 +- 6 files changed, 67 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 0bb13bb51daa..e8c2ed654ef0 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -451,7 +451,7 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q, return ret; } -void blk_mq_sched_teardown(struct request_queue *q) +static void blk_mq_sched_tags_teardown(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_hw_ctx *hctx; @@ -513,10 +513,19 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) return 0; err: - blk_mq_sched_teardown(q); + blk_mq_sched_tags_teardown(q); + q->elevator = NULL; return ret; } +void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) +{ + if (e->type->ops.mq.exit_sched) + e->type->ops.mq.exit_sched(e); + blk_mq_sched_tags_teardown(q); + q->elevator = NULL; +} + int blk_mq_sched_init(struct request_queue *q) { int ret; diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 19db25e0c95a..e704956e0862 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -33,7 +33,7 @@ void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx, struct request *(*get_rq)(struct blk_mq_hw_ctx *)); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); -void blk_mq_sched_teardown(struct request_queue *q); +void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx); diff --git a/block/blk-mq.c b/block/blk-mq.c index 72e744cd638c..cfb7c97b14ec 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2240,8 +2240,6 @@ void blk_mq_release(struct request_queue *q) struct blk_mq_hw_ctx *hctx; unsigned int i; - blk_mq_sched_teardown(q); - /* hctx kobj stays in hctx */ queue_for_each_hw_ctx(q, hctx, i) { if (!hctx) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index c44b321335f3..37f0b3ad635e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -816,7 +816,7 @@ static void blk_release_queue(struct kobject *kobj) if (q->elevator) { ioc_clear_queue(q); - elevator_exit(q->elevator); + elevator_exit(q, q->elevator); } blk_exit_rl(&q->root_rl); diff --git a/block/elevator.c b/block/elevator.c index f236ef1d2be9..dbeecf7be719 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -252,11 +252,11 @@ int elevator_init(struct request_queue *q, char *name) } EXPORT_SYMBOL(elevator_init); -void elevator_exit(struct elevator_queue *e) +void elevator_exit(struct request_queue *q, struct elevator_queue *e) { mutex_lock(&e->sysfs_lock); if (e->uses_mq && e->type->ops.mq.exit_sched) - e->type->ops.mq.exit_sched(e); + blk_mq_exit_sched(q, e); else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn) e->type->ops.sq.elevator_exit_fn(e); mutex_unlock(&e->sysfs_lock); @@ -941,6 +941,45 @@ void elv_unregister(struct elevator_type *e) } EXPORT_SYMBOL_GPL(elv_unregister); +static int elevator_switch_mq(struct request_queue *q, + struct elevator_type *new_e) +{ + int ret; + + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + + if (q->elevator) { + if (q->elevator->registered) + elv_unregister_queue(q); + ioc_clear_queue(q); + elevator_exit(q, q->elevator); + } + + ret = blk_mq_init_sched(q, new_e); + if (ret) + goto out; + + if (new_e) { + ret = elv_register_queue(q); + if (ret) { + elevator_exit(q, q->elevator); + goto out; + } + } + + if (new_e) + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); + else + blk_add_trace_msg(q, "elv switch: none"); + +out: + blk_mq_unfreeze_queue(q); + blk_mq_start_stopped_hw_queues(q, true); + return ret; + +} + /* * switch to new_e io scheduler. be careful not to introduce deadlocks - * we don't free the old io scheduler, before we have allocated what we @@ -953,10 +992,8 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) bool old_registered = false; int err; - if (q->mq_ops) { - blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); - } + if (q->mq_ops) + return elevator_switch_mq(q, new_e); /* * Turn on BYPASS and drain all requests w/ elevator private data. @@ -968,11 +1005,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) if (old) { old_registered = old->registered; - if (old->uses_mq) - blk_mq_sched_teardown(q); - - if (!q->mq_ops) - blk_queue_bypass_start(q); + blk_queue_bypass_start(q); /* unregister and clear all auxiliary data of the old elevator */ if (old_registered) @@ -982,53 +1015,32 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) } /* allocate, init and register new elevator */ - if (q->mq_ops) - err = blk_mq_init_sched(q, new_e); - else - err = new_e->ops.sq.elevator_init_fn(q, new_e); + err = new_e->ops.sq.elevator_init_fn(q, new_e); if (err) goto fail_init; - if (new_e) { - err = elv_register_queue(q); - if (err) - goto fail_register; - } + err = elv_register_queue(q); + if (err) + goto fail_register; /* done, kill the old one and finish */ if (old) { - elevator_exit(old); - if (!q->mq_ops) - blk_queue_bypass_end(q); + elevator_exit(q, old); + blk_queue_bypass_end(q); } - if (q->mq_ops) { - blk_mq_unfreeze_queue(q); - blk_mq_start_stopped_hw_queues(q, true); - } - - if (new_e) - blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); - else - blk_add_trace_msg(q, "elv switch: none"); + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); return 0; fail_register: - if (q->mq_ops) - blk_mq_sched_teardown(q); - elevator_exit(q->elevator); + elevator_exit(q, q->elevator); fail_init: /* switch failed, restore and re-register old elevator */ if (old) { q->elevator = old; elv_register_queue(q); - if (!q->mq_ops) - blk_queue_bypass_end(q); - } - if (q->mq_ops) { - blk_mq_unfreeze_queue(q); - blk_mq_start_stopped_hw_queues(q, true); + blk_queue_bypass_end(q); } return err; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index aebecc4ed088..22d39e8d4de1 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -211,7 +211,7 @@ extern ssize_t elv_iosched_show(struct request_queue *, char *); extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t); extern int elevator_init(struct request_queue *, char *); -extern void elevator_exit(struct elevator_queue *); +extern void elevator_exit(struct request_queue *, struct elevator_queue *); extern int elevator_change(struct request_queue *, const char *); extern bool elv_bio_merge_ok(struct request *, struct bio *); extern struct elevator_queue *elevator_alloc(struct request_queue *, -- cgit v1.2.3 From 7587a5ae7eef0439f7be31f1b5959af062bbc5ec Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 7 Apr 2017 11:16:52 -0700 Subject: blk-mq: Introduce blk_mq_delay_run_hw_queue() Introduce a function that runs a hardware queue unconditionally after a delay. Note: there is already a function that stops and restarts a hardware queue after a delay, namely blk_mq_delay_queue(). This function will be used in the next patch in this series. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Long Li Cc: K. Y. Srinivasan Signed-off-by: Jens Axboe --- block/blk-mq.c | 32 ++++++++++++++++++++++++++++++-- include/linux/blk-mq.h | 2 ++ 2 files changed, 32 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/block/blk-mq.c b/block/blk-mq.c index f1be5c3eb600..ad45ae743186 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1135,7 +1135,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) return hctx->next_cpu; } -void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, + unsigned long msecs) { if (unlikely(blk_mq_hctx_stopped(hctx) || !blk_mq_hw_queue_mapped(hctx))) @@ -1152,7 +1153,24 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) put_cpu(); } - kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work); + if (msecs == 0) + kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), + &hctx->run_work); + else + kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), + &hctx->delayed_run_work, + msecs_to_jiffies(msecs)); +} + +void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) +{ + __blk_mq_delay_run_hw_queue(hctx, true, msecs); +} +EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); + +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +{ + __blk_mq_delay_run_hw_queue(hctx, async, 0); } void blk_mq_run_hw_queues(struct request_queue *q, bool async) @@ -1255,6 +1273,15 @@ static void blk_mq_run_work_fn(struct work_struct *work) __blk_mq_run_hw_queue(hctx); } +static void blk_mq_delayed_run_work_fn(struct work_struct *work) +{ + struct blk_mq_hw_ctx *hctx; + + hctx = container_of(work, struct blk_mq_hw_ctx, delayed_run_work.work); + + __blk_mq_run_hw_queue(hctx); +} + static void blk_mq_delay_work_fn(struct work_struct *work) { struct blk_mq_hw_ctx *hctx; @@ -1962,6 +1989,7 @@ static int blk_mq_init_hctx(struct request_queue *q, node = hctx->numa_node = set->numa_node; INIT_WORK(&hctx->run_work, blk_mq_run_work_fn); + INIT_DELAYED_WORK(&hctx->delayed_run_work, blk_mq_delayed_run_work_fn); INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b296a9006117..9382c5da7a2e 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -51,6 +51,7 @@ struct blk_mq_hw_ctx { atomic_t nr_active; + struct delayed_work delayed_run_work; struct delayed_work delay_work; struct hlist_node cpuhp_dead; @@ -238,6 +239,7 @@ void blk_mq_stop_hw_queues(struct request_queue *q); void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); +void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, -- cgit v1.2.3 From 6d8c6c0f97ad8a3517c42b179c1dc8e77397d0e2 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 7 Apr 2017 12:40:09 -0600 Subject: blk-mq: Restart a single queue if tag sets are shared To improve scalability, if hardware queues are shared, restart a single hardware queue in round-robin fashion. Rename blk_mq_sched_restart_queues() to reflect the new semantics. Remove blk_mq_sched_mark_restart_queue() because this function has no callers. Remove flag QUEUE_FLAG_RESTART because this patch removes the code that uses this flag. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 63 ++++++++++++++++++++++++++++++++++++++++++-------- block/blk-mq-sched.h | 16 +------------ block/blk-mq.c | 2 +- include/linux/blkdev.h | 1 - 4 files changed, 55 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index e8c2ed654ef0..c974a1bbf4cb 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -318,25 +318,68 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, return true; } -static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) { if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); - if (blk_mq_hctx_has_pending(hctx)) + if (blk_mq_hctx_has_pending(hctx)) { blk_mq_run_hw_queue(hctx, true); + return true; + } } + return false; } -void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx) -{ - struct request_queue *q = hctx->queue; - unsigned int i; +/** + * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list + * @pos: loop cursor. + * @skip: the list element that will not be examined. Iteration starts at + * @skip->next. + * @head: head of the list to examine. This list must have at least one + * element, namely @skip. + * @member: name of the list_head structure within typeof(*pos). + */ +#define list_for_each_entry_rcu_rr(pos, skip, head, member) \ + for ((pos) = (skip); \ + (pos = (pos)->member.next != (head) ? list_entry_rcu( \ + (pos)->member.next, typeof(*pos), member) : \ + list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \ + (pos) != (skip); ) - if (test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) { - if (test_and_clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) { - queue_for_each_hw_ctx(q, hctx, i) - blk_mq_sched_restart_hctx(hctx); +/* + * Called after a driver tag has been freed to check whether a hctx needs to + * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware + * queues in a round-robin fashion if the tag set of @hctx is shared with other + * hardware queues. + */ +void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) +{ + struct blk_mq_tags *const tags = hctx->tags; + struct blk_mq_tag_set *const set = hctx->queue->tag_set; + struct request_queue *const queue = hctx->queue, *q; + struct blk_mq_hw_ctx *hctx2; + unsigned int i, j; + + if (set->flags & BLK_MQ_F_TAG_SHARED) { + rcu_read_lock(); + list_for_each_entry_rcu_rr(q, queue, &set->tag_list, + tag_set_list) { + queue_for_each_hw_ctx(q, hctx2, i) + if (hctx2->tags == tags && + blk_mq_sched_restart_hctx(hctx2)) + goto done; + } + j = hctx->queue_num + 1; + for (i = 0; i < queue->nr_hw_queues; i++, j++) { + if (j == queue->nr_hw_queues) + j = 0; + hctx2 = queue->queue_hw_ctx[j]; + if (hctx2->tags == tags && + blk_mq_sched_restart_hctx(hctx2)) + break; } +done: + rcu_read_unlock(); } else { blk_mq_sched_restart_hctx(hctx); } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index e704956e0862..3a9e6e40558b 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -19,7 +19,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, struct request **merged_request); bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); -void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx); +void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async, bool can_block); @@ -136,20 +136,6 @@ static inline void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } -/* - * Mark a hardware queue and the request queue it belongs to as needing a - * restart. - */ -static inline void blk_mq_sched_mark_restart_queue(struct blk_mq_hw_ctx *hctx) -{ - struct request_queue *q = hctx->queue; - - if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); - if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) - set_bit(QUEUE_FLAG_RESTART, &q->queue_flags); -} - static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) { return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); diff --git a/block/blk-mq.c b/block/blk-mq.c index ad45ae743186..572966f49596 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -348,7 +348,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); if (sched_tag != -1) blk_mq_sched_completed_request(hctx, rq); - blk_mq_sched_restart_queues(hctx); + blk_mq_sched_restart(hctx); blk_queue_exit(q); } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5a7da607ca04..7548f332121a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -610,7 +610,6 @@ struct request_queue { #define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ #define QUEUE_FLAG_DAX 26 /* device supports DAX */ #define QUEUE_FLAG_STATS 27 /* track rq completion times */ -#define QUEUE_FLAG_RESTART 28 /* queue needs restart at completion */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- cgit v1.2.3 From d79bf21e0e02f8ad24219f0587cc599a7e67f6c6 Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Fri, 7 Apr 2017 16:04:48 -0700 Subject: vmlinux.lds: add missing VMLINUX_SYMBOL macros When __{start,end}_ro_after_init is referenced from C code, we run into the following build errors on blackfin: kernel/extable.c:169: undefined reference to `__start_ro_after_init' kernel/extable.c:169: undefined reference to `__end_ro_after_init' The build error is due to the fact that blackfin is one of the few arches that prepends an underscore '_' to all symbols defined in C. Fix this by wrapping __{start,end}_ro_after_init in vmlinux.lds.h with VMLINUX_SYMBOL(), which adds the necessary prefix for arches that have HAVE_UNDERSCORE_SYMBOL_PREFIX. Link: http://lkml.kernel.org/r/1491259387-15869-1-git-send-email-jeyu@redhat.com Signed-off-by: Jessica Yu Acked-by: Kees Cook Cc: Arnd Bergmann Cc: Eddie Kovsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/vmlinux.lds.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 7cdfe167074f..143db9c523e2 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -261,9 +261,9 @@ */ #ifndef RO_AFTER_INIT_DATA #define RO_AFTER_INIT_DATA \ - __start_ro_after_init = .; \ + VMLINUX_SYMBOL(__start_ro_after_init) = .; \ *(.data..ro_after_init) \ - __end_ro_after_init = .; + VMLINUX_SYMBOL(__end_ro_after_init) = .; #endif /* -- cgit v1.2.3