From 4ad69b80e886a845f56ce0a3d10211208693d92b Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Mon, 9 Apr 2018 15:59:20 +0200 Subject: clk: honor CLK_MUX_ROUND_CLOSEST in generic clk mux CLK_MUX_ROUND_CLOSEST is part of the clk_mux documentation but clk_mux directly calls __clk_mux_determine_rate(), which overrides the flag. As a result, if clk_mux is instantiated with CLK_MUX_ROUND_CLOSEST, the flag will be ignored and the clock rounded down. To solve this, this patch exposes clk_mux_determine_rate_flags() in the clk-provider API and uses it in the determine_rate() callback of clk_mux. Fixes: 15a02c1f6dd7 ("clk: Add __clk_mux_determine_rate_closest") Signed-off-by: Jerome Brunet Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 210a890008f9..1d25e149c1c5 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -765,6 +765,9 @@ int __clk_mux_determine_rate(struct clk_hw *hw, int __clk_determine_rate(struct clk_hw *core, struct clk_rate_request *req); int __clk_mux_determine_rate_closest(struct clk_hw *hw, struct clk_rate_request *req); +int clk_mux_determine_rate_flags(struct clk_hw *hw, + struct clk_rate_request *req, + unsigned long flags); void clk_hw_reparent(struct clk_hw *hw, struct clk_hw *new_parent); void clk_hw_set_rate_range(struct clk_hw *hw, unsigned long min_rate, unsigned long max_rate); -- cgit v1.2.3 From 6899b32b5b2dee358936b82b8363b716607a138f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 23 Apr 2018 18:09:21 +0100 Subject: bpf: disable and restore preemption in __BPF_PROG_RUN_ARRAY Running bpf programs requires disabled preemption, however at least some* of the BPF_PROG_RUN_ARRAY users do not follow this rule. To fix this bug, and also to make it not happen in the future, let's add explicit preemption disabling/re-enabling to the __BPF_PROG_RUN_ARRAY code. 
* for example: [ 17.624472] RIP: 0010:__cgroup_bpf_run_filter_sk+0x1c4/0x1d0 ... [ 17.640890] inet6_create+0x3eb/0x520 [ 17.641405] __sock_create+0x242/0x340 [ 17.641939] __sys_socket+0x57/0xe0 [ 17.642370] ? trace_hardirqs_off_thunk+0x1a/0x1c [ 17.642944] SyS_socket+0xa/0x10 [ 17.643357] do_syscall_64+0x79/0x220 [ 17.643879] entry_SYSCALL_64_after_hwframe+0x42/0xb7 Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 486e65e3db26..dc586cc64bc2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -351,6 +351,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, struct bpf_prog **_prog, *__prog; \ struct bpf_prog_array *_array; \ u32 _ret = 1; \ + preempt_disable(); \ rcu_read_lock(); \ _array = rcu_dereference(array); \ if (unlikely(check_non_null && !_array))\ @@ -362,6 +363,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, } \ _out: \ rcu_read_unlock(); \ + preempt_enable_no_resched(); \ _ret; \ }) -- cgit v1.2.3 From ba6b8de423f8d0dee48d6030288ed81c03ddf9f0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 23 Apr 2018 15:39:23 -0700 Subject: bpf: sockmap, map_release does not hold refcnt for pinned maps Relying on map_release hook to decrement the reference counts when a map is removed only works if the map is not being pinned. In the pinned case the ref is decremented immediately and the BPF programs released. After this BPF programs may not be in-use which is not what the user would expect. This patch moves the release logic into bpf_map_put_uref() and brings sockmap in-line with how a similar case is handled in prog array maps. 
Fixes: 3d9e952697de ("bpf: sockmap, fix leaking maps with attached but not detached progs") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 +- kernel/bpf/arraymap.c | 3 ++- kernel/bpf/sockmap.c | 4 ++-- kernel/bpf/syscall.c | 4 ++-- 4 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dc586cc64bc2..469b20e1dd7e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -31,6 +31,7 @@ struct bpf_map_ops { void (*map_release)(struct bpf_map *map, struct file *map_file); void (*map_free)(struct bpf_map *map); int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key); + void (*map_release_uref)(struct bpf_map *map); /* funcs callable from userspace and from eBPF programs */ void *(*map_lookup_elem)(struct bpf_map *map, void *key); @@ -436,7 +437,6 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); -void bpf_fd_array_map_clear(struct bpf_map *map); int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 14750e7c5ee4..027107f4be53 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -476,7 +476,7 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr) } /* decrement refcnt of all bpf_progs that are stored in this map */ -void bpf_fd_array_map_clear(struct bpf_map *map) +static void bpf_fd_array_map_clear(struct bpf_map *map) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; @@ -495,6 +495,7 @@ const struct bpf_map_ops prog_array_map_ops = { .map_fd_get_ptr = prog_fd_array_get_ptr, 
.map_fd_put_ptr = prog_fd_array_put_ptr, .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, + .map_release_uref = bpf_fd_array_map_clear, }; static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index a3b21385e947..a73d484b6e4c 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1831,7 +1831,7 @@ static int sock_map_update_elem(struct bpf_map *map, return err; } -static void sock_map_release(struct bpf_map *map, struct file *map_file) +static void sock_map_release(struct bpf_map *map) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); struct bpf_prog *orig; @@ -1855,7 +1855,7 @@ const struct bpf_map_ops sock_map_ops = { .map_get_next_key = sock_map_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, - .map_release = sock_map_release, + .map_release_uref = sock_map_release, }; BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4ca46df19c9a..ebfe9f29dae8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -257,8 +257,8 @@ static void bpf_map_free_deferred(struct work_struct *work) static void bpf_map_put_uref(struct bpf_map *map) { if (atomic_dec_and_test(&map->usercnt)) { - if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) - bpf_fd_array_map_clear(map); + if (map->ops->map_release_uref) + map->ops->map_release_uref(map); } } -- cgit v1.2.3 From fcd58037f28bf70eb17157a51fbf94d466634a7d Mon Sep 17 00:00:00 2001 From: Arnaud Pouliquen Date: Tue, 10 Apr 2018 10:57:25 +0200 Subject: remoteproc: fix crashed parameter logic on stop call Fix rproc_add_subdev parameter name and invert the crashed logic. 
Fixes: 880f5b388252 ("remoteproc: Pass type of shutdown to subdev remove") Reviewed-by: Alex Elder Signed-off-by: Arnaud Pouliquen Signed-off-by: Bjorn Andersson --- drivers/remoteproc/remoteproc_core.c | 4 ++-- include/linux/remoteproc.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 6d9c5832ce47..a9609d971f7f 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -1163,7 +1163,7 @@ int rproc_trigger_recovery(struct rproc *rproc) if (ret) return ret; - ret = rproc_stop(rproc, false); + ret = rproc_stop(rproc, true); if (ret) goto unlock_mutex; @@ -1316,7 +1316,7 @@ void rproc_shutdown(struct rproc *rproc) if (!atomic_dec_and_test(&rproc->power)) goto out; - ret = rproc_stop(rproc, true); + ret = rproc_stop(rproc, false); if (ret) { atomic_inc(&rproc->power); goto out; diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index d09a9c7af109..dfdaede9139e 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -569,7 +569,7 @@ static inline struct rproc *vdev_to_rproc(struct virtio_device *vdev) void rproc_add_subdev(struct rproc *rproc, struct rproc_subdev *subdev, int (*probe)(struct rproc_subdev *subdev), - void (*remove)(struct rproc_subdev *subdev, bool graceful)); + void (*remove)(struct rproc_subdev *subdev, bool crashed)); void rproc_remove_subdev(struct rproc *rproc, struct rproc_subdev *subdev); -- cgit v1.2.3 From bf0ddaba65ddbb2715af97041da8e7a45b2d8628 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 26 Apr 2018 00:21:59 -0700 Subject: blk-mq: fix sysfs inflight counter When the blk-mq inflight implementation was added, /proc/diskstats was converted to use it, but /sys/block/$dev/inflight was not. Fix it by adding another helper to count in-flight requests by data direction. 
Fixes: f299b7c7a9de ("blk-mq: provide internal in-flight variant") Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- block/blk-mq.c | 19 +++++++++++++++++++ block/blk-mq.h | 4 +++- block/genhd.c | 12 ++++++++++++ block/partition-generic.c | 10 ++++++---- include/linux/genhd.h | 4 +++- 5 files changed, 43 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/block/blk-mq.c b/block/blk-mq.c index 5450cbc61f8d..9ce9cac16c3f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -115,6 +115,25 @@ void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); } +static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, + bool reserved) +{ + struct mq_inflight *mi = priv; + + if (rq->part == mi->part) + mi->inflight[rq_data_dir(rq)]++; +} + +void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]) +{ + struct mq_inflight mi = { .part = part, .inflight = inflight, }; + + inflight[0] = inflight[1] = 0; + blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi); +} + void blk_freeze_queue_start(struct request_queue *q) { int freeze_depth; diff --git a/block/blk-mq.h b/block/blk-mq.h index 89b5cd3a6c70..e1bb420dc5d6 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -188,7 +188,9 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) } void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]); + unsigned int inflight[2]); +void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]); static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) { diff --git a/block/genhd.c b/block/genhd.c index dc7e089373b9..c4513fe1adda 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -82,6 +82,18 @@ void part_in_flight(struct request_queue *q, struct hd_struct *part, } } +void 
part_in_flight_rw(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]) +{ + if (q->mq_ops) { + blk_mq_in_flight_rw(q, part, inflight); + return; + } + + inflight[0] = atomic_read(&part->in_flight[0]); + inflight[1] = atomic_read(&part->in_flight[1]); +} + struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) { struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl); diff --git a/block/partition-generic.c b/block/partition-generic.c index 08dabcd8b6ae..db57cced9b98 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -145,13 +145,15 @@ ssize_t part_stat_show(struct device *dev, jiffies_to_msecs(part_stat_read(p, time_in_queue))); } -ssize_t part_inflight_show(struct device *dev, - struct device_attribute *attr, char *buf) +ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, + char *buf) { struct hd_struct *p = dev_to_part(dev); + struct request_queue *q = part_to_disk(p)->queue; + unsigned int inflight[2]; - return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]), - atomic_read(&p->in_flight[1])); + part_in_flight_rw(q, p, inflight); + return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); } #ifdef CONFIG_FAIL_MAKE_REQUEST diff --git a/include/linux/genhd.h b/include/linux/genhd.h index c826b0b5232a..6cb8a5789668 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -368,7 +368,9 @@ static inline void free_part_stats(struct hd_struct *part) part_stat_add(cpu, gendiskp, field, -subnd) void part_in_flight(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]); + unsigned int inflight[2]); +void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]); void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw); void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, -- cgit v1.2.3 From 6082d9c9c94a408d7409b5f2e4e42ac9e8b16d0d Mon Sep 17 00:00:00 2001 From: 
Israel Rukshin Date: Thu, 12 Apr 2018 09:49:11 +0000 Subject: net/mlx5: Fix mlx5_get_vector_affinity function Adding the vector offset when calling to mlx5_vector2eqn() is wrong. This is because mlx5_vector2eqn() checks if EQ index is equal to vector number and the fact that the internal completion vectors that mlx5 allocates don't get an EQ index. The second problem here is that using effective_affinity_mask gives the same CPU for different vectors. This leads to unmapped queues when calling it from blk_mq_rdma_map_queues(). This doesn't happen when using affinity_hint mask. Fixes: 2572cf57d75a ("mlx5: fix mlx5_get_vector_affinity to start from completion vector 0") Fixes: 05e0cc84e00c ("net/mlx5: Fix get vector affinity helper function") Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg --- drivers/infiniband/hw/mlx5/main.c | 2 +- include/linux/mlx5/driver.h | 12 +++--------- 2 files changed, 4 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index daa919e5a442..241cf4ff9901 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4757,7 +4757,7 @@ mlx5_ib_get_vector_affinity(struct ib_device *ibdev, int comp_vector) { struct mlx5_ib_dev *dev = to_mdev(ibdev); - return mlx5_get_vector_affinity(dev->mdev, comp_vector); + return mlx5_get_vector_affinity_hint(dev->mdev, comp_vector); } /* The mlx5_ib_multiport_mutex should be held when calling this function */ diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 767d193c269a..2a156c5dfadd 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1284,25 +1284,19 @@ enum { }; static inline const struct cpumask * -mlx5_get_vector_affinity(struct mlx5_core_dev *dev, int vector) +mlx5_get_vector_affinity_hint(struct mlx5_core_dev *dev, int vector) { - const struct cpumask *mask; struct irq_desc *desc; unsigned int 
irq; int eqn; int err; - err = mlx5_vector2eqn(dev, MLX5_EQ_VEC_COMP_BASE + vector, &eqn, &irq); + err = mlx5_vector2eqn(dev, vector, &eqn, &irq); if (err) return NULL; desc = irq_to_desc(irq); -#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK - mask = irq_data_get_effective_affinity_mask(&desc->irq_data); -#else - mask = desc->irq_common_data.affinity; -#endif - return mask; + return desc->affinity_hint; } #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From ed769520727edbf526e0f510e6c501fc6ba3824a Mon Sep 17 00:00:00 2001 From: Joel Pepper Date: Thu, 26 Apr 2018 20:26:08 +0200 Subject: usb: gadget: composite Allow for larger configuration descriptors The composite framework allows us to create gadgets composed from many different functions, which need to fit into a single configuration descriptor. Some functions (like uvc) can produce configuration descriptors upwards of 2500 bytes on their own. This patch increases the limit from 1024 bytes to 4096. Signed-off-by: Joel Pepper Signed-off-by: Felipe Balbi --- include/linux/usb/composite.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index 4b6b9283fa7b..8675e145ea8b 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -52,7 +52,7 @@ #define USB_GADGET_DELAYED_STATUS 0x7fff /* Impossibly large value */ /* big enough to hold our biggest descriptor */ -#define USB_COMP_EP0_BUFSIZ 1024 +#define USB_COMP_EP0_BUFSIZ 4096 /* OS feature descriptor length <= 4kB */ #define USB_COMP_EP0_OS_DESC_BUFSIZ 4096 -- cgit v1.2.3 From 8236b0ae31c837d2b3a2565c5f8d77f637e824cc Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 2 May 2018 07:07:55 +0900 Subject: bdi: wake up concurrent wb_shutdown() callers. syzbot is reporting hung tasks at wait_on_bit(WB_shutting_down) in wb_shutdown() [1]. 
This seems to be because commit 5318ce7d46866e1d ("bdi: Shutdown writeback on all cgwbs in cgwb_bdi_destroy()") forgot to call wake_up_bit(WB_shutting_down) after clear_bit(WB_shutting_down). Introduce a helper function clear_and_wake_up_bit() and use it, in order to avoid similar errors in future. [1] https://syzkaller.appspot.com/bug?id=b297474817af98d5796bc544e1bb806fc3da0e5e Signed-off-by: Tetsuo Handa Reported-by: syzbot Fixes: 5318ce7d46866e1d ("bdi: Shutdown writeback on all cgwbs in cgwb_bdi_destroy()") Cc: Tejun Heo Reviewed-by: Jan Kara Suggested-by: Linus Torvalds Signed-off-by: Jens Axboe --- include/linux/wait_bit.h | 17 +++++++++++++++++ mm/backing-dev.c | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 9318b2166439..2b0072fa5e92 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -305,4 +305,21 @@ do { \ __ret; \ }) +/** + * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit + * + * @bit: the bit of the word being waited on + * @word: the word being waited on, a kernel virtual address + * + * You can use this helper if bitflags are manipulated atomically rather than + * non-atomically under a lock. + */ +static inline void clear_and_wake_up_bit(int bit, void *word) +{ + clear_bit_unlock(bit, word); + /* See wake_up_bit() for which memory barrier you need to use. */ + smp_mb__after_atomic(); + wake_up_bit(word, bit); +} + #endif /* _LINUX_WAIT_BIT_H */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 023190c69dce..fa5e6d7406d1 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -383,7 +383,7 @@ static void wb_shutdown(struct bdi_writeback *wb) * the barrier provided by test_and_clear_bit() above. 
*/ smp_wmb(); - clear_bit(WB_shutting_down, &wb->state); + clear_and_wake_up_bit(WB_shutting_down, &wb->state); } static void wb_exit(struct bdi_writeback *wb) -- cgit v1.2.3 From 23b8392201e0681b76630c4cea68e1a2e1821ec6 Mon Sep 17 00:00:00 2001 From: Bhadram Varka Date: Wed, 2 May 2018 20:43:58 +0530 Subject: net: phy: broadcom: add support for BCM89610 PHY It adds support for BCM89610 (Single-Port 10/100/1000BASE-T) transceiver which is used in P3310 Tegra186 platform. Signed-off-by: Bhadram Varka Signed-off-by: David S. Miller --- drivers/net/phy/broadcom.c | 10 ++++++++++ include/linux/brcmphy.h | 1 + 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index 3bb6b66dc7bf..f9c25912eb98 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -720,6 +720,15 @@ static struct phy_driver broadcom_drivers[] = { .get_strings = bcm_phy_get_strings, .get_stats = bcm53xx_phy_get_stats, .probe = bcm53xx_phy_probe, +}, { + .phy_id = PHY_ID_BCM89610, + .phy_id_mask = 0xfffffff0, + .name = "Broadcom BCM89610", + .features = PHY_GBIT_FEATURES, + .flags = PHY_HAS_INTERRUPT, + .config_init = bcm54xx_config_init, + .ack_interrupt = bcm_phy_ack_intr, + .config_intr = bcm_phy_config_intr, } }; module_phy_driver(broadcom_drivers); @@ -741,6 +750,7 @@ static struct mdio_device_id __maybe_unused broadcom_tbl[] = { { PHY_ID_BCMAC131, 0xfffffff0 }, { PHY_ID_BCM5241, 0xfffffff0 }, { PHY_ID_BCM5395, 0xfffffff0 }, + { PHY_ID_BCM89610, 0xfffffff0 }, { } }; diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index d3339dd48b1a..b324e01ccf2d 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -25,6 +25,7 @@ #define PHY_ID_BCM54612E 0x03625e60 #define PHY_ID_BCM54616S 0x03625d10 #define PHY_ID_BCM57780 0x03625d90 +#define PHY_ID_BCM89610 0x03625cd0 #define PHY_ID_BCM7250 0xae025280 #define PHY_ID_BCM7260 0xae025190 -- cgit v1.2.3 From 
0010f7052d6cb71c4b120238e28cd3fa413913d1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 4 May 2018 16:57:30 +0200 Subject: libceph: add osd_req_op_extent_osd_data_bvecs() ... and store num_bvecs for client code's convenience. Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton Reviewed-by: "Yan, Zheng" --- drivers/block/rbd.c | 4 +++- include/linux/ceph/osd_client.h | 12 ++++++++++-- net/ceph/osd_client.c | 27 +++++++++++++++++++++++---- 3 files changed, 36 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8e8b04cc569a..33b36fea1d73 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2366,7 +2366,9 @@ static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0, - obj_req->copyup_bvecs, bytes); + obj_req->copyup_bvecs, + obj_req->copyup_bvec_count, + bytes); switch (obj_req->img_request->op_type) { case OBJ_OP_WRITE: diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 528ccc943cee..96bb32285989 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -77,7 +77,10 @@ struct ceph_osd_data { u32 bio_length; }; #endif /* CONFIG_BLOCK */ - struct ceph_bvec_iter bvec_pos; + struct { + struct ceph_bvec_iter bvec_pos; + u32 num_bvecs; + }; }; }; @@ -412,6 +415,10 @@ void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, struct ceph_bio_iter *bio_pos, u32 bio_length); #endif /* CONFIG_BLOCK */ +void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req, + unsigned int which, + struct bio_vec *bvecs, u32 num_bvecs, + u32 bytes); void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, unsigned int which, struct ceph_bvec_iter *bvec_pos); @@ -426,7 +433,8 @@ extern void osd_req_op_cls_request_data_pages(struct ceph_osd_request *, bool 
own_pages); void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req, unsigned int which, - struct bio_vec *bvecs, u32 bytes); + struct bio_vec *bvecs, u32 num_bvecs, + u32 bytes); extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, unsigned int which, struct page **pages, u64 length, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ea2a6c9fb7ce..d2667e5dddc3 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -157,10 +157,12 @@ static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, #endif /* CONFIG_BLOCK */ static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data, - struct ceph_bvec_iter *bvec_pos) + struct ceph_bvec_iter *bvec_pos, + u32 num_bvecs) { osd_data->type = CEPH_OSD_DATA_TYPE_BVECS; osd_data->bvec_pos = *bvec_pos; + osd_data->num_bvecs = num_bvecs; } #define osd_req_op_data(oreq, whch, typ, fld) \ @@ -237,6 +239,22 @@ void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio); #endif /* CONFIG_BLOCK */ +void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req, + unsigned int which, + struct bio_vec *bvecs, u32 num_bvecs, + u32 bytes) +{ + struct ceph_osd_data *osd_data; + struct ceph_bvec_iter it = { + .bvecs = bvecs, + .iter = { .bi_size = bytes }, + }; + + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); + ceph_osd_data_bvecs_init(osd_data, &it, num_bvecs); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvecs); + void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, unsigned int which, struct ceph_bvec_iter *bvec_pos) @@ -244,7 +262,7 @@ void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, struct ceph_osd_data *osd_data; osd_data = osd_req_op_data(osd_req, which, extent, osd_data); - ceph_osd_data_bvecs_init(osd_data, bvec_pos); + ceph_osd_data_bvecs_init(osd_data, bvec_pos, 0); } EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos); @@ 
-287,7 +305,8 @@ EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req, unsigned int which, - struct bio_vec *bvecs, u32 bytes) + struct bio_vec *bvecs, u32 num_bvecs, + u32 bytes) { struct ceph_osd_data *osd_data; struct ceph_bvec_iter it = { @@ -296,7 +315,7 @@ void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req, }; osd_data = osd_req_op_data(osd_req, which, cls, request_data); - ceph_osd_data_bvecs_init(osd_data, &it); + ceph_osd_data_bvecs_init(osd_data, &it, num_bvecs); osd_req->r_ops[which].cls.indata_len += bytes; osd_req->r_ops[which].indata_len += bytes; } -- cgit v1.2.3 From 27ae357fa82be5ab73b2ef8d39dcb8ca2563483a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 11 May 2018 16:02:04 -0700 Subject: mm, oom: fix concurrent munlock and oom reaper unmap, v3 Since exit_mmap() is done without the protection of mm->mmap_sem, it is possible for the oom reaper to concurrently operate on an mm until MMF_OOM_SKIP is set. This allows munlock_vma_pages_all() to concurrently run while the oom reaper is operating on a vma. Since munlock_vma_pages_range() depends on clearing VM_LOCKED from vm_flags before actually doing the munlock to determine if any other vmas are locking the same memory, the check for VM_LOCKED in the oom reaper is racy. This is especially noticeable on architectures such as powerpc where clearing a huge pmd requires serialize_against_pte_lookup(). If the pmd is zapped by the oom reaper during follow_page_mask() after the check for pmd_none() is bypassed, this ends up dereferencing a NULL ptl or a kernel oops. Fix this by manually freeing all possible memory from the mm before doing the munlock and then setting MMF_OOM_SKIP. The oom reaper can not run on the mm anymore so the munlock is safe to do in exit_mmap(). 
It also matches the logic that the oom reaper currently uses for determining when to set MMF_OOM_SKIP itself, so there's no new risk of excessive oom killing. This issue fixes CVE-2018-1000200. Link: http://lkml.kernel.org/r/alpine.DEB.2.21.1804241526320.238665@chino.kir.corp.google.com Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently") Signed-off-by: David Rientjes Suggested-by: Tetsuo Handa Acked-by: Michal Hocko Cc: Andrea Arcangeli Cc: [4.14+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 2 ++ mm/mmap.c | 44 +++++++++++++++++------------ mm/oom_kill.c | 81 ++++++++++++++++++++++++++++------------------------- 3 files changed, 71 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 5bad038ac012..6adac113e96d 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -95,6 +95,8 @@ static inline int check_stable_address_space(struct mm_struct *mm) return 0; } +void __oom_reap_task_mm(struct mm_struct *mm); + extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); diff --git a/mm/mmap.c b/mm/mmap.c index 9d5968d1e8e3..d6836566e4e5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3024,6 +3024,32 @@ void exit_mmap(struct mm_struct *mm) /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); + if (unlikely(mm_is_oom_victim(mm))) { + /* + * Manually reap the mm to free as much memory as possible. + * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard + * this mm from further consideration. Taking mm->mmap_sem for + * write after setting MMF_OOM_SKIP will guarantee that the oom + * reaper will not run on this mm again after mmap_sem is + * dropped. 
+ * + * Nothing can be holding mm->mmap_sem here and the above call + * to mmu_notifier_release(mm) ensures mmu notifier callbacks in + * __oom_reap_task_mm() will not block. + * + * This needs to be done before calling munlock_vma_pages_all(), + * which clears VM_LOCKED, otherwise the oom reaper cannot + * reliably test it. + */ + mutex_lock(&oom_lock); + __oom_reap_task_mm(mm); + mutex_unlock(&oom_lock); + + set_bit(MMF_OOM_SKIP, &mm->flags); + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } + if (mm->locked_vm) { vma = mm->mmap; while (vma) { @@ -3045,24 +3071,6 @@ void exit_mmap(struct mm_struct *mm) /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); - - if (unlikely(mm_is_oom_victim(mm))) { - /* - * Wait for oom_reap_task() to stop working on this - * mm. Because MMF_OOM_SKIP is already set before - * calling down_read(), oom_reap_task() will not run - * on this "mm" post up_write(). - * - * mm_is_oom_victim() cannot be set from under us - * either because victim->mm is already set to NULL - * under task_lock before calling mmput and oom_mm is - * set not NULL by the OOM killer only if victim->mm - * is found not NULL while holding the task_lock. 
- */ - set_bit(MMF_OOM_SKIP, &mm->flags); - down_write(&mm->mmap_sem); - up_write(&mm->mmap_sem); - } free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, 0, -1); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ff992fa8760a..8ba6cb88cf58 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -469,7 +469,6 @@ bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) return false; } - #ifdef CONFIG_MMU /* * OOM Reaper kernel thread which tries to reap the memory used by the OOM @@ -480,16 +479,54 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); -static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) +void __oom_reap_task_mm(struct mm_struct *mm) { - struct mmu_gather tlb; struct vm_area_struct *vma; + + /* + * Tell all users of get_user/copy_from_user etc... that the content + * is no longer stable. No barriers really needed because unmapping + * should imply barriers already and the reader would hit a page fault + * if it stumbled over a reaped memory. + */ + set_bit(MMF_UNSTABLE, &mm->flags); + + for (vma = mm->mmap ; vma; vma = vma->vm_next) { + if (!can_madv_dontneed_vma(vma)) + continue; + + /* + * Only anonymous pages have a good chance to be dropped + * without additional steps which we cannot afford as we + * are OOM already. + * + * We do not even care about fs backed pages because all + * which are reclaimable have already been reclaimed and + * we do not want to block exit_mmap by keeping mm ref + * count elevated without a good reason. 
+ */ + if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { + const unsigned long start = vma->vm_start; + const unsigned long end = vma->vm_end; + struct mmu_gather tlb; + + tlb_gather_mmu(&tlb, mm, start, end); + mmu_notifier_invalidate_range_start(mm, start, end); + unmap_page_range(&tlb, vma, start, end, NULL); + mmu_notifier_invalidate_range_end(mm, start, end); + tlb_finish_mmu(&tlb, start, end); + } + } +} + +static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) +{ bool ret = true; /* * We have to make sure to not race with the victim exit path * and cause premature new oom victim selection: - * __oom_reap_task_mm exit_mm + * oom_reap_task_mm exit_mm * mmget_not_zero * mmput * atomic_dec_and_test @@ -534,39 +571,8 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) trace_start_task_reaping(tsk->pid); - /* - * Tell all users of get_user/copy_from_user etc... that the content - * is no longer stable. No barriers really needed because unmapping - * should imply barriers already and the reader would hit a page fault - * if it stumbled over a reaped memory. - */ - set_bit(MMF_UNSTABLE, &mm->flags); - - for (vma = mm->mmap ; vma; vma = vma->vm_next) { - if (!can_madv_dontneed_vma(vma)) - continue; + __oom_reap_task_mm(mm); - /* - * Only anonymous pages have a good chance to be dropped - * without additional steps which we cannot afford as we - * are OOM already. - * - * We do not even care about fs backed pages because all - * which are reclaimable have already been reclaimed and - * we do not want to block exit_mmap by keeping mm ref - * count elevated without a good reason. 
- */ - if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { - const unsigned long start = vma->vm_start; - const unsigned long end = vma->vm_end; - - tlb_gather_mmu(&tlb, mm, start, end); - mmu_notifier_invalidate_range_start(mm, start, end); - unmap_page_range(&tlb, vma, start, end, NULL); - mmu_notifier_invalidate_range_end(mm, start, end); - tlb_finish_mmu(&tlb, start, end); - } - } pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(tsk), tsk->comm, K(get_mm_counter(mm, MM_ANONPAGES)), @@ -587,14 +593,13 @@ static void oom_reap_task(struct task_struct *tsk) struct mm_struct *mm = tsk->signal->oom_mm; /* Retry the down_read_trylock(mmap_sem) a few times */ - while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm)) + while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm)) schedule_timeout_idle(HZ/10); if (attempts <= MAX_OOM_REAP_RETRIES || test_bit(MMF_OOM_SKIP, &mm->flags)) goto done; - pr_info("oom_reaper: unable to reap pid:%d (%s)\n", task_pid_nr(tsk), tsk->comm); debug_show_all_locks(); -- cgit v1.2.3 From 2075b16e32c26e4031b9fd3cbe26c54676a8fcb5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 11 May 2018 16:02:14 -0700 Subject: rbtree: include rcu.h Since commit c1adf20052d8 ("Introduce rb_replace_node_rcu()") rbtree_augmented.h uses RCU related data structures but does not include the header file. It works as long as it gets somehow included before that and fails otherwise. 
Link: http://lkml.kernel.org/r/20180504103159.19938-1-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Andrew Morton Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree_augmented.h | 1 + include/linux/rbtree_latch.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index 6bfd2b581f75..af8a61be2d8d 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -26,6 +26,7 @@ #include <linux/compiler.h> #include <linux/rbtree.h> +#include <linux/rcupdate.h> /* * Please note - only struct rb_augment_callbacks and the prototypes for diff --git a/include/linux/rbtree_latch.h b/include/linux/rbtree_latch.h index ece43e882b56..7d012faa509a 100644 --- a/include/linux/rbtree_latch.h +++ b/include/linux/rbtree_latch.h @@ -35,6 +35,7 @@ #include <linux/rbtree.h> #include <linux/seqlock.h> +#include <linux/rcupdate.h> struct latch_tree_node { struct rb_node node[2]; -- cgit v1.2.3