From e275d2109cdaea8b4554b9eb8a828bdb8f8ba068 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 26 Oct 2020 10:08:47 +0200 Subject: bus: ti-sysc: Fix reset status check for modules with quirks Commit d46f9fbec719 ("bus: ti-sysc: Use optional clocks on for enable and wait for softreset bit") started showing an "OCP softreset timed out" warning on enable if the interconnect target module is not out of reset. This caused the warning to be triggered often for i2c and hdq while the devices are working properly. Turns out that some interconnect target modules seem to have unusable reset status bits unless the module-specific reset quirks are activated. Let's just skip the reset status check for those modules as we only want to activate the reset quirks when doing a reset, and not on enable. This way we don't see the bogus "OCP softreset timed out" warnings during boot. Fixes: d46f9fbec719 ("bus: ti-sysc: Use optional clocks on for enable and wait for softreset bit") Signed-off-by: Tony Lindgren --- include/linux/platform_data/ti-sysc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h index c59999ce044e..240dce553a0b 100644 --- a/include/linux/platform_data/ti-sysc.h +++ b/include/linux/platform_data/ti-sysc.h @@ -50,6 +50,7 @@ struct sysc_regbits { s8 emufree_shift; }; +#define SYSC_MODULE_QUIRK_ENA_RESETDONE BIT(25) #define SYSC_MODULE_QUIRK_PRUSS BIT(24) #define SYSC_MODULE_QUIRK_DSS_RESET BIT(23) #define SYSC_MODULE_QUIRK_RTC_UNLOCK BIT(22) -- cgit v1.2.3

From f51778db088b2407ec177f2f4da0f6290602aa3f Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 2 Nov 2020 12:43:27 +1100 Subject: swiotlb: using SIZE_MAX needs limits.h included After merging the drm-misc tree, linux-next build (arm multi_v7_defconfig) failed like this:

In file included from drivers/gpu/drm/nouveau/nouveau_ttm.c:26:
include/linux/swiotlb.h: In function 'swiotlb_max_mapping_size':
include/linux/swiotlb.h:99:9: error: 'SIZE_MAX' undeclared (first use in this function)
   99 |  return SIZE_MAX;
      |         ^~~~~~~~
include/linux/swiotlb.h:7:1: note: 'SIZE_MAX' is defined in header '<stdint.h>'; did you forget to '#include <stdint.h>'?
    6 | #include <linux/init.h>
  +++ |+#include <stdint.h>
    7 | #include <linux/types.h>
include/linux/swiotlb.h:99:9: note: each undeclared identifier is reported only once for each function it appears in
   99 |  return SIZE_MAX;
      |         ^~~~~~~~

Caused by commit abe420bfae52 ("swiotlb: Introduce swiotlb_max_mapping_size()") but only exposed by commit "drm/nouveau: fix swiotlb include". Fix it by including linux/limits.h as appropriate. Fixes: abe420bfae52 ("swiotlb: Introduce swiotlb_max_mapping_size()") Signed-off-by: Stephen Rothwell Link: https://lore.kernel.org/r/20201102124327.2f82b2a7@canb.auug.org.au Signed-off-by: Michael S. Tsirkin --- include/linux/swiotlb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 046bb94bd4d6..fa5122c6711e 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -5,6 +5,7 @@ #include <linux/dma-direction.h> #include <linux/init.h> #include <linux/types.h> +#include <linux/limits.h> struct device; struct page; -- cgit v1.2.3
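The rule the fix enforces is simply that a header using SIZE_MAX must pull in linux/limits.h itself rather than rely on an indirect include. A minimal sketch of the pattern (the function name is illustrative, not from the patch):

    #include <linux/limits.h>   /* SIZE_MAX lives here in kernel code */
    #include <linux/types.h>    /* size_t */

    /* sketch: any helper returning SIZE_MAX needs the include above,
     * mirroring the swiotlb_max_mapping_size() fallback path
     */
    static inline size_t example_max_mapping_size(void)
    {
            return SIZE_MAX;
    }
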
From 93bd813c17763177cf87e96c2313bd4dd747d234 Mon Sep 17 00:00:00 2001 From: Jack Yu Date: Thu, 5 Nov 2020 11:08:04 +0800 Subject: ASoC: rt1015: add delay to fix pop noise from speaker Add delay to fix pop noise from speaker. Signed-off-by: Jack Yu Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20201105030804.31115-1-jack.yu@realtek.com Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/sound/rt1015.txt | 6 ++++++ include/sound/rt1015.h | 15 +++++++++++++++ sound/soc/codecs/rt1015.c | 20 ++++++++++++++++++++ sound/soc/codecs/rt1015.h | 2 ++ 4 files changed, 43 insertions(+) create mode 100644 include/sound/rt1015.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/sound/rt1015.txt b/Documentation/devicetree/bindings/sound/rt1015.txt index fcfd02d8d32f..e498966d436f 100644 --- a/Documentation/devicetree/bindings/sound/rt1015.txt +++ b/Documentation/devicetree/bindings/sound/rt1015.txt @@ -8,10 +8,16 @@ Required properties: - reg : The I2C address of the device. +Optional properties: + +- realtek,power-up-delay-ms + Set a delay time for flush work to be completed, + this value is adjustable depending on platform. Example: rt1015: codec@28 { compatible = "realtek,rt1015"; reg = <0x28>; + realtek,power-up-delay-ms = <50>; }; diff --git a/include/sound/rt1015.h b/include/sound/rt1015.h new file mode 100644 index 000000000000..70a7538d4c89 --- /dev/null +++ b/include/sound/rt1015.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * linux/sound/rt1015.h -- Platform data for RT1015 + * + * Copyright 2020 Realtek Microelectronics + */ + +#ifndef __LINUX_SND_RT1015_H +#define __LINUX_SND_RT1015_H + +struct rt1015_platform_data { + unsigned int power_up_delay_ms; +}; + +#endif diff --git a/sound/soc/codecs/rt1015.c b/sound/soc/codecs/rt1015.c index 25fe2ddedd54..967193518349 100644 --- a/sound/soc/codecs/rt1015.c +++ b/sound/soc/codecs/rt1015.c @@ -27,10 +27,15 @@ #include #include #include +#include <sound/rt1015.h> #include "rl6231.h" #include "rt1015.h" +static const struct rt1015_platform_data i2s_default_platform_data = { + .power_up_delay_ms = 50, +}; + static const struct reg_default rt1015_reg[] = { { 0x0000, 0x0000 }, { 0x0004, 0xa000 }, @@ -650,6 +655,7 @@ static int rt1015_amp_drv_event(struct snd_soc_dapm_widget *w, case SND_SOC_DAPM_POST_PMU: if (rt1015->hw_config == RT1015_HW_28) schedule_delayed_work(&rt1015->flush_work, msecs_to_jiffies(10)); + msleep(rt1015->pdata.power_up_delay_ms); break; default: break; @@ -1067,9 +1073,16 @@ static struct acpi_device_id rt1015_acpi_match[] = { MODULE_DEVICE_TABLE(acpi, rt1015_acpi_match); #endif +static void rt1015_parse_dt(struct rt1015_priv *rt1015, struct device *dev) +{ + device_property_read_u32(dev, "realtek,power-up-delay-ms", + &rt1015->pdata.power_up_delay_ms); +} + static int rt1015_i2c_probe(struct i2c_client *i2c, const struct i2c_device_id *id) { + struct rt1015_platform_data *pdata = dev_get_platdata(&i2c->dev); struct rt1015_priv *rt1015; int ret; unsigned int val; @@ -1081,6 +1094,13 @@ static int rt1015_i2c_probe(struct i2c_client *i2c, i2c_set_clientdata(i2c, rt1015); + rt1015->pdata = i2s_default_platform_data; + + if (pdata) + rt1015->pdata = *pdata; + else + rt1015_parse_dt(rt1015, &i2c->dev); + rt1015->regmap = devm_regmap_init_i2c(i2c, &rt1015_regmap); if (IS_ERR(rt1015->regmap)) { ret = PTR_ERR(rt1015->regmap); diff --git a/sound/soc/codecs/rt1015.h b/sound/soc/codecs/rt1015.h index d3fdd30aca6d..15cadb361ec3 100644 --- a/sound/soc/codecs/rt1015.h +++ b/sound/soc/codecs/rt1015.h @@ -12,6 +12,7 @@ #ifndef __RT1015_H__ #define __RT1015_H__ +#include <sound/rt1015.h> #define RT1015_DEVICE_ID_VAL 0x1011 #define RT1015_DEVICE_ID_VAL2 0x1015 @@ -380,6 +381,7 @@ enum { struct rt1015_priv { struct snd_soc_component
*component; + struct rt1015_platform_data pdata; struct regmap *regmap; int sysclk; int sysclk_src; -- cgit v1.2.3 From 3084db0e0d5076cd48408274ab0911cd3ccdae88 Mon Sep 17 00:00:00 2001 From: Daniel Latypov Date: Mon, 2 Nov 2020 15:23:04 -0800 Subject: kunit: fix display of failed expectations for strings Currently the following expectation KUNIT_EXPECT_STREQ(test, "hi", "bye"); will produce: Expected "hi" == "bye", but "hi" == 1625079497 "bye" == 1625079500 After this patch: Expected "hi" == "bye", but "hi" == hi "bye" == bye KUNIT_INIT_BINARY_STR_ASSERT_STRUCT() was written but just mistakenly not actually used by KUNIT_EXPECT_STREQ() and friends. Signed-off-by: Daniel Latypov Reviewed-by: Brendan Higgins Tested-by: Brendan Higgins Signed-off-by: Shuah Khan --- include/kunit/test.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/kunit/test.h b/include/kunit/test.h index db1b0ae666c4..df60be7e22ca 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -1105,7 +1105,7 @@ do { \ KUNIT_ASSERTION(test, \ strcmp(__left, __right) op 0, \ kunit_binary_str_assert, \ - KUNIT_INIT_BINARY_ASSERT_STRUCT(test, \ + KUNIT_INIT_BINARY_STR_ASSERT_STRUCT(test, \ assert_type, \ #op, \ #left, \ -- cgit v1.2.3 From 5e844cc37a5cbaa460e68f9a989d321d63088a89 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 11 Nov 2020 20:07:10 +0100 Subject: spi: Introduce device-managed SPI controller allocation SPI driver probing currently comprises two steps, whereas removal comprises only one step: spi_alloc_master() spi_register_controller() spi_unregister_controller() That's because spi_unregister_controller() calls device_unregister() instead of device_del(), thereby releasing the reference on the spi_controller which was obtained by spi_alloc_master(). An SPI driver's private data is contained in the same memory allocation as the spi_controller struct. Thus, once spi_unregister_controller() has been called, the private data is inaccessible. But some drivers need to access it after spi_unregister_controller() to perform further teardown steps. Introduce devm_spi_alloc_master() and devm_spi_alloc_slave(), which release a reference on the spi_controller struct only after the driver has unbound, thereby keeping the memory allocation accessible. Change spi_unregister_controller() to not release a reference if the spi_controller was allocated by one of these new devm functions. The present commit is small enough to be backportable to stable. It allows fixing drivers which use the private data in their ->remove() hook after it's been freed. It also allows fixing drivers which neglect to release a reference on the spi_controller in the probe error path. Long-term, most SPI drivers shall be moved over to the devm functions introduced herein. The few that can't shall be changed in a treewide commit to explicitly release the last reference on the controller. That commit shall amend spi_unregister_controller() to no longer release a reference, thereby completing the migration. As a result, the behaviour will be less surprising and more consistent with subsystems such as IIO, which also includes the private data in the allocation of the generic iio_dev struct, but calls device_del() in iio_device_unregister(). 
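To illustrate the pattern the new helpers enable, a hypothetical driver probe could look like this (the foo_* names are invented for the example):

    static int foo_spi_probe(struct platform_device *pdev)
    {
            struct spi_controller *ctlr;
            struct foo_priv *priv;

            /* released only when the driver unbinds, so ->remove()
             * can still reach the private data afterwards
             */
            ctlr = devm_spi_alloc_master(&pdev->dev, sizeof(*priv));
            if (!ctlr)
                    return -ENOMEM;

            priv = spi_controller_get_devdata(ctlr);
            /* ... initialize priv and ctlr (bus_num, transfer_one, ...) ... */

            return devm_spi_register_controller(&pdev->dev, ctlr);
    }
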
Signed-off-by: Lukas Wunner Link: https://lore.kernel.org/r/272bae2ef08abd21388c98e23729886663d19192.1605121038.git.lukas@wunner.de Signed-off-by: Mark Brown --- drivers/spi/spi.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++- include/linux/spi/spi.h | 19 ++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 7566482c052c..05c75f890ace 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2442,6 +2442,49 @@ struct spi_controller *__spi_alloc_controller(struct device *dev, } EXPORT_SYMBOL_GPL(__spi_alloc_controller); +static void devm_spi_release_controller(struct device *dev, void *ctlr) +{ + spi_controller_put(*(struct spi_controller **)ctlr); +} + +/** + * __devm_spi_alloc_controller - resource-managed __spi_alloc_controller() + * @dev: physical device of SPI controller + * @size: how much zeroed driver-private data to allocate + * @slave: whether to allocate an SPI master (false) or SPI slave (true) + * Context: can sleep + * + * Allocate an SPI controller and automatically release a reference on it + * when @dev is unbound from its driver. Drivers are thus relieved from + * having to call spi_controller_put(). + * + * The arguments to this function are identical to __spi_alloc_controller(). + * + * Return: the SPI controller structure on success, else NULL. + */ +struct spi_controller *__devm_spi_alloc_controller(struct device *dev, + unsigned int size, + bool slave) +{ + struct spi_controller **ptr, *ctlr; + + ptr = devres_alloc(devm_spi_release_controller, sizeof(*ptr), + GFP_KERNEL); + if (!ptr) + return NULL; + + ctlr = __spi_alloc_controller(dev, size, slave); + if (ctlr) { + *ptr = ctlr; + devres_add(dev, ptr); + } else { + devres_free(ptr); + } + + return ctlr; +} +EXPORT_SYMBOL_GPL(__devm_spi_alloc_controller); + #ifdef CONFIG_OF static int of_spi_get_gpio_numbers(struct spi_controller *ctlr) { @@ -2778,6 +2821,11 @@ int devm_spi_register_controller(struct device *dev, } EXPORT_SYMBOL_GPL(devm_spi_register_controller); +static int devm_spi_match_controller(struct device *dev, void *res, void *ctlr) +{ + return *(struct spi_controller **)res == ctlr; +} + static int __unregister(struct device *dev, void *null) { spi_unregister_device(to_spi_device(dev)); @@ -2819,7 +2867,15 @@ void spi_unregister_controller(struct spi_controller *ctlr) list_del(&ctlr->list); mutex_unlock(&board_lock); - device_unregister(&ctlr->dev); + device_del(&ctlr->dev); + + /* Release the last reference on the controller if its driver + * has not yet been converted to devm_spi_alloc_master/slave(). 
+ */ + if (!devres_find(ctlr->dev.parent, devm_spi_release_controller, + devm_spi_match_controller, ctlr)) + put_device(&ctlr->dev); + /* free bus id */ mutex_lock(&board_lock); if (found == ctlr) diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 99380c0825db..b390fdac1587 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -734,6 +734,25 @@ static inline struct spi_controller *spi_alloc_slave(struct device *host, return __spi_alloc_controller(host, size, true); } +struct spi_controller *__devm_spi_alloc_controller(struct device *dev, + unsigned int size, + bool slave); + +static inline struct spi_controller *devm_spi_alloc_master(struct device *dev, + unsigned int size) +{ + return __devm_spi_alloc_controller(dev, size, false); +} + +static inline struct spi_controller *devm_spi_alloc_slave(struct device *dev, + unsigned int size) +{ + if (!IS_ENABLED(CONFIG_SPI_SLAVE)) + return NULL; + + return __devm_spi_alloc_controller(dev, size, true); +} + extern int spi_register_controller(struct spi_controller *ctlr); extern int devm_spi_register_controller(struct device *dev, struct spi_controller *ctlr); -- cgit v1.2.3

From c3213d260a23e263ef85ba21ac68c9e7578020b5 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Thu, 12 Nov 2020 15:17:32 -0500 Subject: SUNRPC: Fix oops in the rpc_xdr_buf event class Backchannel rpc tasks don't have task->tk_client set, so it's necessary to check it for NULL before dereferencing. Fixes: c509f15a5801 ("SUNRPC: Split the xdr_buf event class") Signed-off-by: Scott Mayhew Reviewed-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/trace/events/sunrpc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 2477014e3fa6..2a03263b5f9d 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -68,7 +68,8 @@ DECLARE_EVENT_CLASS(rpc_xdr_buf_class, TP_fast_assign( __entry->task_id = task->tk_pid; - __entry->client_id = task->tk_client->cl_clid; + __entry->client_id = task->tk_client ? + task->tk_client->cl_clid : -1; __entry->head_base = xdr->head[0].iov_base; __entry->head_len = xdr->head[0].iov_len; __entry->tail_base = xdr->tail[0].iov_base; -- cgit v1.2.3

From 8cf8821e15cd553339a5b48ee555a0439c2b2742 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Thu, 12 Nov 2020 20:58:15 -0500 Subject: net: Exempt multicast addresses from five-second neighbor lifetime Commit 58956317c8de ("neighbor: Improve garbage collection") guarantees neighbour table entries a five-second lifetime. Processes which make heavy use of multicast can fill the neighbour table with multicast addresses in five seconds. At that point, neighbour entries can't be GC-ed because they aren't five seconds old yet, the kernel log starts to fill up with "neighbor table overflow!" messages, and sends start to fail. This patch allows multicast addresses to be thrown out before they've lived out their five seconds. This makes room for non-multicast addresses and makes messages to all addresses more reliable in these circumstances.
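Condensed from the neigh_forced_gc() hunk in the diff below (types as declared in include/net/neighbour.h; the wrapper name is illustrative), the eviction test effectively becomes:

    /* an entry may now be reclaimed early when it has failed, is
     * multicast, or is past its guaranteed five-second lifetime
     */
    static bool example_can_evict(struct neigh_table *tbl, struct neighbour *n,
                                  unsigned long tref)
    {
            return n->nud_state == NUD_FAILED ||
                   (tbl->is_multicast && tbl->is_multicast(n->primary_key)) ||
                   time_after(tref, n->updated);
    }
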
Fixes: 58956317c8de ("neighbor: Improve garbage collection") Signed-off-by: Jeff Dike Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20201113015815.31397-1-jdike@akamai.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 1 + net/core/neighbour.c | 2 ++ net/ipv4/arp.c | 6 ++++++ net/ipv6/ndisc.c | 7 +++++++ 4 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 81ee17594c32..22ced1381ede 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -204,6 +204,7 @@ struct neigh_table { int (*pconstructor)(struct pneigh_entry *); void (*pdestructor)(struct pneigh_entry *); void (*proxy_redo)(struct sk_buff *skb); + int (*is_multicast)(const void *pkey); bool (*allow_add)(const struct net_device *dev, struct netlink_ext_ack *extack); char *id; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8e39e28b0a8d..9500d28a43b0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -235,6 +235,8 @@ static int neigh_forced_gc(struct neigh_table *tbl) write_lock(&n->lock); if ((n->nud_state == NUD_FAILED) || + (tbl->is_multicast && + tbl->is_multicast(n->primary_key)) || time_after(tref, n->updated)) remove = true; write_unlock(&n->lock); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 687971d83b4e..922dd73e5740 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -125,6 +125,7 @@ static int arp_constructor(struct neighbour *neigh); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); static void parp_redo(struct sk_buff *skb); +static int arp_is_multicast(const void *pkey); static const struct neigh_ops arp_generic_ops = { .family = AF_INET, @@ -156,6 +157,7 @@ struct neigh_table arp_tbl = { .key_eq = arp_key_eq, .constructor = arp_constructor, .proxy_redo = parp_redo, + .is_multicast = arp_is_multicast, .id = "arp_cache", .parms = { .tbl = &arp_tbl, @@ -928,6 +930,10 @@ static void parp_redo(struct sk_buff *skb) arp_process(dev_net(skb->dev), NULL, skb); } +static int arp_is_multicast(const void *pkey) +{ + return ipv4_is_multicast(*((__be32 *)pkey)); +} /* * Receive an arp request from the device layer. 
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 27f29b957ee7..76717478f173 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -81,6 +81,7 @@ static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); static int pndisc_constructor(struct pneigh_entry *n); static void pndisc_destructor(struct pneigh_entry *n); static void pndisc_redo(struct sk_buff *skb); +static int ndisc_is_multicast(const void *pkey); static const struct neigh_ops ndisc_generic_ops = { .family = AF_INET6, @@ -115,6 +116,7 @@ struct neigh_table nd_tbl = { .pconstructor = pndisc_constructor, .pdestructor = pndisc_destructor, .proxy_redo = pndisc_redo, + .is_multicast = ndisc_is_multicast, .allow_add = ndisc_allow_add, .id = "ndisc_cache", .parms = { @@ -1706,6 +1708,11 @@ static void pndisc_redo(struct sk_buff *skb) kfree_skb(skb); } +static int ndisc_is_multicast(const void *pkey) +{ + return ipv6_addr_is_multicast((struct in6_addr *)pkey); +} + static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); -- cgit v1.2.3

From 9c2e14b48119b39446031d29d994044ae958d8fc Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Tue, 10 Nov 2020 16:16:40 -0800 Subject: ip_tunnels: Set tunnel option flag when tunnel metadata is present Currently, we may set the tunnel option flag when the size of metadata is zero. For example, we set TUNNEL_GENEVE_OPT in the receive function whether or not a geneve option is actually present. As this may cause issues for consumers of the tunnel flags, this patch fixes the issue. Related discussion: * https://lore.kernel.org/netdev/1604448694-19351-1-git-send-email-yihung.wei@gmail.com/T/#u Fixes: 256c87c17c53 ("net: check tunnel option type in tunnel flags") Signed-off-by: Yi-Hung Wei Link: https://lore.kernel.org/r/1605053800-74072-1-git-send-email-yihung.wei@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/geneve.c | 3 +-- include/net/ip_tunnels.h | 7 ++++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index d07008a818df..1426bfc009bc 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -224,8 +224,7 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, if (ip_tunnel_collect_metadata() || gs->collect_md) { __be16 flags; - flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT | - (gnvh->oam ? TUNNEL_OAM : 0) | + flags = TUNNEL_KEY | (gnvh->oam ? TUNNEL_OAM : 0) | (gnvh->critical ? TUNNEL_CRIT_OPT : 0); tun_dst = udp_tun_rx_dst(skb, geneve_get_sk_family(gs), flags, diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 02ccd32542d0..61620677b034 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -478,9 +478,11 @@ static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, const void *from, int len, __be16 flags) { - memcpy(ip_tunnel_info_opts(info), from, len); info->options_len = len; - info->key.tun_flags |= flags; + if (len > 0) { + memcpy(ip_tunnel_info_opts(info), from, len); + info->key.tun_flags |= flags; + } } static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) @@ -526,7 +528,6 @@ static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, __be16 flags) { info->options_len = 0; - info->key.tun_flags |= flags; } #endif /* CONFIG_INET */ -- cgit v1.2.3
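The user-visible effect on callers can be sketched as follows (a hypothetical call site; only the len == 0 behaviour changes):

    static void example_rx_metadata(struct ip_tunnel_info *info,
                                    const void *opts, int len)
    {
            ip_tunnel_info_opts_set(info, opts, len, TUNNEL_GENEVE_OPT);
            /* len > 0:  options are copied and TUNNEL_GENEVE_OPT is set
             * len == 0: options_len is 0 and tun_flags is left untouched,
             *           so consumers no longer see an option flag with no data
             */
    }
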
From cef397038167ac15d085914493d6c86385773709 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Nov 2020 17:52:58 +0100 Subject: arch: pgtable: define MAX_POSSIBLE_PHYSMEM_BITS where needed Stefan Agner reported a bug when using zram on 32-bit Arm machines with RAM above the 4GB address boundary:

Unable to handle kernel NULL pointer dereference at virtual address 00000000
pgd = a27bd01c
[00000000] *pgd=236a0003, *pmd=1ffa64003
Internal error: Oops: 207 [#1] SMP ARM
Modules linked in: mdio_bcm_unimac(+) brcmfmac cfg80211 brcmutil raspberrypi_hwmon hci_uart crc32_arm_ce bcm2711_thermal phy_generic genet
CPU: 0 PID: 123 Comm: mkfs.ext4 Not tainted 5.9.6 #1
Hardware name: BCM2711
PC is at zs_map_object+0x94/0x338
LR is at zram_bvec_rw.constprop.0+0x330/0xa64
pc : []    lr : []    psr: 60000013
sp : e376bbe0  ip : 00000000  fp : c1e2921c
r10: 00000002  r9 : c1dda730  r8 : 00000000
r7 : e8ff7a00  r6 : 00000000  r5 : 02f9ffa0  r4 : e3710000
r3 : 000fdffe  r2 : c1e0ce80  r1 : ebf979a0  r0 : 00000000
Flags: nZCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment user
Control: 30c5383d  Table: 235c2a80  DAC: fffffffd
Process mkfs.ext4 (pid: 123, stack limit = 0x495a22e6)
Stack: (0xe376bbe0 to 0xe376c000)

As it turns out, zram needs to know the maximum memory size, which is defined in MAX_PHYSMEM_BITS when CONFIG_SPARSEMEM is set, or in MAX_POSSIBLE_PHYSMEM_BITS on the x86 architecture. The same problem will be hit on all 32-bit architectures that have a physical address space larger than 4GB and happen to not enable sparsemem and include asm/sparsemem.h from asm/pgtable.h. After the initial discussion, I suggested just always defining MAX_POSSIBLE_PHYSMEM_BITS whenever CONFIG_PHYS_ADDR_T_64BIT is set, or provoking a build error otherwise. This addresses all configurations that can currently have this runtime bug, but leaves all other configurations unchanged. I looked up the possible number of bits in source code and datasheets; here is what I found:

- on ARC, CONFIG_ARC_HAS_PAE40 controls whether 32 or 40 bits are used
- on ARM, CONFIG_LPAE enables 40 bit addressing, without it we never support more than 32 bits, even though supersections in theory allow up to 40 bits as well.
- on MIPS, some MIPS32r1 or later chips support 36 bits, and MIPS32r5 XPA supports up to 60 bits in theory, but 40 bits are more than anyone will ever ship
- On PowerPC, there are three different implementations of 36 bit addressing, but 32-bit is used without CONFIG_PTE_64BIT
- On RISC-V, the normal page table format can support 34 bit addressing. There is no highmem support on RISC-V, so anything above 2GB is unused, but it might be useful to eventually support CONFIG_ZRAM for high pages.
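For context, a simplified view of why zsmalloc needs this bound (simplified from the macros in mm/zsmalloc.c; the EX_ names are illustrative, and the real code reserves one extra tag bit):

    /* a zsmalloc handle packs <PFN | object index> into one unsigned
     * long, so the PFN field is sized from MAX_POSSIBLE_PHYSMEM_BITS;
     * an undersized bound corrupts handles for pages above the assumed
     * physical address limit -- exactly the crash quoted above
     */
    #define EX_PFN_BITS        (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
    #define EX_OBJ_INDEX_BITS  (BITS_PER_LONG - EX_PFN_BITS - 1)
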
Fixes: 61989a80fb3a ("staging: zsmalloc: zsmalloc memory allocation library") Fixes: 02390b87a945 ("mm/zsmalloc: Prepare to variable MAX_PHYSMEM_BITS") Acked-by: Thomas Bogendoerfer Reviewed-by: Stefan Agner Tested-by: Stefan Agner Acked-by: Mike Rapoport Link: https://lore.kernel.org/linux-mm/bdfa44bf1c570b05d6c70898e2bbb0acf234ecdf.1604762181.git.stefan@agner.ch/ Signed-off-by: Arnd Bergmann --- arch/arc/include/asm/pgtable.h | 2 ++ arch/arm/include/asm/pgtable-2level.h | 2 ++ arch/arm/include/asm/pgtable-3level.h | 2 ++ arch/mips/include/asm/pgtable-32.h | 3 +++ arch/powerpc/include/asm/book3s/32/pgtable.h | 2 ++ arch/powerpc/include/asm/nohash/32/pgtable.h | 2 ++ arch/riscv/include/asm/pgtable-32.h | 2 ++ include/linux/pgtable.h | 13 +++++++++++++ 8 files changed, 28 insertions(+) (limited to 'include') diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index f1ed17edb085..163641726a2b 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -134,8 +134,10 @@ #ifdef CONFIG_ARC_HAS_PAE40 #define PTE_BITS_NON_RWX_IN_PD1 (0xff00000000 | PAGE_MASK | _PAGE_CACHEABLE) +#define MAX_POSSIBLE_PHYSMEM_BITS 40 #else #define PTE_BITS_NON_RWX_IN_PD1 (PAGE_MASK | _PAGE_CACHEABLE) +#define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif /************************************************************************** diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h index 3502c2f746ca..baf7d0204eb5 100644 --- a/arch/arm/include/asm/pgtable-2level.h +++ b/arch/arm/include/asm/pgtable-2level.h @@ -75,6 +75,8 @@ #define PTE_HWTABLE_OFF (PTE_HWTABLE_PTRS * sizeof(pte_t)) #define PTE_HWTABLE_SIZE (PTRS_PER_PTE * sizeof(u32)) +#define MAX_POSSIBLE_PHYSMEM_BITS 32 + /* * PMD_SHIFT determines the size of the area a second-level page table can map * PGDIR_SHIFT determines what a third-level page table entry can map diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index fbb6693c3352..2b85d175e999 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -25,6 +25,8 @@ #define PTE_HWTABLE_OFF (0) #define PTE_HWTABLE_SIZE (PTRS_PER_PTE * sizeof(u64)) +#define MAX_POSSIBLE_PHYSMEM_BITS 40 + /* * PGDIR_SHIFT determines the size a top-level page table entry can map.
*/ diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h index a950fc1ddb4d..6c0532d7b211 100644 --- a/arch/mips/include/asm/pgtable-32.h +++ b/arch/mips/include/asm/pgtable-32.h @@ -154,6 +154,7 @@ static inline void pmd_clear(pmd_t *pmdp) #if defined(CONFIG_XPA) +#define MAX_POSSIBLE_PHYSMEM_BITS 40 #define pte_pfn(x) (((unsigned long)((x).pte_high >> _PFN_SHIFT)) | (unsigned long)((x).pte_low << _PAGE_PRESENT_SHIFT)) static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) @@ -169,6 +170,7 @@ pfn_pte(unsigned long pfn, pgprot_t prot) #elif defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) +#define MAX_POSSIBLE_PHYSMEM_BITS 36 #define pte_pfn(x) ((unsigned long)((x).pte_high >> 6)) static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) @@ -183,6 +185,7 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) #else +#define MAX_POSSIBLE_PHYSMEM_BITS 32 #ifdef CONFIG_CPU_VR41XX #define pte_pfn(x) ((unsigned long)((x).pte >> (PAGE_SHIFT + 2))) #define pfn_pte(pfn, prot) __pte(((pfn) << (PAGE_SHIFT + 2)) | pgprot_val(prot)) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 36443cda8dcf..1376be95e975 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -36,8 +36,10 @@ static inline bool pte_user(pte_t pte) */ #ifdef CONFIG_PTE_64BIT #define PTE_RPN_MASK (~((1ULL << PTE_RPN_SHIFT) - 1)) +#define MAX_POSSIBLE_PHYSMEM_BITS 36 #else #define PTE_RPN_MASK (~((1UL << PTE_RPN_SHIFT) - 1)) +#define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif /* diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index ee2243ba96cf..96522f7f0618 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -153,8 +153,10 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); */ #if defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT) #define PTE_RPN_MASK (~((1ULL << PTE_RPN_SHIFT) - 1)) +#define MAX_POSSIBLE_PHYSMEM_BITS 36 #else #define PTE_RPN_MASK (~((1UL << PTE_RPN_SHIFT) - 1)) +#define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif /* diff --git a/arch/riscv/include/asm/pgtable-32.h b/arch/riscv/include/asm/pgtable-32.h index b0ab66e5fdb1..5b2e79e5bfa5 100644 --- a/arch/riscv/include/asm/pgtable-32.h +++ b/arch/riscv/include/asm/pgtable-32.h @@ -14,4 +14,6 @@ #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) +#define MAX_POSSIBLE_PHYSMEM_BITS 34 + #endif /* _ASM_RISCV_PGTABLE_32_H */ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 71125a4676c4..e237004d498d 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1427,6 +1427,19 @@ typedef unsigned int pgtbl_mod_mask; #endif /* !__ASSEMBLY__ */ +#if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) +#ifdef CONFIG_PHYS_ADDR_T_64BIT +/* + * ZSMALLOC needs to know the highest PFN on 32-bit architectures + * with physical address space extension, but falls back to + * BITS_PER_LONG otherwise. 
+ */ +#error Missing MAX_POSSIBLE_PHYSMEM_BITS definition +#else +#define MAX_POSSIBLE_PHYSMEM_BITS 32 +#endif +#endif + #ifndef has_transparent_hugepage #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define has_transparent_hugepage() 1 -- cgit v1.2.3

From dd8088d5a8969dc2b42f71d7bc01c25c61a78066 Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Tue, 10 Nov 2020 17:29:32 +0800 Subject: PM: runtime: Add pm_runtime_resume_and_get to deal with usage counter In many cases we need to check the return value of pm_runtime_get_sync(), but it brings trouble to the usage counter handling: many callers forget to decrease the usage counter when it fails, which can result in a reference leak. It has been discussed a lot [0][1]. So we add a function that deals with the usage counter for better coding. [0]https://lkml.org/lkml/2020/6/14/88 [1]https://patchwork.ozlabs.org/project/linux-tegra/list/?series=178139 Signed-off-by: Zhang Qilong Acked-by: Rafael J. Wysocki Signed-off-by: Jakub Kicinski --- include/linux/pm_runtime.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 4b708f4e8eed..b492ae00cc90 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -386,6 +386,27 @@ static inline int pm_runtime_get_sync(struct device *dev) return __pm_runtime_resume(dev, RPM_GET_PUT); } +/** + * pm_runtime_resume_and_get - Bump up usage counter of a device and resume it. + * @dev: Target device. + * + * Resume @dev synchronously and if that is successful, increment its runtime + * PM usage counter. Return 0 if the runtime PM usage counter of @dev has been + * incremented or a negative error code otherwise. + */ +static inline int pm_runtime_resume_and_get(struct device *dev) +{ + int ret; + + ret = __pm_runtime_resume(dev, RPM_GET_PUT); + if (ret < 0) { + pm_runtime_put_noidle(dev); + return ret; + } + + return 0; +} + /** * pm_runtime_put - Drop device usage counter and queue up "idle check" if 0. * @dev: Target device. -- cgit v1.2.3
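A typical caller then looks like this (a hypothetical driver function, shown for illustration):

    static int foo_do_work(struct device *dev)
    {
            int ret;

            /* on failure the usage counter has already been dropped for us */
            ret = pm_runtime_resume_and_get(dev);
            if (ret < 0)
                    return ret;

            /* ... access the now-resumed device ... */

            pm_runtime_put(dev);
            return 0;
    }
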
From 9d9e937b1c8be97b424e3e11938e183fcde905c0 Mon Sep 17 00:00:00 2001 From: Georg Kohmann Date: Wed, 11 Nov 2020 12:50:25 +0100 Subject: ipv6/netfilter: Discard first fragment not including all headers Packets are processed even though the first fragment doesn't include all headers through the upper-layer header. This breaks TAHI IPv6 Core Conformance Test v6LC.1.3.6. Referring to RFC 8200, Section 4.5: "If the first fragment does not include all headers through an Upper-Layer header, then that fragment should be discarded and an ICMP Parameter Problem, Code 3, message should be sent to the source of the fragment, with the Pointer field set to zero." The fragment needs to be validated the same way it is done in commit 2efdaaaf883a ("IPv6: reply ICMP error if the first fragment don't include all headers") for ipv6. Wrap the validation into a common function, ipv6_frag_thdr_truncated(), to check for truncation in the upper-layer header. This validation does not fulfill all aspects of RFC 8200, Section 4.5, but is at the moment sufficient to pass the mentioned TAHI test. In netfilter, utilize the fragment offset returned by find_prev_fhdr() to let ipv6_frag_thdr_truncated() start its traversal from the fragment header. Return 0 to drop the fragment in the netfilter. This is the same behaviour as used on other protocol errors in this function, e.g. when nf_ct_frag6_queue() returns -EPROTO. The fragment will later be picked up by ipv6_frag_rcv() in reassembly.c. ipv6_frag_rcv() will then send an appropriate ICMP Parameter Problem message back to the source. References commit 2efdaaaf883a ("IPv6: reply ICMP error if the first fragment don't include all headers") Signed-off-by: Georg Kohmann Acked-by: Pablo Neira Ayuso Link: https://lore.kernel.org/r/20201111115025.28879-1-geokohma@cisco.com Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 2 ++ net/ipv6/netfilter/nf_conntrack_reasm.c | 9 ++++++ net/ipv6/reassembly.c | 55 +++++++++++++++++++++------------ 3 files changed, 46 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index bd1f396cc9c7..637cc6dd12b7 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1064,6 +1064,8 @@ int ipv6_skip_exthdr(const struct sk_buff *, int start, u8 *nexthdrp, bool ipv6_ext_hdr(u8 nexthdr); +bool ipv6_frag_thdr_truncated(struct sk_buff *skb, int start, u8 *nexthdrp); + enum { IP6_FH_F_FRAG = (1 << 0), IP6_FH_F_AUTH = (1 << 1), diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 054d287eb13d..b9cc0b330dbe 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -440,6 +440,7 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) { u16 savethdr = skb->transport_header; + u8 nexthdr = NEXTHDR_FRAGMENT; int fhoff, nhoff, ret; struct frag_hdr *fhdr; struct frag_queue *fq; @@ -455,6 +456,14 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0) return 0; + /* Discard the first fragment if it does not include all headers + * RFC 8200, Section 4.5 + */ + if (ipv6_frag_thdr_truncated(skb, fhoff, &nexthdr)) { + pr_debug("Drop incomplete fragment\n"); + return 0; + } + if (!pskb_may_pull(skb, fhoff + sizeof(*fhdr))) return -ENOMEM; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index c8cf1bbad74a..e3869bac9c88 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -318,15 +318,43 @@ out_fail: return -1; } +/* Check if the upper layer header is truncated in the first fragment. */ +bool ipv6_frag_thdr_truncated(struct sk_buff *skb, int start, u8 *nexthdrp) +{ + u8 nexthdr = *nexthdrp; + __be16 frag_off; + int offset; + + offset = ipv6_skip_exthdr(skb, start, &nexthdr, &frag_off); + if (offset < 0 || (frag_off & htons(IP6_OFFSET))) + return false; + switch (nexthdr) { + case NEXTHDR_TCP: + offset += sizeof(struct tcphdr); + break; + case NEXTHDR_UDP: + offset += sizeof(struct udphdr); + break; + case NEXTHDR_ICMP: + offset += sizeof(struct icmp6hdr); + break; + default: + offset += 1; + } + if (offset > skb->len) + return true; + return false; +} +EXPORT_SYMBOL(ipv6_frag_thdr_truncated); + static int ipv6_frag_rcv(struct sk_buff *skb) { struct frag_hdr *fhdr; struct frag_queue *fq; const struct ipv6hdr *hdr = ipv6_hdr(skb); struct net *net = dev_net(skb_dst(skb)->dev); - __be16 frag_off; - int iif, offset; u8 nexthdr; + int iif; if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) goto fail_hdr; @@ -362,24 +390,11 @@ static int ipv6_frag_rcv(struct sk_buff *skb) * the source of the fragment, with the Pointer field set to zero.
*/ nexthdr = hdr->nexthdr; - offset = ipv6_skip_exthdr(skb, skb_transport_offset(skb), &nexthdr, &frag_off); - if (offset >= 0) { - /* Check some common protocols' header */ - if (nexthdr == IPPROTO_TCP) - offset += sizeof(struct tcphdr); - else if (nexthdr == IPPROTO_UDP) - offset += sizeof(struct udphdr); - else if (nexthdr == IPPROTO_ICMPV6) - offset += sizeof(struct icmp6hdr); - else - offset += 1; - - if (!(frag_off & htons(IP6_OFFSET)) && offset > skb->len) { - __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), - IPSTATS_MIB_INHDRERRORS); - icmpv6_param_prob(skb, ICMPV6_HDR_INCOMP, 0); - return -1; - } + if (ipv6_frag_thdr_truncated(skb, skb_transport_offset(skb), &nexthdr)) { + __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), + IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_INCOMP, 0); + return -1; } iif = skb->dev ? skb->dev->ifindex : 0; -- cgit v1.2.3 From fe0a8a95e7134d0b44cd407bc0085b9ba8d8fe31 Mon Sep 17 00:00:00 2001 From: Lee Duncan Date: Fri, 6 Nov 2020 11:33:17 -0800 Subject: scsi: libiscsi: Fix NOP race condition iSCSI NOPs are sometimes "lost", mistakenly sent to the user-land iscsid daemon instead of handled in the kernel, as they should be, resulting in a message from the daemon like: iscsid: Got nop in, but kernel supports nop handling. This can occur because of the new forward- and back-locks, and the fact that an iSCSI NOP response can occur before processing of the NOP send is complete. This can result in "conn->ping_task" being NULL in iscsi_nop_out_rsp(), when the pointer is actually in the process of being set. To work around this, we add a new state to the "ping_task" pointer. In addition to NULL (not assigned) and a pointer (assigned), we add the state "being set", which is signaled with an INVALID pointer (using "-1"). Link: https://lore.kernel.org/r/20201106193317.16993-1-leeman.duncan@gmail.com Reviewed-by: Mike Christie Signed-off-by: Lee Duncan Signed-off-by: Martin K. 
Petersen --- drivers/scsi/libiscsi.c | 23 +++++++++++++++-------- include/scsi/libiscsi.h | 3 +++ 2 files changed, 18 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 1e9c3171fa9f..f9314f1393fb 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -533,8 +533,8 @@ static void iscsi_complete_task(struct iscsi_task *task, int state) if (conn->task == task) conn->task = NULL; - if (conn->ping_task == task) - conn->ping_task = NULL; + if (READ_ONCE(conn->ping_task) == task) + WRITE_ONCE(conn->ping_task, NULL); /* release get from queueing */ __iscsi_put_task(task); @@ -738,6 +738,9 @@ __iscsi_conn_send_pdu(struct iscsi_conn *conn, struct iscsi_hdr *hdr, task->conn->session->age); } + if (unlikely(READ_ONCE(conn->ping_task) == INVALID_SCSI_TASK)) + WRITE_ONCE(conn->ping_task, task); + if (!ihost->workq) { if (iscsi_prep_mgmt_task(conn, task)) goto free_task; @@ -941,8 +944,11 @@ static int iscsi_send_nopout(struct iscsi_conn *conn, struct iscsi_nopin *rhdr) struct iscsi_nopout hdr; struct iscsi_task *task; - if (!rhdr && conn->ping_task) - return -EINVAL; + if (!rhdr) { + if (READ_ONCE(conn->ping_task)) + return -EINVAL; + WRITE_ONCE(conn->ping_task, INVALID_SCSI_TASK); + } memset(&hdr, 0, sizeof(struct iscsi_nopout)); hdr.opcode = ISCSI_OP_NOOP_OUT | ISCSI_OP_IMMEDIATE; @@ -957,11 +963,12 @@ static int iscsi_send_nopout(struct iscsi_conn *conn, struct iscsi_nopin *rhdr) task = __iscsi_conn_send_pdu(conn, (struct iscsi_hdr *)&hdr, NULL, 0); if (!task) { + if (!rhdr) + WRITE_ONCE(conn->ping_task, NULL); iscsi_conn_printk(KERN_ERR, conn, "Could not send nopout\n"); return -EIO; } else if (!rhdr) { /* only track our nops */ - conn->ping_task = task; conn->last_ping = jiffies; } @@ -984,7 +991,7 @@ static int iscsi_nop_out_rsp(struct iscsi_task *task, struct iscsi_conn *conn = task->conn; int rc = 0; - if (conn->ping_task != task) { + if (READ_ONCE(conn->ping_task) != task) { /* * If this is not in response to one of our * nops then it must be from userspace. 
@@ -1923,7 +1930,7 @@ static void iscsi_start_tx(struct iscsi_conn *conn) */ static int iscsi_has_ping_timed_out(struct iscsi_conn *conn) { - if (conn->ping_task && + if (READ_ONCE(conn->ping_task) && time_before_eq(conn->last_recv + (conn->recv_timeout * HZ) + (conn->ping_timeout * HZ), jiffies)) return 1; @@ -2058,7 +2065,7 @@ enum blk_eh_timer_return iscsi_eh_cmd_timed_out(struct scsi_cmnd *sc) * Checking the transport already or nop from a cmd timeout still * running */ - if (conn->ping_task) { + if (READ_ONCE(conn->ping_task)) { task->have_checked_conn = true; rc = BLK_EH_RESET_TIMER; goto done; diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h index c25fb86ffae9..b3bbd10eb3f0 100644 --- a/include/scsi/libiscsi.h +++ b/include/scsi/libiscsi.h @@ -132,6 +132,9 @@ struct iscsi_task { void *dd_data; /* driver/transport data */ }; +/* invalid scsi_task pointer */ +#define INVALID_SCSI_TASK (struct iscsi_task *)-1l + static inline int iscsi_task_has_unsol_data(struct iscsi_task *task) { return task->unsol_r2t.data_length > task->unsol_r2t.sent; -- cgit v1.2.3

From f97bb5272d9e95d400d6c8643ebb146b3e3e7842 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Nov 2020 09:08:41 +0100 Subject: sched: Fix data-race in wakeup Mel reported that on some ARM64 platforms loadavg goes bananas and Will tracked it down to the following race:

    CPU0                                    CPU1

    schedule()
      prev->sched_contributes_to_load = X;
      deactivate_task(prev);
                                            try_to_wake_up()
                                              if (p->on_rq &&) // false
                                              if (smp_load_acquire(&p->on_cpu) && // true
                                                  ttwu_queue_wakelist())
                                                    p->sched_remote_wakeup = Y;

    smp_store_release(prev->on_cpu, 0);

where both p->sched_contributes_to_load and p->sched_remote_wakeup are in the same word, and thus the stores X and Y race (and can clobber one another's data). Whereas prior to commit c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") the p->on_cpu handoff serialized access to p->sched_remote_wakeup (just as it still does with p->sched_contributes_to_load) that commit broke that by calling ttwu_queue_wakelist() with p->on_cpu != 0. However, due to

    p->XXX = X                      ttwu()
    schedule()                        if (p->on_rq && ...) // false
      smp_mb__after_spinlock()        if (smp_load_acquire(&p->on_cpu) &&
      deactivate_task()                   ttwu_queue_wakelist())
        p->on_rq = 0;                       p->sched_remote_wakeup = Y;

We can be sure any 'current' store is complete and 'current' is guaranteed asleep. Therefore we can move p->sched_remote_wakeup into the current flags word. Note: while the observed failure was loadavg accounting gone wrong due to ttwu() clobbering p->sched_contributes_to_load, the reverse problem is also possible where schedule() clobbers p->sched_remote_wakeup; this could result in enqueue_entity() wrecking ->vruntime and causing scheduling artifacts.
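The underlying C-level hazard is generic and worth spelling out: adjacent bitfields share a single memory word, so two CPUs updating different bits perform racing read-modify-write cycles on the same location. A minimal illustration (not kernel code; names are made up):

    struct flags {
            unsigned a:1;   /* e.g. sched_contributes_to_load, written by schedule() */
            unsigned b:1;   /* e.g. sched_remote_wakeup, written by ttwu() */
    };

    /* each write compiles to load-word / modify-bit / store-word on the
     * SAME word, so the two stores can clobber each other when they race
     */
    static void cpu0_write(struct flags *f) { f->a = 1; }
    static void cpu1_write(struct flags *f) { f->b = 1; }
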
Fixes: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") Reported-by: Mel Gorman Debugged-by: Will Deacon Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201117083016.GK3121392@hirez.programming.kicks-ass.net --- include/linux/sched.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index d383cf09e78f..0e91b451d2a2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -769,7 +769,6 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; - unsigned sched_remote_wakeup:1; #ifdef CONFIG_PSI unsigned sched_psi_wake_requeue:1; #endif @@ -779,6 +778,21 @@ struct task_struct { /* Unserialized, strictly 'current' */

+	/*
+	 * This field must not be in the scheduler word above due to wakelist
+	 * queueing no longer being serialized by p->on_cpu. However:
+	 *
+	 * p->XXX = X;			ttwu()
+	 * schedule()			  if (p->on_rq && ..) // false
+	 *   smp_mb__after_spinlock();	  if (smp_load_acquire(&p->on_cpu) && //true
+	 *   deactivate_task()		      ttwu_queue_wakelist())
+	 *     p->on_rq = 0;		        p->sched_remote_wakeup = Y;
+	 *
+	 * guarantees all stores of 'current' are visible before
+	 * ->sched_remote_wakeup gets used, so it can be in this word.
+	 */
+	unsigned sched_remote_wakeup:1;
+

/* Bit to tell LSMs we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; -- cgit v1.2.3

From 2279f540ea7d05f22d2f0c4224319330228586bc Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 17 Nov 2020 07:14:32 +0100 Subject: sched/deadline: Fix priority inheritance with multiple scheduling classes Glenn reported that "an application [he developed produces] a BUG in deadline.c when a SCHED_DEADLINE task contends with CFS tasks on nested PTHREAD_PRIO_INHERIT mutexes. I believe the bug is triggered when a CFS task that was boosted by a SCHED_DEADLINE task boosts another CFS task (nested priority inheritance).

------------[ cut here ]------------
kernel BUG at kernel/sched/deadline.c:1462!
invalid opcode: 0000 [#1] PREEMPT SMP
CPU: 12 PID: 19171 Comm: dl_boost_bug Tainted: ...
Hardware name: ...
RIP: 0010:enqueue_task_dl+0x335/0x910
Code: ...
RSP: 0018:ffffc9000c2bbc68 EFLAGS: 00010002
RAX: 0000000000000009 RBX: ffff888c0af94c00 RCX: ffffffff81e12500
RDX: 000000000000002e RSI: ffff888c0af94c00 RDI: ffff888c10b22600
RBP: ffffc9000c2bbd08 R08: 0000000000000009 R09: 0000000000000078
R10: ffffffff81e12440 R11: ffffffff81e1236c R12: ffff888bc8932600
R13: ffff888c0af94eb8 R14: ffff888c10b22600 R15: ffff888bc8932600
FS:  00007fa58ac55700(0000) GS:ffff888c10b00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fa58b523230 CR3: 0000000bf44ab003 CR4: 00000000007606e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
 ? intel_pstate_update_util_hwp+0x13/0x170
 rt_mutex_setprio+0x1cc/0x4b0
 task_blocks_on_rt_mutex+0x225/0x260
 rt_spin_lock_slowlock_locked+0xab/0x2d0
 rt_spin_lock_slowlock+0x50/0x80
 hrtimer_grab_expiry_lock+0x20/0x30
 hrtimer_cancel+0x13/0x30
 do_nanosleep+0xa0/0x150
 hrtimer_nanosleep+0xe1/0x230
 ? __hrtimer_init_sleeper+0x60/0x60
 __x64_sys_nanosleep+0x8d/0xa0
 do_syscall_64+0x4a/0x100
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x7fa58b52330d
...
---[ end trace 0000000000000002 ]--- He also provided a simple reproducer creating the situation below. So the execution order of locking steps is the following (N1 and N2 are non-deadline tasks, D1 is a deadline task, and M1 and M2 are mutexes that are enabled with priority inheritance). Time moves forward as this timeline goes down:

    N1              N2              D1
    |               |               |
    |               |               |
    |               |           Lock(M1)
    |               |               |
    |               |               |
    |           Lock(M2)            |
    |               |               |
    |               |               |
  Lock(M2)          |               |
    |           Lock(M1)            |
    | (!!bug triggered!)            |

Daniel reported a similar situation as well, by just letting ksoftirqd run with DEADLINE (and eventually blocking on a mutex). The problem is that boosted entities (Priority Inheritance) use the static DEADLINE parameters of the top priority waiter. However, there might be cases where the top waiter could be a non-DEADLINE entity that is currently boosted by a DEADLINE entity from a different lock chain (i.e., nested priority chains involving entities of non-DEADLINE classes). In this case, the top waiter's static DEADLINE parameters could be null (initialized to 0 at fork()) and replenish_dl_entity() would hit a BUG(). Fix this by keeping track of the original donor and using its parameters when a task is boosted. Reported-by: Glenn Elliott Reported-by: Daniel Bristot de Oliveira Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Tested-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201117061432.517340-1-juri.lelli@redhat.com --- include/linux/sched.h | 10 ++++- kernel/sched/core.c | 11 +++--- kernel/sched/deadline.c | 97 +++++++++++++++++++++++++++---------------------- 3 files changed, 68 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0e91b451d2a2..095fdec07b38 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -551,7 +551,6 @@ struct sched_dl_entity { * overruns. */ unsigned int dl_throttled : 1; - unsigned int dl_boosted : 1; unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; @@ -570,6 +569,15 @@ struct sched_dl_entity { * time. */ struct hrtimer inactive_timer; + +#ifdef CONFIG_RT_MUTEXES + /* + * Priority Inheritance. When a DEADLINE scheduling entity is boosted + * pi_se points to the donor, otherwise points to the dl_se it belongs + * to (the original one/itself).
+ */ + struct sched_dl_entity *pi_se; +#endif }; #ifdef CONFIG_UCLAMP_TASK diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9f0ebfb0d17b..e7e453492cff 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4912,20 +4912,21 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) if (!dl_prio(p->normal_prio) || (pi_task && dl_prio(pi_task->prio) && dl_entity_preempt(&pi_task->dl, &p->dl))) { - p->dl.dl_boosted = 1; + p->dl.pi_se = pi_task->dl.pi_se; queue_flag |= ENQUEUE_REPLENISH; - } else - p->dl.dl_boosted = 0; + } else { + p->dl.pi_se = &p->dl; + } p->sched_class = &dl_sched_class; } else if (rt_prio(prio)) { if (dl_prio(oldprio)) - p->dl.dl_boosted = 0; + p->dl.pi_se = &p->dl; if (oldprio < prio) queue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) - p->dl.dl_boosted = 0; + p->dl.pi_se = &p->dl; if (rt_prio(oldprio)) p->rt.timeout = 0; p->sched_class = &fair_sched_class; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6d93f4518734..949bc5c083c1 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,6 +43,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +#ifdef CONFIG_RT_MUTEXES +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ + return dl_se->pi_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ + return pi_of(dl_se) != dl_se; +} +#else +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ + return dl_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ + return false; +} +#endif + #ifdef CONFIG_SMP static inline struct dl_bw *dl_bw_of(int i) { @@ -698,7 +720,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - WARN_ON(dl_se->dl_boosted); + WARN_ON(is_dl_boosted(dl_se)); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); /* @@ -736,21 +758,20 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) * could happen are, typically, a entity voluntarily trying to overcome its * runtime, or it just underestimated it during sched_setattr(). */ -static void replenish_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static void replenish_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - BUG_ON(pi_se->dl_runtime <= 0); + BUG_ON(pi_of(dl_se)->dl_runtime <= 0); /* * This could be the case for a !-dl task that is boosted. * Just go with full inherited parameters. */ if (dl_se->dl_deadline == 0) { - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; } if (dl_se->dl_yielded && dl_se->runtime > 0) @@ -763,8 +784,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * arbitrary large. 
*/ while (dl_se->runtime <= 0) { - dl_se->deadline += pi_se->dl_period; - dl_se->runtime += pi_se->dl_runtime; + dl_se->deadline += pi_of(dl_se)->dl_period; + dl_se->runtime += pi_of(dl_se)->dl_runtime; } /* @@ -778,8 +799,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { printk_deferred_once("sched: DL replenish lagged too much\n"); - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; } if (dl_se->dl_yielded) @@ -812,8 +833,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * task with deadline equal to period this is the same of using * dl_period instead of dl_deadline in the equation above. */ -static bool dl_entity_overflow(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se, u64 t) +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t) { u64 left, right; @@ -835,9 +855,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, * of anything below microseconds resolution is actually fiction * (but still we want to give the user that illusion >;). */ - left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); + left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); right = ((dl_se->deadline - t) >> DL_SCALE) * - (pi_se->dl_runtime >> DL_SCALE); + (pi_of(dl_se)->dl_runtime >> DL_SCALE); return dl_time_before(right, left); } @@ -922,24 +942,23 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) * Please refer to the comments update_dl_revised_wakeup() function to find * more about the Revised CBS rule. */ -static void update_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static void update_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); if (dl_time_before(dl_se->deadline, rq_clock(rq)) || - dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + dl_entity_overflow(dl_se, rq_clock(rq))) { if (unlikely(!dl_is_implicit(dl_se) && !dl_time_before(dl_se->deadline, rq_clock(rq)) && - !dl_se->dl_boosted)){ + !is_dl_boosted(dl_se))) { update_dl_revised_wakeup(dl_se, rq); return; } - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; } } @@ -1038,7 +1057,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * The task might have been boosted by someone else and might be in the * boosting/deboosting path, its not throttled. */ - if (dl_se->dl_boosted) + if (is_dl_boosted(dl_se)) goto unlock; /* @@ -1066,7 +1085,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * but do not enqueue -- wait for our wakeup to do that. 
*/ if (!task_on_rq_queued(p)) { - replenish_dl_entity(dl_se, dl_se); + replenish_dl_entity(dl_se); goto unlock; } @@ -1156,7 +1175,7 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) if (dl_time_before(dl_se->deadline, rq_clock(rq)) && dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { - if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p))) return; dl_se->dl_throttled = 1; if (dl_se->runtime > 0) @@ -1287,7 +1306,7 @@ throttle: dl_se->dl_overrun = 1; __dequeue_task_dl(rq, curr, 0); - if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) @@ -1481,8 +1500,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) } static void -enqueue_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se, int flags) +enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) { BUG_ON(on_dl_rq(dl_se)); @@ -1493,9 +1511,9 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, */ if (flags & ENQUEUE_WAKEUP) { task_contending(dl_se, flags); - update_dl_entity(dl_se, pi_se); + update_dl_entity(dl_se); } else if (flags & ENQUEUE_REPLENISH) { - replenish_dl_entity(dl_se, pi_se); + replenish_dl_entity(dl_se); } else if ((flags & ENQUEUE_RESTORE) && dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { @@ -1512,19 +1530,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { - struct task_struct *pi_task = rt_mutex_get_top_task(p); - struct sched_dl_entity *pi_se = &p->dl; - - /* - * Use the scheduling parameters of the top pi-waiter task if: - * - we have a top pi-waiter which is a SCHED_DEADLINE task AND - * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is - * smaller than our deadline OR we are a !SCHED_DEADLINE task getting - * boosted due to a SCHED_DEADLINE pi-waiter). - * Otherwise we keep our runtime and deadline. - */ - if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { - pi_se = &pi_task->dl; + if (is_dl_boosted(&p->dl)) { /* * Because of delays in the detection of the overrun of a * thread's runtime, it might be the case that a thread @@ -1557,7 +1563,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * the throttle. 
*/ p->dl.dl_throttled = 0; - BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); + BUG_ON(!is_dl_boosted(&p->dl) || flags != ENQUEUE_REPLENISH); return; } @@ -1594,7 +1600,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) return; } - enqueue_dl_entity(&p->dl, pi_se, flags); + enqueue_dl_entity(&p->dl, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); @@ -2787,11 +2793,14 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_bw = 0; dl_se->dl_density = 0; - dl_se->dl_boosted = 0; dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; dl_se->dl_overrun = 0; + +#ifdef CONFIG_RT_MUTEXES + dl_se->pi_se = dl_se; +#endif } bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) -- cgit v1.2.3 From 138559b9f99d3b6b1d5e75c78facc067a23871c6 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 15 Nov 2020 15:14:48 +0200 Subject: net/tls: Fix wrong record sn in async mode of device resync In async_resync mode, we log the TCP seq of records until the async request is completed. Later, in case one of the logged seqs matches the resync request, we return it, together with its record serial number. Before this fix, we mistakenly returned the serial number of the current record instead. Fixes: ed9b7646b06a ("net/tls: Add asynchronous resync") Signed-off-by: Tariq Toukan Reviewed-by: Boris Pismenny Link: https://lore.kernel.org/r/20201115131448.2702-1-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/tls.h | 16 +++++++++++++++- net/tls/tls_device.c | 37 +++++++++++++++++++++++++++---------- 2 files changed, 42 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index baf1e99d8193..cf1473099453 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -300,7 +300,8 @@ enum tls_offload_sync_type { #define TLS_DEVICE_RESYNC_ASYNC_LOGMAX 13 struct tls_offload_resync_async { atomic64_t req; - u32 loglen; + u16 loglen; + u16 rcd_delta; u32 log[TLS_DEVICE_RESYNC_ASYNC_LOGMAX]; }; @@ -471,6 +472,18 @@ static inline bool tls_bigint_increment(unsigned char *seq, int len) return (i == -1); } +static inline void tls_bigint_subtract(unsigned char *seq, int n) +{ + u64 rcd_sn; + __be64 *p; + + BUILD_BUG_ON(TLS_MAX_REC_SEQ_SIZE != 8); + + p = (__be64 *)seq; + rcd_sn = be64_to_cpu(*p); + *p = cpu_to_be64(rcd_sn - n); +} + static inline struct tls_context *tls_get_ctx(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -639,6 +652,7 @@ tls_offload_rx_resync_async_request_start(struct sock *sk, __be32 seq, u16 len) atomic64_set(&rx_ctx->resync_async->req, ((u64)ntohl(seq) << 32) | ((u64)len << 16) | RESYNC_REQ | RESYNC_REQ_ASYNC); rx_ctx->resync_async->loglen = 0; + rx_ctx->resync_async->rcd_delta = 0; } static inline void diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index cec86229a6a0..54d3e161d198 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -694,36 +694,51 @@ static void tls_device_resync_rx(struct tls_context *tls_ctx, static bool tls_device_rx_resync_async(struct tls_offload_resync_async *resync_async, - s64 resync_req, u32 *seq) + s64 resync_req, u32 *seq, u16 *rcd_delta) { u32 is_async = resync_req & RESYNC_REQ_ASYNC; u32 req_seq = resync_req >> 32; u32 req_end = req_seq + ((resync_req >> 16) & 0xffff); + u16 i; + + *rcd_delta = 0; if (is_async) { + /* shouldn't get to wraparound: + * too long in async stage, something bad happened + */ + if 
(WARN_ON_ONCE(resync_async->rcd_delta == USHRT_MAX)) + return false; + /* asynchronous stage: log all headers seq such that * req_seq <= seq <= end_seq, and wait for real resync request */ - if (between(*seq, req_seq, req_end) && + if (before(*seq, req_seq)) + return false; + if (!after(*seq, req_end) && resync_async->loglen < TLS_DEVICE_RESYNC_ASYNC_LOGMAX) resync_async->log[resync_async->loglen++] = *seq; + resync_async->rcd_delta++; + return false; } /* synchronous stage: check against the logged entries and * proceed to check the next entries if no match was found */ - while (resync_async->loglen) { - if (req_seq == resync_async->log[resync_async->loglen - 1] && - atomic64_try_cmpxchg(&resync_async->req, - &resync_req, 0)) { - resync_async->loglen = 0; + for (i = 0; i < resync_async->loglen; i++) + if (req_seq == resync_async->log[i] && + atomic64_try_cmpxchg(&resync_async->req, &resync_req, 0)) { + *rcd_delta = resync_async->rcd_delta - i; *seq = req_seq; + resync_async->loglen = 0; + resync_async->rcd_delta = 0; return true; } - resync_async->loglen--; - } + + resync_async->loglen = 0; + resync_async->rcd_delta = 0; if (req_seq == *seq && atomic64_try_cmpxchg(&resync_async->req, @@ -741,6 +756,7 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) u32 sock_data, is_req_pending; struct tls_prot_info *prot; s64 resync_req; + u16 rcd_delta; u32 req_seq; if (tls_ctx->rx_conf != TLS_HW) @@ -786,8 +802,9 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) return; if (!tls_device_rx_resync_async(rx_ctx->resync_async, - resync_req, &seq)) + resync_req, &seq, &rcd_delta)) return; + tls_bigint_subtract(rcd_sn, rcd_delta); break; } -- cgit v1.2.3 From 4d213e76a359e540ca786ee937da7f35faa8e5f8 Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Tue, 10 Nov 2020 15:19:08 +0800 Subject: iommu/vt-d: Avoid panic if iommu init fails in tboot system The "intel_iommu=off" command line is used to disable the IOMMU, but the IOMMU is force enabled in a tboot system for security reasons. However, for better performance on high speed network devices, a new option "intel_iommu=tboot_noforce" is introduced to disable the force on. By default the kernel should panic if IOMMU init fails in tboot for security reasons, but that is unnecessary if we use "intel_iommu=tboot_noforce,off". Fix the code setting force_on and move intel_iommu_tboot_noforce from the tboot code to the Intel IOMMU code.
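For illustration, the policy the fix implements can be read as a single expression (the helper names are the real ones from iommu.c; the snippet is only a sketch of the hunk below, not new code):

	/* tboot may force the IOMMU on only when the user did not opt out */
	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
		   platform_optin_force_iommu();

With "intel_iommu=tboot_noforce,off" the first term is false, so force_on stays clear and a failed IOMMU init no longer panics the machine.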
Fixes: 7304e8f28bb2 ("iommu/vt-d: Correctly disable Intel IOMMU force on") Signed-off-by: Zhenzhong Duan Tested-by: Lukasz Hawrylko Acked-by: Lu Baolu Link: https://lore.kernel.org/r/20201110071908.3133-1-zhenzhong.duan@gmail.com Signed-off-by: Will Deacon --- arch/x86/kernel/tboot.c | 3 --- drivers/iommu/intel/iommu.c | 5 +++-- include/linux/intel-iommu.h | 1 - 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 992fb1415c0f..420be871d9d4 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -514,9 +514,6 @@ int tboot_force_iommu(void) if (!tboot_enabled()) return 0; - if (intel_iommu_tboot_noforce) - return 1; - if (no_iommu || swiotlb || dmar_disabled) pr_warn("Forcing Intel-IOMMU to enabled\n"); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 1b1ca63e6bbe..4d9b298002f0 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -179,7 +179,7 @@ static int rwbf_quirk; * (used when kernel is launched w/ TXT) */ static int force_on = 0; -int intel_iommu_tboot_noforce; +static int intel_iommu_tboot_noforce; static int no_platform_optin; #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) @@ -4885,7 +4885,8 @@ int __init intel_iommu_init(void) * Intel IOMMU is required for a TXT/tboot launch or platform * opt in, so enforce that. */ - force_on = tboot_force_iommu() || platform_optin_force_iommu(); + force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) || + platform_optin_force_iommu(); if (iommu_init_mempool()) { if (force_on) diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index fbf5b3e7707e..d956987ed032 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -798,7 +798,6 @@ extern int iommu_calculate_agaw(struct intel_iommu *iommu); extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu); extern int dmar_disabled; extern int intel_iommu_enabled; -extern int intel_iommu_tboot_noforce; extern int intel_iommu_gfx_mapped; #else static inline int iommu_calculate_agaw(struct intel_iommu *iommu) -- cgit v1.2.3 From 2d8f6481c17db9fa5238b277cdbc392084060b09 Mon Sep 17 00:00:00 2001 From: Georg Kohmann Date: Thu, 19 Nov 2020 10:58:33 +0100 Subject: ipv6: Remove dependency of ipv6_frag_thdr_truncated on ipv6 module IPV6=m NF_DEFRAG_IPV6=y ld: net/ipv6/netfilter/nf_conntrack_reasm.o: in function `nf_ct_frag6_gather': net/ipv6/netfilter/nf_conntrack_reasm.c:462: undefined reference to `ipv6_frag_thdr_truncated' Netfilter depends on the ipv6 symbol ipv6_frag_thdr_truncated, and this dependency forces IPV6=y. Remove this dependency by moving ipv6_frag_thdr_truncated out of ipv6.
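The shape of the fix, reduced to a toy header (the file and function names here are illustrative, not the kernel's):

	/* example_frag.h: usable from both ipv6 and netfilter, even with IPV6=m */
	#ifndef _EXAMPLE_FRAG_H
	#define _EXAMPLE_FRAG_H

	#include <linux/types.h>

	/* A static inline is compiled into every file that includes the
	 * header, so no exported symbol (and therefore no module
	 * dependency) is created.
	 */
	static inline bool example_thdr_truncated(int offset, int pkt_len)
	{
		return offset > pkt_len;
	}

	#endif

Because each caller gets its own copy, moving ipv6_frag_thdr_truncated into ipv6_frag.h as a static inline (renamed ipv6frag_thdr_truncated below) drops the EXPORT_SYMBOL and with it the IPV6=y requirement.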
This is the same solution as was used for a similar issue; see commit 70b095c843266 ("ipv6: remove dependency of nf_defrag_ipv6 on ipv6 module") Fixes: 9d9e937b1c8b ("ipv6/netfilter: Discard first fragment not including all headers") Reported-by: Randy Dunlap Reported-by: kernel test robot Signed-off-by: Georg Kohmann Acked-by: Pablo Neira Ayuso Acked-by: Randy Dunlap # build-tested Link: https://lore.kernel.org/r/20201119095833.8409-1-geokohma@cisco.com Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 2 -- include/net/ipv6_frag.h | 30 ++++++++++++++++++++++++++++++ net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +- net/ipv6/reassembly.c | 31 +------------------------------ 4 files changed, 32 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 637cc6dd12b7..bd1f396cc9c7 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1064,8 +1064,6 @@ int ipv6_skip_exthdr(const struct sk_buff *, int start, u8 *nexthdrp, bool ipv6_ext_hdr(u8 nexthdr); -bool ipv6_frag_thdr_truncated(struct sk_buff *skb, int start, u8 *nexthdrp); - enum { IP6_FH_F_FRAG = (1 << 0), IP6_FH_F_AUTH = (1 << 1), diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h index a21e8b1381a1..851029ecff13 100644 --- a/include/net/ipv6_frag.h +++ b/include/net/ipv6_frag.h @@ -108,5 +108,35 @@ out_rcu_unlock: rcu_read_unlock(); inet_frag_put(&fq->q); } + +/* Check if the upper layer header is truncated in the first fragment. */ +static inline bool +ipv6frag_thdr_truncated(struct sk_buff *skb, int start, u8 *nexthdrp) +{ + u8 nexthdr = *nexthdrp; + __be16 frag_off; + int offset; + + offset = ipv6_skip_exthdr(skb, start, &nexthdr, &frag_off); + if (offset < 0 || (frag_off & htons(IP6_OFFSET))) + return false; + switch (nexthdr) { + case NEXTHDR_TCP: + offset += sizeof(struct tcphdr); + break; + case NEXTHDR_UDP: + offset += sizeof(struct udphdr); + break; + case NEXTHDR_ICMP: + offset += sizeof(struct icmp6hdr); + break; + default: + offset += 1; + } + if (offset > skb->len) + return true; + return false; +} + #endif #endif diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index b9cc0b330dbe..c129ad334eb3 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -459,7 +459,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) /* Discard the first fragment if it does not include all headers * RFC 8200, Section 4.5 */ - if (ipv6_frag_thdr_truncated(skb, fhoff, &nexthdr)) { + if (ipv6frag_thdr_truncated(skb, fhoff, &nexthdr)) { pr_debug("Drop incomplete fragment\n"); return 0; } diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index e3869bac9c88..47a0dc46cbdb 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -318,35 +318,6 @@ out_fail: return -1; } -/* Check if the upper layer header is truncated in the first fragment.
*/ -bool ipv6_frag_thdr_truncated(struct sk_buff *skb, int start, u8 *nexthdrp) -{ - u8 nexthdr = *nexthdrp; - __be16 frag_off; - int offset; - - offset = ipv6_skip_exthdr(skb, start, &nexthdr, &frag_off); - if (offset < 0 || (frag_off & htons(IP6_OFFSET))) - return false; - switch (nexthdr) { - case NEXTHDR_TCP: - offset += sizeof(struct tcphdr); - break; - case NEXTHDR_UDP: - offset += sizeof(struct udphdr); - break; - case NEXTHDR_ICMP: - offset += sizeof(struct icmp6hdr); - break; - default: - offset += 1; - } - if (offset > skb->len) - return true; - return false; -} -EXPORT_SYMBOL(ipv6_frag_thdr_truncated); - static int ipv6_frag_rcv(struct sk_buff *skb) { struct frag_hdr *fhdr; @@ -390,7 +361,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) * the source of the fragment, with the Pointer field set to zero. */ nexthdr = hdr->nexthdr; - if (ipv6_frag_thdr_truncated(skb, skb_transport_offset(skb), &nexthdr)) { + if (ipv6frag_thdr_truncated(skb, skb_transport_offset(skb), &nexthdr)) { __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_INCOMP, 0); -- cgit v1.2.3 From 2bf31d94423c8ae3ff58e38a115b177df6940399 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 16 Nov 2020 11:18:08 +0100 Subject: jbd2: fix kernel-doc markups Kernel-doc markup should use this format: identifier - description They should not have any type before that, as otherwise the parser won't do the right thing. Also, some identifiers have different names between their prototypes and the kernel-doc markup. Reviewed-by: Jan Kara Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/72f5c6628f5f278d67625f60893ffbc2ca28d46e.1605521731.git.mchehab+huawei@kernel.org Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 34 ++++++++++++++++++---------------- fs/jbd2/transaction.c | 31 ++++++++++++++++--------------- include/linux/jbd2.h | 2 +- 3 files changed, 35 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0c3d5e3b24b2..188f79d76988 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -566,12 +566,14 @@ static int __jbd2_journal_force_commit(journal_t *journal) } /** - * Force and wait upon a commit if the calling process is not within - * transaction. This is used for forcing out undo-protected data which contains - * bitmaps, when the fs is running out of space. + * jbd2_journal_force_commit_nested - Force and wait upon a commit if the + * calling process is not within transaction. * * @journal: journal to force * Returns true if progress was made. + * + * This is used for forcing out undo-protected data which contains + * bitmaps, when the fs is running out of space. */ int jbd2_journal_force_commit_nested(journal_t *journal) { @@ -582,7 +584,7 @@ int jbd2_journal_force_commit_nested(journal_t *journal) } /** - * int journal_force_commit() - force any uncommitted transactions + * jbd2_journal_force_commit() - force any uncommitted transactions * @journal: journal to force * * Caller want unconditional commit. We can only force the running transaction @@ -1881,7 +1883,7 @@ static int load_superblock(journal_t *journal) /** - * int jbd2_journal_load() - Read journal from disk. + * jbd2_journal_load() - Read journal from disk. * @journal: Journal to act on. * * Given a journal_t structure which tells us which disk blocks contain @@ -1951,7 +1953,7 @@ recovery_error: } /** - * void jbd2_journal_destroy() - Release a journal_t structure. 
+ * jbd2_journal_destroy() - Release a journal_t structure. * @journal: Journal to act on. * * Release a journal_t structure once it is no longer in use by the @@ -2028,7 +2030,7 @@ int jbd2_journal_destroy(journal_t *journal) /** - *int jbd2_journal_check_used_features() - Check if features specified are used. + * jbd2_journal_check_used_features() - Check if features specified are used. * @journal: Journal to check. * @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount @@ -2063,7 +2065,7 @@ int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat, } /** - * int jbd2_journal_check_available_features() - Check feature set in journalling layer + * jbd2_journal_check_available_features() - Check feature set in journalling layer * @journal: Journal to check. * @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount @@ -2126,7 +2128,7 @@ jbd2_journal_initialize_fast_commit(journal_t *journal) } /** - * int jbd2_journal_set_features() - Mark a given journal feature in the superblock + * jbd2_journal_set_features() - Mark a given journal feature in the superblock * @journal: Journal to act on. * @compat: bitmask of compatible features * @ro: bitmask of features that force read-only mount @@ -2217,7 +2219,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat, } /* - * jbd2_journal_clear_features () - Clear a given journal feature in the + * jbd2_journal_clear_features() - Clear a given journal feature in the * superblock * @journal: Journal to act on. * @compat: bitmask of compatible features @@ -2246,7 +2248,7 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, EXPORT_SYMBOL(jbd2_journal_clear_features); /** - * int jbd2_journal_flush () - Flush journal + * jbd2_journal_flush() - Flush journal * @journal: Journal to act on. * * Flush all data for a given journal to disk and empty the journal. @@ -2321,7 +2323,7 @@ out: } /** - * int jbd2_journal_wipe() - Wipe journal contents + * jbd2_journal_wipe() - Wipe journal contents * @journal: Journal to act on. * @write: flag (see below) * @@ -2362,7 +2364,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) } /** - * void jbd2_journal_abort () - Shutdown the journal immediately. + * jbd2_journal_abort () - Shutdown the journal immediately. * @journal: the journal to shutdown. * @errno: an error number to record in the journal indicating * the reason for the shutdown. @@ -2453,7 +2455,7 @@ void jbd2_journal_abort(journal_t *journal, int errno) } /** - * int jbd2_journal_errno () - returns the journal's error state. + * jbd2_journal_errno() - returns the journal's error state. * @journal: journal to examine. * * This is the errno number set with jbd2_journal_abort(), the last @@ -2477,7 +2479,7 @@ int jbd2_journal_errno(journal_t *journal) } /** - * int jbd2_journal_clear_err () - clears the journal's error state + * jbd2_journal_clear_err() - clears the journal's error state * @journal: journal to act on. * * An error must be cleared or acked to take a FS out of readonly @@ -2497,7 +2499,7 @@ int jbd2_journal_clear_err(journal_t *journal) } /** - * void jbd2_journal_ack_err() - Ack journal err. + * jbd2_journal_ack_err() - Ack journal err. * @journal: journal to act on. 
* * An error must be cleared or acked to take a FS out of readonly diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d54f04674e8e..9396666b7314 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -519,7 +519,7 @@ EXPORT_SYMBOL(jbd2__journal_start); /** - * handle_t *jbd2_journal_start() - Obtain a new handle. + * jbd2_journal_start() - Obtain a new handle. * @journal: Journal to start transaction on. * @nblocks: number of block buffer we might modify * @@ -566,7 +566,7 @@ void jbd2_journal_free_reserved(handle_t *handle) EXPORT_SYMBOL(jbd2_journal_free_reserved); /** - * int jbd2_journal_start_reserved() - start reserved handle + * jbd2_journal_start_reserved() - start reserved handle * @handle: handle to start * @type: for handle statistics * @line_no: for handle statistics @@ -620,7 +620,7 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type, EXPORT_SYMBOL(jbd2_journal_start_reserved); /** - * int jbd2_journal_extend() - extend buffer credits. + * jbd2_journal_extend() - extend buffer credits. * @handle: handle to 'extend' * @nblocks: nr blocks to try to extend by. * @revoke_records: number of revoke records to try to extend by. @@ -745,7 +745,7 @@ static void stop_this_handle(handle_t *handle) } /** - * int jbd2_journal_restart() - restart a handle . + * jbd2__journal_restart() - restart a handle . * @handle: handle to restart * @nblocks: nr credits requested * @revoke_records: number of revoke record credits requested @@ -815,7 +815,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks) EXPORT_SYMBOL(jbd2_journal_restart); /** - * void jbd2_journal_lock_updates () - establish a transaction barrier. + * jbd2_journal_lock_updates () - establish a transaction barrier. * @journal: Journal to establish a barrier on. * * This locks out any further updates from being started, and blocks @@ -874,7 +874,7 @@ void jbd2_journal_lock_updates(journal_t *journal) } /** - * void jbd2_journal_unlock_updates (journal_t* journal) - release barrier + * jbd2_journal_unlock_updates () - release barrier * @journal: Journal to release the barrier on. * * Release a transaction barrier obtained with jbd2_journal_lock_updates(). @@ -1182,7 +1182,8 @@ out: } /** - * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. + * jbd2_journal_get_write_access() - notify intent to modify a buffer + * for metadata (not data) update. * @handle: transaction to add buffer modifications to * @bh: bh to be used for metadata writes * @@ -1226,7 +1227,7 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) * unlocked buffer beforehand. */ /** - * int jbd2_journal_get_create_access () - notify intent to use newly created bh + * jbd2_journal_get_create_access () - notify intent to use newly created bh * @handle: transaction to new buffer to * @bh: new buffer. * @@ -1306,7 +1307,7 @@ out: } /** - * int jbd2_journal_get_undo_access() - Notify intent to modify metadata with + * jbd2_journal_get_undo_access() - Notify intent to modify metadata with * non-rewindable consequences * @handle: transaction * @bh: buffer to undo @@ -1383,7 +1384,7 @@ out: } /** - * void jbd2_journal_set_triggers() - Add triggers for commit writeout + * jbd2_journal_set_triggers() - Add triggers for commit writeout * @bh: buffer to trigger on * @type: struct jbd2_buffer_trigger_type containing the trigger(s). 
* @@ -1425,7 +1426,7 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh, } /** - * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata + * jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. * @bh: buffer to mark * @@ -1593,7 +1594,7 @@ out: } /** - * void jbd2_journal_forget() - bforget() for potentially-journaled buffers. + * jbd2_journal_forget() - bforget() for potentially-journaled buffers. * @handle: transaction handle * @bh: bh to 'forget' * @@ -1762,7 +1763,7 @@ drop: } /** - * int jbd2_journal_stop() - complete a transaction + * jbd2_journal_stop() - complete a transaction * @handle: transaction to complete. * * All done for a particular handle. @@ -2080,7 +2081,7 @@ out: } /** - * int jbd2_journal_try_to_free_buffers() - try to free page buffers. + * jbd2_journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation * @page: to try and free * @@ -2411,7 +2412,7 @@ zap_buffer_unlocked: } /** - * void jbd2_journal_invalidatepage() + * jbd2_journal_invalidatepage() * @journal: journal to use for flush... * @page: page to flush * @offset: start of the range to invalidate diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 1c49fd62ff2e..578ff196b3ce 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -401,7 +401,7 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) #define JI_WAIT_DATA (1 << __JI_WAIT_DATA) /** - * struct jbd_inode - The jbd_inode type is the structure linking inodes in + * struct jbd2_inode - The jbd_inode type is the structure linking inodes in * ordered mode present in a transaction so that we can sync them during commit. */ struct jbd2_inode { -- cgit v1.2.3 From b9ad3e9f5a7a760ab068e33e1f18d240ba32ce92 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Fri, 20 Nov 2020 14:28:27 +0000 Subject: bonding: wait for sysfs kobject destruction before freeing struct slave syzkaller found that with CONFIG_DEBUG_KOBJECT_RELEASE=y, releasing a struct slave device could result in the following splat: kobject: 'bonding_slave' (00000000cecdd4fe): kobject_release, parent 0000000074ceb2b2 (delayed 1000) bond0 (unregistering): (slave bond_slave_1): Releasing backup interface ------------[ cut here ]------------ ODEBUG: free active (active state 0) object type: timer_list hint: workqueue_select_cpu_near kernel/workqueue.c:1549 [inline] ODEBUG: free active (active state 0) object type: timer_list hint: delayed_work_timer_fn+0x0/0x98 kernel/workqueue.c:1600 WARNING: CPU: 1 PID: 842 at lib/debugobjects.c:485 debug_print_object+0x180/0x240 lib/debugobjects.c:485 Kernel panic - not syncing: panic_on_warn set ... 
CPU: 1 PID: 842 Comm: kworker/u4:4 Tainted: G S 5.9.0-rc8+ #96 Hardware name: linux,dummy-virt (DT) Workqueue: netns cleanup_net Call trace: dump_backtrace+0x0/0x4d8 include/linux/bitmap.h:239 show_stack+0x34/0x48 arch/arm64/kernel/traps.c:142 __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x174/0x1f8 lib/dump_stack.c:118 panic+0x360/0x7a0 kernel/panic.c:231 __warn+0x244/0x2ec kernel/panic.c:600 report_bug+0x240/0x398 lib/bug.c:198 bug_handler+0x50/0xc0 arch/arm64/kernel/traps.c:974 call_break_hook+0x160/0x1d8 arch/arm64/kernel/debug-monitors.c:322 brk_handler+0x30/0xc0 arch/arm64/kernel/debug-monitors.c:329 do_debug_exception+0x184/0x340 arch/arm64/mm/fault.c:864 el1_dbg+0x48/0xb0 arch/arm64/kernel/entry-common.c:65 el1_sync_handler+0x170/0x1c8 arch/arm64/kernel/entry-common.c:93 el1_sync+0x80/0x100 arch/arm64/kernel/entry.S:594 debug_print_object+0x180/0x240 lib/debugobjects.c:485 __debug_check_no_obj_freed lib/debugobjects.c:967 [inline] debug_check_no_obj_freed+0x200/0x430 lib/debugobjects.c:998 slab_free_hook mm/slub.c:1536 [inline] slab_free_freelist_hook+0x190/0x210 mm/slub.c:1577 slab_free mm/slub.c:3138 [inline] kfree+0x13c/0x460 mm/slub.c:4119 bond_free_slave+0x8c/0xf8 drivers/net/bonding/bond_main.c:1492 __bond_release_one+0xe0c/0xec8 drivers/net/bonding/bond_main.c:2190 bond_slave_netdev_event drivers/net/bonding/bond_main.c:3309 [inline] bond_netdev_event+0x8f0/0xa70 drivers/net/bonding/bond_main.c:3420 notifier_call_chain+0xf0/0x200 kernel/notifier.c:83 __raw_notifier_call_chain kernel/notifier.c:361 [inline] raw_notifier_call_chain+0x44/0x58 kernel/notifier.c:368 call_netdevice_notifiers_info+0xbc/0x150 net/core/dev.c:2033 call_netdevice_notifiers_extack net/core/dev.c:2045 [inline] call_netdevice_notifiers net/core/dev.c:2059 [inline] rollback_registered_many+0x6a4/0xec0 net/core/dev.c:9347 unregister_netdevice_many.part.0+0x2c/0x1c0 net/core/dev.c:10509 unregister_netdevice_many net/core/dev.c:10508 [inline] default_device_exit_batch+0x294/0x338 net/core/dev.c:10992 ops_exit_list.isra.0+0xec/0x150 net/core/net_namespace.c:189 cleanup_net+0x44c/0x888 net/core/net_namespace.c:603 process_one_work+0x96c/0x18c0 kernel/workqueue.c:2269 worker_thread+0x3f0/0xc30 kernel/workqueue.c:2415 kthread+0x390/0x498 kernel/kthread.c:292 ret_from_fork+0x10/0x18 arch/arm64/kernel/entry.S:925 This is a potential use-after-free if the sysfs nodes are being accessed whilst removing the struct slave, so wait for the object destruction to complete before freeing the struct slave itself. 
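The rule being applied is the standard kobject lifetime pattern, sketched here with generic names (this is the generic pattern, not the bonding code itself): a structure that embeds a kobject may only be freed from its ktype ->release() callback, which the kobject core invokes once the last reference is dropped.

	#include <linux/kobject.h>
	#include <linux/slab.h>

	struct foo {
		struct kobject kobj;
		/* payload */
	};

	static void foo_release(struct kobject *kobj)
	{
		/* runs after the final kobject_put(), when no sysfs user can
		 * still reach the object
		 */
		kfree(container_of(kobj, struct foo, kobj));
	}

	static struct kobj_type foo_ktype = {
		.release = foo_release,
	};

	static void foo_destroy(struct foo *foo)
	{
		/* drop our reference instead of calling kfree() directly */
		kobject_put(&foo->kobj);
	}

Calling kfree() on the embedding structure directly, as the old bond_free_slave() did, races with concurrent sysfs access; CONFIG_DEBUG_KOBJECT_RELEASE exists precisely to flush out such bugs by delaying the release.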
Fixes: 07699f9a7c8d ("bonding: add sysfs /slave dir for bond slave devices.") Fixes: a068aab42258 ("bonding: Fix reference count leak in bond_sysfs_slave_add.") Cc: Qiushi Wu Cc: Jay Vosburgh Cc: Veaceslav Falico Cc: Andy Gospodarek Signed-off-by: Jamie Iles Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20201120142827.879226-1-jamie@nuviainc.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 61 ++++++++++++++++++++++++---------- drivers/net/bonding/bond_sysfs_slave.c | 18 +--------- include/net/bonding.h | 8 +++++ 3 files changed, 52 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 84ecbc6fa0ff..47afc5938c26 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1460,7 +1460,39 @@ static void bond_upper_dev_unlink(struct bonding *bond, struct slave *slave) slave->dev->flags &= ~IFF_SLAVE; } -static struct slave *bond_alloc_slave(struct bonding *bond) +static void slave_kobj_release(struct kobject *kobj) +{ + struct slave *slave = to_slave(kobj); + struct bonding *bond = bond_get_bond_by_slave(slave); + + cancel_delayed_work_sync(&slave->notify_work); + if (BOND_MODE(bond) == BOND_MODE_8023AD) + kfree(SLAVE_AD_INFO(slave)); + + kfree(slave); +} + +static struct kobj_type slave_ktype = { + .release = slave_kobj_release, +#ifdef CONFIG_SYSFS + .sysfs_ops = &slave_sysfs_ops, +#endif +}; + +static int bond_kobj_init(struct slave *slave) +{ + int err; + + err = kobject_init_and_add(&slave->kobj, &slave_ktype, + &(slave->dev->dev.kobj), "bonding_slave"); + if (err) + kobject_put(&slave->kobj); + + return err; +} + +static struct slave *bond_alloc_slave(struct bonding *bond, + struct net_device *slave_dev) { struct slave *slave = NULL; @@ -1468,11 +1500,17 @@ static struct slave *bond_alloc_slave(struct bonding *bond) if (!slave) return NULL; + slave->bond = bond; + slave->dev = slave_dev; + + if (bond_kobj_init(slave)) + return NULL; + if (BOND_MODE(bond) == BOND_MODE_8023AD) { SLAVE_AD_INFO(slave) = kzalloc(sizeof(struct ad_slave_info), GFP_KERNEL); if (!SLAVE_AD_INFO(slave)) { - kfree(slave); + kobject_put(&slave->kobj); return NULL; } } @@ -1481,17 +1519,6 @@ static struct slave *bond_alloc_slave(struct bonding *bond) return slave; } -static void bond_free_slave(struct slave *slave) -{ - struct bonding *bond = bond_get_bond_by_slave(slave); - - cancel_delayed_work_sync(&slave->notify_work); - if (BOND_MODE(bond) == BOND_MODE_8023AD) - kfree(SLAVE_AD_INFO(slave)); - - kfree(slave); -} - static void bond_fill_ifbond(struct bonding *bond, struct ifbond *info) { info->bond_mode = BOND_MODE(bond); @@ -1678,14 +1705,12 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, goto err_undo_flags; } - new_slave = bond_alloc_slave(bond); + new_slave = bond_alloc_slave(bond, slave_dev); if (!new_slave) { res = -ENOMEM; goto err_undo_flags; } - new_slave->bond = bond; - new_slave->dev = slave_dev; /* Set the new_slave's queue_id to be zero. Queue ID mapping * is set via sysfs or module option if desired. 
*/ @@ -2007,7 +2032,7 @@ err_restore_mtu: dev_set_mtu(slave_dev, new_slave->original_mtu); err_free: - bond_free_slave(new_slave); + kobject_put(&new_slave->kobj); err_undo_flags: /* Enslave of first slave has failed and we need to fix master's mac */ @@ -2187,7 +2212,7 @@ static int __bond_release_one(struct net_device *bond_dev, if (!netif_is_bond_master(slave_dev)) slave_dev->priv_flags &= ~IFF_BONDING; - bond_free_slave(slave); + kobject_put(&slave->kobj); return 0; } diff --git a/drivers/net/bonding/bond_sysfs_slave.c b/drivers/net/bonding/bond_sysfs_slave.c index 9b8346638f69..fd07561da034 100644 --- a/drivers/net/bonding/bond_sysfs_slave.c +++ b/drivers/net/bonding/bond_sysfs_slave.c @@ -121,7 +121,6 @@ static const struct slave_attribute *slave_attrs[] = { }; #define to_slave_attr(_at) container_of(_at, struct slave_attribute, attr) -#define to_slave(obj) container_of(obj, struct slave, kobj) static ssize_t slave_show(struct kobject *kobj, struct attribute *attr, char *buf) @@ -132,28 +131,15 @@ static ssize_t slave_show(struct kobject *kobj, return slave_attr->show(slave, buf); } -static const struct sysfs_ops slave_sysfs_ops = { +const struct sysfs_ops slave_sysfs_ops = { .show = slave_show, }; -static struct kobj_type slave_ktype = { -#ifdef CONFIG_SYSFS - .sysfs_ops = &slave_sysfs_ops, -#endif -}; - int bond_sysfs_slave_add(struct slave *slave) { const struct slave_attribute **a; int err; - err = kobject_init_and_add(&slave->kobj, &slave_ktype, - &(slave->dev->dev.kobj), "bonding_slave"); - if (err) { - kobject_put(&slave->kobj); - return err; - } - for (a = slave_attrs; *a; ++a) { err = sysfs_create_file(&slave->kobj, &((*a)->attr)); if (err) { @@ -171,6 +157,4 @@ void bond_sysfs_slave_del(struct slave *slave) for (a = slave_attrs; *a; ++a) sysfs_remove_file(&slave->kobj, &((*a)->attr)); - - kobject_put(&slave->kobj); } diff --git a/include/net/bonding.h b/include/net/bonding.h index 7d132cc1e584..d9d0ff3b0ad3 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -185,6 +185,11 @@ struct slave { struct rtnl_link_stats64 slave_stats; }; +static inline struct slave *to_slave(struct kobject *kobj) +{ + return container_of(kobj, struct slave, kobj); +} + struct bond_up_slave { unsigned int count; struct rcu_head rcu; @@ -750,6 +755,9 @@ extern struct bond_parm_tbl ad_select_tbl[]; /* exported from bond_netlink.c */ extern struct rtnl_link_ops bond_link_ops; +/* exported from bond_sysfs_slave.c */ +extern const struct sysfs_ops slave_sysfs_ops; + static inline netdev_tx_t bond_tx_drop(struct net_device *dev, struct sk_buff *skb) { atomic_long_inc(&dev->tx_dropped); -- cgit v1.2.3 From bc2dc4406c463174613047d8b7946e12c8808cda Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Sat, 21 Nov 2020 22:17:01 -0800 Subject: compiler-clang: remove version check for BPF Tracing bpftrace parses the kernel headers and uses Clang under the hood. Remove the version check when __BPF_TRACING__ is defined (as bpftrace does) so that this tool can continue to parse kernel headers, even with older clang sources. 
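A sketch of the consumer side (this mirrors what bpftrace effectively does, per the description above; the exact include is illustrative): defining the macro before pulling in kernel headers makes the Clang version #error below a no-op.

	/* tool code that only parses kernel headers with an older Clang */
	#define __BPF_TRACING__
	#include <linux/compiler_types.h>	/* pulls in compiler-clang.h */

Ordinary kernel builds do not define __BPF_TRACING__ and so keep the 10.0.1 version requirement.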
Fixes: 1f7a44f63e6c ("compiler-clang: add build check for clang 10.0.1") Reported-by: Chen Yu Reported-by: Jarkko Sakkinen Signed-off-by: Nick Desaulniers Signed-off-by: Andrew Morton Tested-by: Jarkko Sakkinen Acked-by: Jarkko Sakkinen Acked-by: Song Liu Acked-by: Nathan Chancellor Acked-by: Miguel Ojeda Link: https://lkml.kernel.org/r/20201104191052.390657-1-ndesaulniers@google.com Signed-off-by: Linus Torvalds --- include/linux/compiler-clang.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index dd7233c48bf3..98cff1b4b088 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -8,8 +8,10 @@ + __clang_patchlevel__) #if CLANG_VERSION < 100001 +#ifndef __BPF_TRACING__ # error Sorry, your version of Clang is too old - please use 10.0.1 or newer. #endif +#endif /* Compiler specific definitions for Clang compiler */ -- cgit v1.2.3 From a927bd6ba952d13c52b8b385030943032f659a3e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 21 Nov 2020 22:17:05 -0800 Subject: mm: fix phys_to_target_node() and memory_add_physaddr_to_nid() exports The core-mm has a default __weak implementation of phys_to_target_node() to mirror the weak definition of memory_add_physaddr_to_nid(). That symbol is exported for modules. However, while the export in mm/memory_hotplug.c exported the symbol in the configuration cases of: CONFIG_NUMA_KEEP_MEMINFO=y CONFIG_MEMORY_HOTPLUG=y ...and: CONFIG_NUMA_KEEP_MEMINFO=n CONFIG_MEMORY_HOTPLUG=y ...it failed to export the symbol in the case of: CONFIG_NUMA_KEEP_MEMINFO=y CONFIG_MEMORY_HOTPLUG=n Not only is that broken, but Christoph points out that the kernel should not be exporting any __weak symbol, which means that the memory_add_physaddr_to_nid() example that phys_to_target_node() copied is broken too. Rework the definition of phys_to_target_node() and memory_add_physaddr_to_nid() to not require weak symbols. Move to the common arch override design-pattern of an asm header defining a symbol to replace the default implementation. The only common header that all memory_add_physaddr_to_nid() producing architectures implement is asm/sparsemem.h. In fact, powerpc already defines its memory_add_physaddr_to_nid() helper in sparsemem.h. Double down on that observation and define phys_to_target_node() where necessary in asm/sparsemem.h. An alternate consideration that was discarded was to put this override in asm/numa.h, but that entangles with the definition of MAX_NUMNODES relative to the inclusion of linux/nodemask.h, and requires powerpc to grow a new header. The dependency on NUMA_KEEP_MEMINFO for DEV_DAX_HMEM_DEVICES is invalid now that the symbol is properly exported / stubbed in all combinations of CONFIG_NUMA_KEEP_MEMINFO and CONFIG_MEMORY_HOTPLUG.
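Boiled down, the override pattern looks like this (a generic sketch of the hunks that follow, not a new API):

	/* arch header, e.g. asm/sparsemem.h: the architecture declares the
	 * symbol and defines a macro of the same name to advertise it
	 */
	int memory_add_physaddr_to_nid(u64 start);
	#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid

	/* generic header, e.g. linux/numa.h: supply a fallback only when no
	 * arch override is visible, with no __weak symbol involved
	 */
	#ifndef memory_add_physaddr_to_nid
	static inline int memory_add_physaddr_to_nid(u64 start)
	{
		return 0;	/* default to node 0 */
	}
	#endif

Since the fallback is a static inline rather than a __weak exported function, every CONFIG combination sees a definition and the export gap described above disappears.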
[dan.j.williams@intel.com: v4] Link: https://lkml.kernel.org/r/160461461867.1505359.5301571728749534585.stgit@dwillia2-desk3.amr.corp.intel.com [dan.j.williams@intel.com: powerpc: fix create_section_mapping compile warning] Link: https://lkml.kernel.org/r/160558386174.2948926.2740149041249041764.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: a035b6bf863e ("mm/memory_hotplug: introduce default phys_to_target_node() implementation") Reported-by: Randy Dunlap Reported-by: Thomas Gleixner Reported-by: kernel test robot Reported-by: Christoph Hellwig Signed-off-by: Dan Williams Signed-off-by: Andrew Morton Tested-by: Randy Dunlap Tested-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Cc: Joao Martins Cc: Tony Luck Cc: Fenghua Yu Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Vishal Verma Cc: Stephen Rothwell Link: https://lkml.kernel.org/r/160447639846.1133764.7044090803980177548.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/sparsemem.h | 6 ++++++ arch/powerpc/include/asm/mmzone.h | 5 +++++ arch/powerpc/include/asm/sparsemem.h | 5 ++--- arch/powerpc/mm/mem.c | 1 + arch/x86/include/asm/sparsemem.h | 10 ++++++++++ arch/x86/mm/numa.c | 2 ++ drivers/dax/Kconfig | 1 - include/linux/memory_hotplug.h | 14 -------------- include/linux/numa.h | 30 +++++++++++++++++++++++++++++- mm/memory_hotplug.c | 18 ------------------ 10 files changed, 55 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/arch/ia64/include/asm/sparsemem.h b/arch/ia64/include/asm/sparsemem.h index 336d0570e1fa..dd8c166ffd7b 100644 --- a/arch/ia64/include/asm/sparsemem.h +++ b/arch/ia64/include/asm/sparsemem.h @@ -18,4 +18,10 @@ #endif #endif /* CONFIG_SPARSEMEM */ + +#ifdef CONFIG_MEMORY_HOTPLUG +int memory_add_physaddr_to_nid(u64 addr); +#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid +#endif + #endif /* _ASM_IA64_SPARSEMEM_H */ diff --git a/arch/powerpc/include/asm/mmzone.h b/arch/powerpc/include/asm/mmzone.h index 91c69ff53a8a..6cda76b57c5d 100644 --- a/arch/powerpc/include/asm/mmzone.h +++ b/arch/powerpc/include/asm/mmzone.h @@ -46,5 +46,10 @@ u64 memory_hotplug_max(void); #define __HAVE_ARCH_RESERVED_KERNEL_PAGES #endif +#ifdef CONFIG_MEMORY_HOTPLUG +extern int create_section_mapping(unsigned long start, unsigned long end, + int nid, pgprot_t prot); +#endif + #endif /* __KERNEL__ */ #endif /* _ASM_MMZONE_H_ */ diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h index 1e6fa371cc38..d072866842e4 100644 --- a/arch/powerpc/include/asm/sparsemem.h +++ b/arch/powerpc/include/asm/sparsemem.h @@ -13,9 +13,9 @@ #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_MEMORY_HOTPLUG -extern int create_section_mapping(unsigned long start, unsigned long end, - int nid, pgprot_t prot); extern int remove_section_mapping(unsigned long start, unsigned long end); +extern int memory_add_physaddr_to_nid(u64 start); +#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid #ifdef CONFIG_NUMA extern int hot_add_scn_to_nid(unsigned long scn_addr); @@ -26,6 +26,5 @@ static inline int hot_add_scn_to_nid(unsigned long scn_addr) } #endif /* CONFIG_NUMA */ #endif /* CONFIG_MEMORY_HOTPLUG */ - #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_SPARSEMEM_H */ diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 01ec2a252f09..3fc325bebe4d 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -50,6 +50,7 @@ #include #include #include +#include #include diff --git 
a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 6bfc878f6771..6a9ccc1b2be5 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -28,4 +28,14 @@ #endif #endif /* CONFIG_SPARSEMEM */ + +#ifndef __ASSEMBLY__ +#ifdef CONFIG_NUMA_KEEP_MEMINFO +extern int phys_to_target_node(phys_addr_t start); +#define phys_to_target_node phys_to_target_node +extern int memory_add_physaddr_to_nid(u64 start); +#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid +#endif +#endif /* __ASSEMBLY__ */ + #endif /* _ASM_X86_SPARSEMEM_H */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 44148691d78b..5eb4dc2b97da 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -938,6 +938,7 @@ int phys_to_target_node(phys_addr_t start) return meminfo_to_nid(&numa_reserved_meminfo, start); } +EXPORT_SYMBOL_GPL(phys_to_target_node); int memory_add_physaddr_to_nid(u64 start) { @@ -947,4 +948,5 @@ int memory_add_physaddr_to_nid(u64 start) nid = numa_meminfo.blk[0].nid; return nid; } +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index 567428e10b7b..d2834c2cfa10 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -50,7 +50,6 @@ config DEV_DAX_HMEM Say M if unsure. config DEV_DAX_HMEM_DEVICES - depends on NUMA_KEEP_MEMINFO # for phys_to_target_node() depends on DEV_DAX_HMEM && DAX=y def_bool y diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index d65c6fdc5cfc..551093b74596 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -281,20 +281,6 @@ static inline bool movable_node_is_enabled(void) } #endif /* ! CONFIG_MEMORY_HOTPLUG */ -#ifdef CONFIG_NUMA -extern int memory_add_physaddr_to_nid(u64 start); -extern int phys_to_target_node(u64 start); -#else -static inline int memory_add_physaddr_to_nid(u64 start) -{ - return 0; -} -static inline int phys_to_target_node(u64 start) -{ - return 0; -} -#endif - #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) /* * pgdat resizing functions diff --git a/include/linux/numa.h b/include/linux/numa.h index 8cb33ccfb671..cb44cfe2b725 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -21,13 +21,41 @@ #endif #ifdef CONFIG_NUMA +#include +#include + /* Generic implementation available */ int numa_map_to_online_node(int node); -#else + +#ifndef memory_add_physaddr_to_nid +static inline int memory_add_physaddr_to_nid(u64 start) +{ + pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n", + start); + return 0; +} +#endif +#ifndef phys_to_target_node +static inline int phys_to_target_node(u64 start) +{ + pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", + start); + return 0; +} +#endif +#else /* !CONFIG_NUMA */ static inline int numa_map_to_online_node(int node) { return NUMA_NO_NODE; } +static inline int memory_add_physaddr_to_nid(u64 start) +{ + return 0; +} +static inline int phys_to_target_node(u64 start) +{ + return 0; +} #endif #endif /* _LINUX_NUMA_H */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b44d4c7ba73b..63b2e46b6555 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -350,24 +350,6 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, return err; } -#ifdef CONFIG_NUMA -int __weak memory_add_physaddr_to_nid(u64 start) -{ - pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n", - start); - return 0; -} 
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); - -int __weak phys_to_target_node(u64 start) -{ - pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n", - start); - return 0; -} -EXPORT_SYMBOL_GPL(phys_to_target_node); -#endif - /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, -- cgit v1.2.3 From 4349a83a3190c1d4414371161b0f4a4c3ccd3f9d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 21 Nov 2020 22:17:08 -0800 Subject: mm: fix readahead_page_batch for retry entries Both btrfs and fuse have reported faults caused by seeing a retry entry instead of the page they were looking for. This was caused by a missing check in the iterator. As can be seen in the panic log below, accessing 0x402 causes a panic. In xarray.h, 0x402 means RETRY_ENTRY. BUG: kernel NULL pointer dereference, address: 0000000000000402 CPU: 14 PID: 306003 Comm: as Not tainted 5.9.0-1-amd64 #1 Debian 5.9.1-1 Hardware name: Lenovo ThinkSystem SR665/7D2VCTO1WW, BIOS D8E106Q-1.01 05/30/2020 RIP: 0010:fuse_readahead+0x152/0x470 [fuse] Code: 41 8b 57 18 4c 8d 54 10 ff 4c 89 d6 48 8d 7c 24 10 e8 d2 e3 28 f9 48 85 c0 0f 84 fe 00 00 00 44 89 f2 49 89 04 d4 44 8d 72 01 <48> 8b 10 41 8b 4f 1c 48 c1 ea 10 83 e2 01 80 fa 01 19 d2 81 e2 01 RSP: 0018:ffffad99ceaebc50 EFLAGS: 00010246 RAX: 0000000000000402 RBX: 0000000000000001 RCX: 0000000000000002 RDX: 0000000000000000 RSI: ffff94c5af90bd98 RDI: ffffad99ceaebc60 RBP: ffff94ddc1749a00 R08: 0000000000000402 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000100 R12: ffff94de6c429ce0 R13: ffff94de6c4d3700 R14: 0000000000000001 R15: ffffad99ceaebd68 FS: 00007f228c5c7040(0000) GS:ffff94de8ed80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000402 CR3: 0000001dbd9b4000 CR4: 0000000000350ee0 Call Trace: read_pages+0x83/0x270 page_cache_readahead_unbounded+0x197/0x230 generic_file_buffered_read+0x57a/0xa20 new_sync_read+0x112/0x1a0 vfs_read+0xf8/0x180 ksys_read+0x5f/0xe0 do_syscall_64+0x33/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 042124cc64c3 ("mm: add new readahead_control API") Reported-by: David Sterba Reported-by: Wonhyuk Yang Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Cc: Link: https://lkml.kernel.org/r/20201103142852.8543-1-willy@infradead.org Link: https://lkml.kernel.org/r/20201103124349.16722-1-vvghjk1234@gmail.com Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e1e19c1f9ec9..d5570deff400 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -906,6 +906,8 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac, xas_set(&xas, rac->_index); rcu_read_lock(); xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) { + if (xas_retry(&xas, page)) + continue; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageTail(page), page); array[i++] = page; -- cgit v1.2.3 From 01770a166165738a6e05c3d911fb4609cc4eb416 Mon Sep 17 00:00:00 2001 From: Ricardo Dias Date: Fri, 20 Nov 2020 11:11:33 +0000 Subject: tcp: fix race condition when creating child sockets from syncookies When the TCP stack is in SYN flood mode, the server child socket is created from the SYN cookie received in a TCP packet with the ACK flag set.
The child socket is created when the server receives the first TCP packet with a valid SYN cookie from the client. Usually, this packet corresponds to the final step of the TCP 3-way handshake, the ACK packet. But it is also possible to receive a valid SYN cookie from the first TCP data packet sent by the client, and thus create a child socket from that SYN cookie. Since a client socket is ready to send data as soon as it receives the SYN+ACK packet from the server, the client can send the ACK packet (sent by the TCP stack code) and the first data packet (sent by the userspace program) almost at the same time, and thus the server will receive the two TCP packets, both with valid SYN cookies, almost at the same instant. When such an event happens, the TCP stack code has a race condition that occurs between the moment a lookup is done in the established connections hashtable to check for the existence of a connection for the same client, and the moment the child socket is added to the established connections hashtable. As a consequence, this race condition can lead to a situation where we add two child sockets to the established connections hashtable and deliver two sockets for the same client to the userspace program. This patch fixes the race condition by checking whether an existing child socket exists for the same client when we are adding the second child socket to the established connections hashtable. If an existing child socket exists, we drop the packet and discard the second child socket. Signed-off-by: Ricardo Dias Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20201120111133.GA67501@rdias-suse-pc.lan Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 5 +-- net/dccp/ipv4.c | 2 +- net/dccp/ipv6.c | 2 +- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/inet_hashtables.c | 68 ++++++++++++++++++++++++++++++++++++----- net/ipv4/tcp_ipv4.c | 15 +++++++-- net/ipv6/tcp_ipv6.c | 13 +++++++- 7 files changed, 91 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 92560974ea67..ca6a3ea9057e 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -247,8 +247,9 @@ void inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long high_limit); int inet_hashinfo2_init_mod(struct inet_hashinfo *h); -bool inet_ehash_insert(struct sock *sk, struct sock *osk); -bool inet_ehash_nolisten(struct sock *sk, struct sock *osk); +bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk); +bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, + bool *found_dup_sk); int __inet_hash(struct sock *sk, struct sock *osk); int inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index bb3d70664dde..b0b6e6a4784e 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -427,7 +427,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); if (*own_req) ireq->ireq_opt = NULL; else diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index ef4ab28cfde0..78ee1b5acf1f 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -533,7 +533,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, dccp_done(newsk); goto out; } - *own_req = inet_ehash_nolisten(newsk,
req_to_sk(req_unhash)); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); /* Clone pktoptions received with SYN, if we own the req */ if (*own_req && ireq->pktopts) { newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4148f5f78f31..f60869acbef0 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -787,7 +787,7 @@ static void reqsk_queue_hash_req(struct request_sock *req, timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); mod_timer(&req->rsk_timer, jiffies + timeout); - inet_ehash_insert(req_to_sk(req), NULL); + inet_ehash_insert(req_to_sk(req), NULL, NULL); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. */ diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 8cbe74313f38..45fb450b4522 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -20,6 +20,9 @@ #include #include #include +#if IS_ENABLED(CONFIG_IPV6) +#include +#endif #include #include #include @@ -508,10 +511,52 @@ static u32 inet_sk_port_offset(const struct sock *sk) inet->inet_dport); } -/* insert a socket into ehash, and eventually remove another one - * (The another one can be a SYN_RECV or TIMEWAIT */ -bool inet_ehash_insert(struct sock *sk, struct sock *osk) +/* Searches for an existing socket in the ehash bucket list. + * Returns true if found, false otherwise. + */ +static bool inet_ehash_lookup_by_sk(struct sock *sk, + struct hlist_nulls_head *list) +{ + const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num); + const int sdif = sk->sk_bound_dev_if; + const int dif = sk->sk_bound_dev_if; + const struct hlist_nulls_node *node; + struct net *net = sock_net(sk); + struct sock *esk; + + INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr); + + sk_nulls_for_each_rcu(esk, node, list) { + if (esk->sk_hash != sk->sk_hash) + continue; + if (sk->sk_family == AF_INET) { + if (unlikely(INET_MATCH(esk, net, acookie, + sk->sk_daddr, + sk->sk_rcv_saddr, + ports, dif, sdif))) { + return true; + } + } +#if IS_ENABLED(CONFIG_IPV6) + else if (sk->sk_family == AF_INET6) { + if (unlikely(INET6_MATCH(esk, net, + &sk->sk_v6_daddr, + &sk->sk_v6_rcv_saddr, + ports, dif, sdif))) { + return true; + } + } +#endif + } + return false; +} + +/* Insert a socket into ehash, and eventually remove another one + * (The another one can be a SYN_RECV or TIMEWAIT) + * If an existing socket already exists, socket sk is not inserted, + * and the found_dup_sk parameter is set to true.
+ */ +bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; @@ -530,16 +575,23 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk) if (osk) { WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); ret = sk_nulls_del_node_init_rcu(osk); + } else if (found_dup_sk) { + *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); + if (*found_dup_sk) + ret = false; } + if (ret) __sk_nulls_add_node_rcu(sk, list); + spin_unlock(lock); + return ret; } -bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) +bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) { - bool ok = inet_ehash_insert(sk, osk); + bool ok = inet_ehash_insert(sk, osk, found_dup_sk); if (ok) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -583,7 +635,7 @@ int __inet_hash(struct sock *sk, struct sock *osk) int err = 0; if (sk->sk_state != TCP_LISTEN) { - inet_ehash_nolisten(sk, osk); + inet_ehash_nolisten(sk, osk, NULL); return 0; } WARN_ON(!sk_unhashed(sk)); @@ -679,7 +731,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - inet_ehash_nolisten(sk, NULL); + inet_ehash_nolisten(sk, NULL, NULL); spin_unlock_bh(&head->lock); return 0; } @@ -758,7 +810,7 @@ ok: inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); - inet_ehash_nolisten(sk, (struct sock *)tw); + inet_ehash_nolisten(sk, (struct sock *)tw, NULL); } if (tw) inet_twsk_bind_unhash(tw, hinfo); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 25d35a56ceba..c95fcdfeed42 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1499,6 +1499,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, bool *own_req) { struct inet_request_sock *ireq; + bool found_dup_sk = false; struct inet_sock *newinet; struct tcp_sock *newtp; struct sock *newsk; @@ -1576,12 +1577,22 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), + &found_dup_sk); if (likely(*own_req)) { tcp_move_syn(newtp, req); ireq->ireq_opt = NULL; } else { - newinet->inet_opt = NULL; + if (!req_unhash && found_dup_sk) { + /* This code path should only be executed in the + * syncookie case only + */ + bh_unlock_sock(newsk); + sock_put(newsk); + newsk = NULL; + } else { + newinet->inet_opt = NULL; + } } return newsk; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 3d49e8d0afee..d2502911b7fa 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1193,6 +1193,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * const struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct ipv6_txoptions *opt; struct inet_sock *newinet; + bool found_dup_sk = false; struct tcp_sock *newtp; struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG @@ -1368,7 +1369,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * tcp_done(newsk); goto out; } - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), + &found_dup_sk); if (*own_req) { tcp_move_syn(newtp, req); @@ -1383,6 +1385,15 @@ static struct sock *tcp_v6_syn_recv_sock(const struct 
sock *sk, struct sk_buff * skb_set_owner_r(newnp->pktoptions, newsk); } } + } else { + if (!req_unhash && found_dup_sk) { + /* This code path should only be executed in the + * syncookie case only + */ + bh_unlock_sock(newsk); + sock_put(newsk); + newsk = NULL; + } } return newsk; -- cgit v1.2.3 From d549699048b4b5c22dd710455bcdb76966e55aa3 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Sat, 21 Nov 2020 08:28:17 +0200 Subject: net/packet: fix packet receive on L3 devices without visible hard header In the patchset merged by commit b9fcf0a0d826 ("Merge branch 'support-AF_PACKET-for-layer-3-devices'") L3 devices which did not have header_ops were given one for the purpose of protocol parsing on af_packet transmit path. That change made af_packet receive path regard these devices as having a visible L3 header and therefore aligned incoming skb->data to point to the skb's mac_header. Some devices, such as ipip, xfrmi, and others, do not reset their mac_header prior to ingress and therefore their incoming packets became malformed. Ideally these devices would reset their mac headers, or af_packet would be able to rely on dev->hard_header_len being 0 for such cases, but it seems this is not the case. Fix by changing af_packet RX ll visibility criteria to include the existence of a '.create()' header operation, which is used when creating a device hard header - via dev_hard_header() - by upper layers, and does not exist in these L3 devices. As this predicate may be useful in other situations, add it as a common dev_has_header() helper in netdevice.h. Fixes: b9fcf0a0d826 ("Merge branch 'support-AF_PACKET-for-layer-3-devices'") Signed-off-by: Eyal Birger Acked-by: Jason A. Donenfeld Acked-by: Willem de Bruijn Link: https://lore.kernel.org/r/20201121062817.3178900-1-eyal.birger@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 5 +++++ net/packet/af_packet.c | 18 +++++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 964b494b0e8d..fa275a054f46 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3137,6 +3137,11 @@ static inline bool dev_validate_header(const struct net_device *dev, return false; } +static inline bool dev_has_header(const struct net_device *dev) +{ + return dev->header_ops && dev->header_ops->create; +} + typedef int gifconf_func_t(struct net_device * dev, char __user * bufptr, int len, int size); int register_gifconf(unsigned int family, gifconf_func_t *gifconf); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index cefbd50c1090..7a18ffff8551 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -93,8 +93,8 @@ /* Assumptions: - - If the device has no dev->header_ops, there is no LL header visible - above the device. In this case, its hard_header_len should be 0. + - If the device has no dev->header_ops->create, there is no LL header + visible above the device. In this case, its hard_header_len should be 0. The device may prepend its own header internally. In this case, its needed_headroom should be set to the space needed for it to add its internal header. 
@@ -108,26 +108,26 @@ On receive: ----------- -Incoming, dev->header_ops != NULL +Incoming, dev_has_header(dev) == true mac_header -> ll header data -> data -Outgoing, dev->header_ops != NULL +Outgoing, dev_has_header(dev) == true mac_header -> ll header data -> ll header -Incoming, dev->header_ops == NULL +Incoming, dev_has_header(dev) == false mac_header -> data However drivers often make it point to the ll header. This is incorrect because the ll header should be invisible to us. data -> data -Outgoing, dev->header_ops == NULL +Outgoing, dev_has_header(dev) == false mac_header -> data. ll header is invisible to us. data -> data Resume - If dev->header_ops == NULL we are unable to restore the ll header, + If dev_has_header(dev) == false we are unable to restore the ll header, because it is invisible to us. @@ -2069,7 +2069,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, skb->dev = dev; - if (dev->header_ops) { + if (dev_has_header(dev)) { /* The device has an explicit notion of ll header, * exported to higher levels. * @@ -2198,7 +2198,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; - if (dev->header_ops) { + if (dev_has_header(dev)) { if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { -- cgit v1.2.3 From acfdd18591eaac25446e976a0c0d190f8b3dbfb1 Mon Sep 17 00:00:00 2001 From: Amit Sunil Dhamne Date: Mon, 23 Nov 2020 21:52:41 -0800 Subject: firmware: xilinx: Use hash-table for api feature check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently an array of fixed length PM_API_MAX is used to cache the pm_api version (valid or invalid). However, ATF-based PM API values are much higher than PM_API_MAX. So, to include ATF-based PM APIs as well, use a hash table to store the pm_api version status.
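For readers unfamiliar with the pattern, here is a minimal, hedged sketch of the lookup-or-insert caching scheme the patch adopts, using the <linux/hashtable.h> API; the names (cache, cache_entry, probe) are illustrative, not taken from the patch:

#include <linux/hashtable.h>
#include <linux/slab.h>

#define CACHE_ORDER 7 /* 1 << 7 = 128 buckets */

static DEFINE_HASHTABLE(cache, CACHE_ORDER);

struct cache_entry {
	u32 key;			/* lookup key, e.g. an API id */
	int status;			/* cached result, good or bad */
	struct hlist_node hentry;	/* hooks the entry into a bucket */
};

static int cache_lookup_or_insert(u32 key, int (*probe)(u32 key))
{
	struct cache_entry *e;

	/* Fast path: walk the one bucket this key hashes to. */
	hash_for_each_possible(cache, e, hentry, key) {
		if (e->key == key)
			return e->status;
	}

	/* Slow path: probe once and cache the result, even a failure. */
	e = kmalloc(sizeof(*e), GFP_KERNEL);
	if (!e)
		return -ENOMEM;

	e->key = key;
	e->status = probe(key);
	hash_add(cache, &e->hentry, key);

	return e->status;
}

Unlike the fixed array it replaces, the table is keyed by the full 32-bit id, so sparse, high-valued ATF API ids cost one small allocation each instead of an oversized array.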
Signed-off-by: Amit Sunil Dhamne Reported-by: Arnd Bergmann  Signed-off-by: Ravi Patel Signed-off-by: Rajan Vaja Reviewed-by: Arnd Bergmann Tested-by: Michal Simek Fixes: f3217d6f2f7a ("firmware: xilinx: fix out-of-bounds access") Cc: stable Link: https://lore.kernel.org/r/1606197161-25976-1-git-send-email-rajan.vaja@xilinx.com Signed-off-by: Michal Simek --- drivers/firmware/xilinx/zynqmp.c | 63 ++++++++++++++++++++++++++++-------- include/linux/firmware/xlnx-zynqmp.h | 4 --- 2 files changed, 49 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 5828302c7c5c..d08ac824c993 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -20,12 +20,28 @@ #include #include #include +#include #include #include "zynqmp-debug.h" +/* Max HashMap Order for PM API feature check (1<<7 = 128) */ +#define PM_API_FEATURE_CHECK_MAX_ORDER 7 + static bool feature_check_enabled; -static u32 zynqmp_pm_features[PM_API_MAX]; +DEFINE_HASHTABLE(pm_api_features_map, PM_API_FEATURE_CHECK_MAX_ORDER); + +/** + * struct pm_api_feature_data - PM API Feature data + * @pm_api_id: PM API Id, used as key to index into hashmap + * @feature_status: status of PM API feature: valid, invalid + * @hentry: hlist_node that hooks this entry into hashtable + */ +struct pm_api_feature_data { + u32 pm_api_id; + int feature_status; + struct hlist_node hentry; +}; static const struct mfd_cell firmware_devs[] = { { @@ -142,29 +158,37 @@ static int zynqmp_pm_feature(u32 api_id) int ret; u32 ret_payload[PAYLOAD_ARG_CNT]; u64 smc_arg[2]; + struct pm_api_feature_data *feature_data; if (!feature_check_enabled) return 0; - /* Return value if feature is already checked */ - if (api_id > ARRAY_SIZE(zynqmp_pm_features)) - return PM_FEATURE_INVALID; + /* Check for existing entry in hash table for given api */ + hash_for_each_possible(pm_api_features_map, feature_data, hentry, + api_id) { + if (feature_data->pm_api_id == api_id) + return feature_data->feature_status; + } - if (zynqmp_pm_features[api_id] != PM_FEATURE_UNCHECKED) - return zynqmp_pm_features[api_id]; + /* Add new entry if not present */ + feature_data = kmalloc(sizeof(*feature_data), GFP_KERNEL); + if (!feature_data) + return -ENOMEM; + feature_data->pm_api_id = api_id; smc_arg[0] = PM_SIP_SVC | PM_FEATURE_CHECK; smc_arg[1] = api_id; ret = do_fw_call(smc_arg[0], smc_arg[1], 0, ret_payload); - if (ret) { - zynqmp_pm_features[api_id] = PM_FEATURE_INVALID; - return PM_FEATURE_INVALID; - } + if (ret) + ret = -EOPNOTSUPP; + else + ret = ret_payload[1]; - zynqmp_pm_features[api_id] = ret_payload[1]; + feature_data->feature_status = ret; + hash_add(pm_api_features_map, &feature_data->hentry, api_id); - return zynqmp_pm_features[api_id]; + return ret; } /** @@ -200,9 +224,12 @@ int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 arg0, u32 arg1, * Make sure to stay in x0 register */ u64 smc_arg[4]; + int ret; - if (zynqmp_pm_feature(pm_api_id) == PM_FEATURE_INVALID) - return -ENOTSUPP; + /* Check if feature is supported or not */ + ret = zynqmp_pm_feature(pm_api_id); + if (ret < 0) + return ret; smc_arg[0] = PM_SIP_SVC | pm_api_id; smc_arg[1] = ((u64)arg1 << 32) | arg0; @@ -1252,9 +1279,17 @@ static int zynqmp_firmware_probe(struct platform_device *pdev) static int zynqmp_firmware_remove(struct platform_device *pdev) { + struct pm_api_feature_data *feature_data; + int i; + mfd_remove_devices(&pdev->dev); zynqmp_pm_api_debugfs_exit(); + hash_for_each(pm_api_features_map, i, feature_data, 
hentry) { + hash_del(&feature_data->hentry); + kfree(feature_data); + } + return 0; } diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 5968df82b991..41a1bab98b7e 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -50,10 +50,6 @@ #define ZYNQMP_PM_CAPABILITY_WAKEUP 0x4U #define ZYNQMP_PM_CAPABILITY_UNUSABLE 0x8U -/* Feature check status */ -#define PM_FEATURE_INVALID -1 -#define PM_FEATURE_UNCHECKED 0 - /* * Firmware FPGA Manager flags * XILINX_ZYNQMP_PM_FPGA_FULL: FPGA full reconfiguration -- cgit v1.2.3 From 5204bb683c1633e550c2124ccc2358dd645a80db Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Mon, 23 Nov 2020 07:36:25 +0200 Subject: devlink: Fix reload stats structure Fix the reload stats structure exposed to the user. Change the stats structure hierarchy to have the reload action as a parent of the stat entry, with each stat entry holding a value per limit. This will also help to avoid string concatenation on iproute2 output. Reload stats structure before this fix: "stats": { "reload": { "driver_reinit": 2, "fw_activate": 1, "fw_activate_no_reset": 0 } } After this fix: "stats": { "reload": { "driver_reinit": { "unspecified": 2 }, "fw_activate": { "unspecified": 1, "no_reset": 0 } } } Fixes: a254c264267e ("devlink: Add reload stats") Signed-off-by: Moshe Shemesh Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/1606109785-25197-1-git-send-email-moshe@mellanox.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 49 +++++++++++++++++++++++++++++--------------- 2 files changed, 35 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 0113bc4db9f5..5203f54a2be1 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -526,6 +526,8 @@ enum devlink_attr { DEVLINK_ATTR_RELOAD_STATS_LIMIT, /* u8 */ DEVLINK_ATTR_RELOAD_STATS_VALUE, /* u32 */ DEVLINK_ATTR_REMOTE_RELOAD_STATS, /* nested */ + DEVLINK_ATTR_RELOAD_ACTION_INFO, /* nested */ + DEVLINK_ATTR_RELOAD_ACTION_STATS, /* nested */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 4b0211590aac..c91e15b7a2bd 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -517,7 +517,7 @@ devlink_reload_limit_is_supported(struct devlink *devlink, enum devlink_reload_l return test_bit(limit, &devlink->ops->reload_limits); } -static int devlink_reload_stat_put(struct sk_buff *msg, enum devlink_reload_action action, +static int devlink_reload_stat_put(struct sk_buff *msg, enum devlink_reload_limit limit, u32 value) { struct nlattr *reload_stats_entry; @@ -526,8 +526,7 @@ static int devlink_reload_stat_put(struct sk_buff *msg, enum devlink_reload_acti if (!reload_stats_entry) return -EMSGSIZE; - if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, action) || - nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) || + if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) || nla_put_u32(msg, DEVLINK_ATTR_RELOAD_STATS_VALUE, value)) goto nla_put_failure; nla_nest_end(msg, reload_stats_entry); @@ -540,7 +539,7 @@ nla_put_failure: static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink, bool is_remote) { - struct nlattr *reload_stats_attr; + struct nlattr *reload_stats_attr, *act_info, *act_stats; int i, j, stat_idx; u32 value; @@ -552,17 +551,29 @@ static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink
*devlink if (!reload_stats_attr) return -EMSGSIZE; - for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) { - /* Remote stats are shown even if not locally supported. Stats - * of actions with unspecified limit are shown though drivers - * don't need to register unspecified limit. - */ - if (!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC && - !devlink_reload_limit_is_supported(devlink, j)) + for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) { + if ((!is_remote && + !devlink_reload_action_is_supported(devlink, i)) || + i == DEVLINK_RELOAD_ACTION_UNSPEC) continue; - for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) { - if ((!is_remote && !devlink_reload_action_is_supported(devlink, i)) || - i == DEVLINK_RELOAD_ACTION_UNSPEC || + act_info = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_INFO); + if (!act_info) + goto nla_put_failure; + + if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, i)) + goto action_info_nest_cancel; + act_stats = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_STATS); + if (!act_stats) + goto action_info_nest_cancel; + + for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) { + /* Remote stats are shown even if not locally supported. + * Stats of actions with unspecified limit are shown + * though drivers don't need to register unspecified + * limit. + */ + if ((!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC && + !devlink_reload_limit_is_supported(devlink, j)) || devlink_reload_combination_is_invalid(i, j)) continue; @@ -571,13 +582,19 @@ static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink value = devlink->stats.reload_stats[stat_idx]; else value = devlink->stats.remote_reload_stats[stat_idx]; - if (devlink_reload_stat_put(msg, i, j, value)) - goto nla_put_failure; + if (devlink_reload_stat_put(msg, j, value)) + goto action_stats_nest_cancel; } + nla_nest_end(msg, act_stats); + nla_nest_end(msg, act_info); } nla_nest_end(msg, reload_stats_attr); return 0; +action_stats_nest_cancel: + nla_nest_cancel(msg, act_stats); +action_info_nest_cancel: + nla_nest_cancel(msg, act_info); nla_put_failure: nla_nest_cancel(msg, reload_stats_attr); return -EMSGSIZE; -- cgit v1.2.3 From fdeb17c70c9ecae655378761accf5a26a55a33cf Mon Sep 17 00:00:00 2001 From: Hui Su Date: Wed, 25 Nov 2020 00:52:05 +0800 Subject: trace: fix potentially dangerous pointer bdi_dev_name() returns a char[64], while __entry->name is a char[32]. It may be dangerous to TP_printk("%s", __entry->name) after the strncpy(), because strncpy() does not NUL-terminate the destination when the source is longer.
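As a hypothetical illustration of the hazard and the fix (the helper and variable names below are made up; only the 32/64 sizes mirror the tracepoints):

#include <linux/string.h>

/* 'dst' mirrors the 32-byte __entry->name; 'src' stands in for a
 * bdi_dev_name() result of up to 64 bytes.
 */
static void copy_name(char dst[32], const char *src)
{
	/* Unsafe: if strlen(src) >= 32, no NUL is written, and a later
	 * "%s" format walks past dst[31] into whatever follows.
	 */
	strncpy(dst, src, 32);

	/* Safe: always writes a NUL (truncating if needed) and zero-pads
	 * the tail, so the trace record never leaks stale bytes either.
	 */
	strscpy_pad(dst, src, 32);
}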
CC: stable@vger.kernel.org Link: https://lore.kernel.org/r/20201124165205.GA23937@rlk Acked-by: Steven Rostedt (VMware) Acked-by: Tejun Heo Signed-off-by: Hui Su Signed-off-by: Jan Kara --- include/trace/events/writeback.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index e7cbccc7c14c..57d795365987 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -190,7 +190,7 @@ TRACE_EVENT(inode_foreign_history, ), TP_fast_assign( - strncpy(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); + strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); __entry->history = history; @@ -219,7 +219,7 @@ TRACE_EVENT(inode_switch_wbs, ), TP_fast_assign( - strncpy(__entry->name, bdi_dev_name(old_wb->bdi), 32); + strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32); __entry->ino = inode->i_ino; __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb); __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); @@ -252,7 +252,7 @@ TRACE_EVENT(track_foreign_dirty, struct address_space *mapping = page_mapping(page); struct inode *inode = mapping ? mapping->host : NULL; - strncpy(__entry->name, bdi_dev_name(wb->bdi), 32); + strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->bdi_id = wb->bdi->id; __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; @@ -285,7 +285,7 @@ TRACE_EVENT(flush_foreign, ), TP_fast_assign( - strncpy(__entry->name, bdi_dev_name(wb->bdi), 32); + strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->frn_bdi_id = frn_bdi_id; __entry->frn_memcg_id = frn_memcg_id; -- cgit v1.2.3 From 025cc2fb6a4e84e9a0552c0017dcd1c24b7ac7da Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 25 Nov 2020 14:18:10 -0800 Subject: net/tls: Protect from calling tls_dev_del for TLS RX twice tls_device_offload_cleanup_rx doesn't clear tls_ctx->netdev after calling tls_dev_del if TLS TX offload is also enabled. Clearing tls_ctx->netdev gets postponed until tls_device_gc_task. It leaves a time frame when tls_device_down may get called and call tls_dev_del for RX one extra time, confusing the driver, which may lead to a crash. This patch corrects this racy behavior by adding a flag to prevent tls_device_down from calling tls_dev_del the second time. Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Maxim Mikityanskiy Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20201125221810.69870-1-saeedm@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/tls.h | 6 ++++++ net/tls/tls_device.c | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index cf1473099453..2bdd802212fe 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -199,6 +199,12 @@ enum tls_context_flags { * to be atomic. */ TLS_TX_SYNC_SCHED = 1, + /* tls_dev_del was called for the RX side, device state was released, + * but tls_ctx->netdev might still be kept, because TX-side driver + * resources might not be released yet. Used to prevent the second + * tls_dev_del call in tls_device_down if it happens simultaneously.
+ */ + TLS_RX_DEV_CLOSED = 2, }; struct cipher_context { diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 54d3e161d198..a3ab2d3d4e4e 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -1262,6 +1262,8 @@ void tls_device_offload_cleanup_rx(struct sock *sk) if (tls_ctx->tx_conf != TLS_HW) { dev_put(netdev); tls_ctx->netdev = NULL; + } else { + set_bit(TLS_RX_DEV_CLOSED, &tls_ctx->flags); } out: up_read(&device_offload_lock); @@ -1291,7 +1293,8 @@ static int tls_device_down(struct net_device *netdev) if (ctx->tx_conf == TLS_HW) netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX); - if (ctx->rx_conf == TLS_HW) + if (ctx->rx_conf == TLS_HW && + !test_bit(TLS_RX_DEV_CLOSED, &ctx->flags)) netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_RX); WRITE_ONCE(ctx->netdev, NULL); -- cgit v1.2.3 From 4df910620bebb5cfe234af16ac8f6474b60215fd Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 25 Nov 2020 13:22:21 +0800 Subject: mm: memcg: relayout structure mem_cgroup to avoid cache interference 0day reported one -22.7% regression for will-it-scale page_fault2 case [1] on a 4 sockets 144 CPU platform, and bisected it to Waiman's optimization (commit bd0b230fe1) of saving one 'struct page_counter' worth of space in 'struct mem_cgroup'. Initially we thought it was due to the cache alignment change introduced by the patch, but further debugging shows that it is due to some hot data members ('vmstats_local', 'vmstats_percpu', 'vmstats') sitting in 2 adjacent cache lines (cache lines 2N and 2N+1), and when adjacent cache line prefetch is enabled, it triggers an "extended level" of cache false sharing for 2 adjacent cache lines. So exchange the 2 member blocks, while keeping mostly the original cache alignment, which restores and even enhances performance, and saves 64 bytes of space for 'struct mem_cgroup' (from 2880 to 2816, with 0day's default RHEL-8.3 kernel config) [1]. https://lore.kernel.org/lkml/20201102091543.GM31092@shao2-debian/ Fixes: bd0b230fe145 ("mm/memcg: unify swap and memsw page counters") Reported-by: kernel test robot Signed-off-by: Feng Tang Acked-by: Waiman Long Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a80c59af2c60..922a7f600465 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -282,20 +282,6 @@ struct mem_cgroup { MEMCG_PADDING(_pad1_); - /* - * set > 0 if pages under this cgroup are moving to other cgroup. - */ - atomic_t moving_account; - struct task_struct *move_lock_task; - - /* Legacy local VM stats and events */ - struct memcg_vmstats_percpu __percpu *vmstats_local; - - /* Subtree VM stats and events (batched updates) */ - struct memcg_vmstats_percpu __percpu *vmstats_percpu; - - MEMCG_PADDING(_pad2_); - atomic_long_t vmstats[MEMCG_NR_STAT]; atomic_long_t vmevents[NR_VM_EVENT_ITEMS]; @@ -317,6 +303,20 @@ struct mem_cgroup { struct list_head objcg_list; /* list of inherited objcgs */ #endif + MEMCG_PADDING(_pad2_); + + /* + * set > 0 if pages under this cgroup are moving to other cgroup.
+ */ + atomic_t moving_account; + struct task_struct *move_lock_task; + + /* Legacy local VM stats and events */ + struct memcg_vmstats_percpu __percpu *vmstats_local; + + /* Subtree VM stats and events (batched updates) */ + struct memcg_vmstats_percpu __percpu *vmstats_percpu; + #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; struct wb_domain cgwb_domain; -- cgit v1.2.3 From 69929d4c49e182f8526d42c43b37b460d562d3a0 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 24 Nov 2020 07:34:44 -0500 Subject: net: openvswitch: fix TTL decrement action netlink message format Currently, the openvswitch module is not accepting the correctly formatted netlink message for the TTL decrement action. For both setting and getting the dec_ttl action, the actions should be nested in the OVS_DEC_TTL_ATTR_ACTION attribute as mentioned in the openvswitch.h uapi. When the original patch was sent, it was tested with a private OVS userspace implementation. This implementation was unfortunately not upstreamed and reviewed, hence an erroneous version of this patch was sent out. Leaving the patch as-is would cause problems as the kernel module could interpret additional attributes as actions and vice-versa, due to the actions not being encapsulated/nested within the actual attribute, but being concatenated after it. Fixes: 744676e77720 ("openvswitch: add TTL decrement action") Signed-off-by: Eelco Chaudron Link: https://lore.kernel.org/r/160622121495.27296.888010441924340582.stgit@wsfd-netdev64.ntdv.lab.eng.bos.redhat.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/openvswitch.h | 2 ++ net/openvswitch/actions.c | 7 ++-- net/openvswitch/flow_netlink.c | 74 +++++++++++++++++++++++++++++----------- 3 files changed, 60 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 8300cc29dec8..8d16744edc31 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -1058,4 +1058,6 @@ enum ovs_dec_ttl_attr { __OVS_DEC_TTL_ATTR_MAX }; +#define OVS_DEC_TTL_ATTR_MAX (__OVS_DEC_TTL_ATTR_MAX - 1) + #endif /* _LINUX_OPENVSWITCH_H */ diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index b87bfc82f44f..5829a020b81c 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -958,14 +958,13 @@ static int dec_ttl_exception_handler(struct datapath *dp, struct sk_buff *skb, { /* The first attribute is always 'OVS_DEC_TTL_ATTR_ARG'.
*/ struct nlattr *dec_ttl_arg = nla_data(attr); - int rem = nla_len(attr); if (nla_len(dec_ttl_arg)) { - struct nlattr *actions = nla_next(dec_ttl_arg, &rem); + struct nlattr *actions = nla_data(dec_ttl_arg); if (actions) - return clone_execute(dp, skb, key, 0, actions, rem, - last, false); + return clone_execute(dp, skb, key, 0, nla_data(actions), + nla_len(actions), last, false); } consume_skb(skb); return 0; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 9d3e50c4d29f..ec0689ddc635 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2503,28 +2503,42 @@ static int validate_and_copy_dec_ttl(struct net *net, __be16 eth_type, __be16 vlan_tci, u32 mpls_label_count, bool log) { - int start, err; - u32 nested = true; + const struct nlattr *attrs[OVS_DEC_TTL_ATTR_MAX + 1]; + int start, action_start, err, rem; + const struct nlattr *a, *actions; + + memset(attrs, 0, sizeof(attrs)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); - if (!nla_len(attr)) - return ovs_nla_add_action(sfa, OVS_ACTION_ATTR_DEC_TTL, - NULL, 0, log); + /* Ignore unknown attributes to be future proof. */ + if (type > OVS_DEC_TTL_ATTR_MAX) + continue; + + if (!type || attrs[type]) + return -EINVAL; + + attrs[type] = a; + } + + actions = attrs[OVS_DEC_TTL_ATTR_ACTION]; + if (rem || !actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) + return -EINVAL; start = add_nested_action_start(sfa, OVS_ACTION_ATTR_DEC_TTL, log); if (start < 0) return start; - err = ovs_nla_add_action(sfa, OVS_DEC_TTL_ATTR_ACTION, &nested, - sizeof(nested), log); - - if (err) - return err; + action_start = add_nested_action_start(sfa, OVS_DEC_TTL_ATTR_ACTION, log); + if (action_start < 0) + return action_start; - err = __ovs_nla_copy_actions(net, attr, key, sfa, eth_type, + err = __ovs_nla_copy_actions(net, actions, key, sfa, eth_type, vlan_tci, mpls_label_count, log); if (err) return err; + add_nested_action_end(*sfa, action_start); add_nested_action_end(*sfa, start); return 0; } @@ -3487,20 +3501,42 @@ out: static int dec_ttl_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) { - int err = 0, rem = nla_len(attr); - struct nlattr *start; + struct nlattr *start, *action_start; + const struct nlattr *a; + int err = 0, rem; start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_DEC_TTL); - if (!start) return -EMSGSIZE; - err = ovs_nla_put_actions(nla_data(attr), rem, skb); - if (err) - nla_nest_cancel(skb, start); - else - nla_nest_end(skb, start); + nla_for_each_attr(a, nla_data(attr), nla_len(attr), rem) { + switch (nla_type(a)) { + case OVS_DEC_TTL_ATTR_ACTION: + + action_start = nla_nest_start_noflag(skb, OVS_DEC_TTL_ATTR_ACTION); + if (!action_start) { + err = -EMSGSIZE; + goto out; + } + + err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); + if (err) + goto out; + + nla_nest_end(skb, action_start); + break; + default: + /* Ignore all other options to be future compatible */ + break; + } + } + + nla_nest_end(skb, start); + return 0; + +out: + nla_nest_cancel(skb, start); return err; } -- cgit v1.2.3
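To close, here is a hedged sketch of the attribute layout the openvswitch fix above enforces: the inner action list must be nested inside OVS_DEC_TTL_ATTR_ACTION rather than concatenated after it. The serialization below reuses the kernel netlink helpers already seen in the diff; the function name, 'skb', and the inner-action emission are assumed context for illustration, not code from the commit:

static int put_dec_ttl(struct sk_buff *skb, const struct nlattr *inner_actions)
{
	struct nlattr *start, *action_start;

	/* Outer attribute: the dec_ttl action itself. */
	start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_DEC_TTL);
	if (!start)
		return -EMSGSIZE;

	/* Inner attribute: the nested action list lives *inside* it. */
	action_start = nla_nest_start_noflag(skb, OVS_DEC_TTL_ATTR_ACTION);
	if (!action_start) {
		nla_nest_cancel(skb, start);
		return -EMSGSIZE;
	}

	/* ... emit inner_actions here, e.g. via ovs_nla_put_actions() ... */

	nla_nest_end(skb, action_start);	/* close OVS_DEC_TTL_ATTR_ACTION */
	nla_nest_end(skb, start);		/* close OVS_ACTION_ATTR_DEC_TTL */
	return 0;
}

The erroneous format placed the inner actions as siblings of OVS_DEC_TTL_ATTR_ACTION, which is why the old code had to walk the remaining bytes past the first attribute instead of simply descending into it.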