From ee8209fd026b074bb8eb75bece516a338a281b1b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 8 May 2013 11:55:48 +0300 Subject: dma: acpi-dma: parse CSRT to extract additional resources Since we have CSRT only to get additional DMA controller resources, let's get rid of drivers/acpi/csrt.c and move its logic inside ACPI DMA helpers code. Signed-off-by: Andy Shevchenko Signed-off-by: Mika Westerberg Acked-by: Rafael J. Wysocki Signed-off-by: Vinod Koul --- include/linux/acpi_dma.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/acpi_dma.h b/include/linux/acpi_dma.h index d09deabc7bf6..fb0298082916 100644 --- a/include/linux/acpi_dma.h +++ b/include/linux/acpi_dma.h @@ -37,6 +37,8 @@ struct acpi_dma_spec { * @dev: struct device of this controller * @acpi_dma_xlate: callback function to find a suitable channel * @data: private data used by a callback function + * @base_request_line: first supported request line (CSRT) + * @end_request_line: last supported request line (CSRT) */ struct acpi_dma { struct list_head dma_controllers; @@ -44,6 +46,8 @@ struct acpi_dma { struct dma_chan *(*acpi_dma_xlate) (struct acpi_dma_spec *, struct acpi_dma *); void *data; + unsigned short base_request_line; + unsigned short end_request_line; }; /* Used with acpi_dma_simple_xlate() */ -- cgit v1.2.3 From b59cc200ac025aca597fb21862c1c9e667f2eff2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 8 May 2013 11:55:49 +0300 Subject: ACPI / LPSS: register clock device for Lynxpoint DMA properly The DMA controller in Lynxpoint is enumerated as a regular ACPI device now. To work properly it is using the LPSS root clock as a functional clock. That's why we have to register the clock device accordingly to the ACPI ID of the DMA controller. The acpi_lpss.c module is responsible to do the job. This patch also removes hardcoded name of the DMA device in clk-lpt.c and the name of the root clock in acpi_lpss.c. Signed-off-by: Rafael J. Wysocki Signed-off-by: Andy Shevchenko Signed-off-by: Vinod Koul --- drivers/acpi/acpi_lpss.c | 26 ++++++++++++++++++++++---- drivers/clk/x86/clk-lpt.c | 15 +++++++++++---- include/linux/platform_data/clk-lpss.h | 5 +++++ 3 files changed, 38 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c index b1c95422ce74..652fd5ce303c 100644 --- a/drivers/acpi/acpi_lpss.c +++ b/drivers/acpi/acpi_lpss.c @@ -35,11 +35,16 @@ ACPI_MODULE_NAME("acpi_lpss"); struct lpss_device_desc { bool clk_required; - const char *clk_parent; + const char *clkdev_name; bool ltr_required; unsigned int prv_offset; }; +static struct lpss_device_desc lpss_dma_desc = { + .clk_required = true, + .clkdev_name = "hclk", +}; + struct lpss_private_data { void __iomem *mmio_base; resource_size_t mmio_size; @@ -49,7 +54,6 @@ struct lpss_private_data { static struct lpss_device_desc lpt_dev_desc = { .clk_required = true, - .clk_parent = "lpss_clk", .prv_offset = 0x800, .ltr_required = true, }; @@ -60,6 +64,9 @@ static struct lpss_device_desc lpt_sdio_dev_desc = { }; static const struct acpi_device_id acpi_lpss_device_ids[] = { + /* Generic LPSS devices */ + { "INTL9C60", (unsigned long)&lpss_dma_desc }, + /* Lynxpoint LPSS devices */ { "INT33C0", (unsigned long)&lpt_dev_desc }, { "INT33C1", (unsigned long)&lpt_dev_desc }, @@ -91,16 +98,27 @@ static int register_device_clock(struct acpi_device *adev, struct lpss_private_data *pdata) { const struct lpss_device_desc *dev_desc = pdata->dev_desc; + struct lpss_clk_data *clk_data; if (!lpss_clk_dev) lpt_register_clock_device(); - if (!dev_desc->clk_parent || !pdata->mmio_base + clk_data = platform_get_drvdata(lpss_clk_dev); + if (!clk_data) + return -ENODEV; + + if (dev_desc->clkdev_name) { + clk_register_clkdev(clk_data->clk, dev_desc->clkdev_name, + dev_name(&adev->dev)); + return 0; + } + + if (!pdata->mmio_base || pdata->mmio_size < dev_desc->prv_offset + LPSS_CLK_SIZE) return -ENODATA; pdata->clk = clk_register_gate(NULL, dev_name(&adev->dev), - dev_desc->clk_parent, 0, + clk_data->name, 0, pdata->mmio_base + dev_desc->prv_offset, 0, 0, NULL); if (IS_ERR(pdata->clk)) diff --git a/drivers/clk/x86/clk-lpt.c b/drivers/clk/x86/clk-lpt.c index 5cf4f4686406..4f45eee9e33b 100644 --- a/drivers/clk/x86/clk-lpt.c +++ b/drivers/clk/x86/clk-lpt.c @@ -15,22 +15,29 @@ #include #include #include +#include #include #define PRV_CLOCK_PARAMS 0x800 static int lpt_clk_probe(struct platform_device *pdev) { + struct lpss_clk_data *drvdata; struct clk *clk; + drvdata = devm_kzalloc(&pdev->dev, sizeof(*drvdata), GFP_KERNEL); + if (!drvdata) + return -ENOMEM; + /* LPSS free running clock */ - clk = clk_register_fixed_rate(&pdev->dev, "lpss_clk", NULL, CLK_IS_ROOT, - 100000000); + drvdata->name = "lpss_clk"; + clk = clk_register_fixed_rate(&pdev->dev, drvdata->name, NULL, + CLK_IS_ROOT, 100000000); if (IS_ERR(clk)) return PTR_ERR(clk); - /* Shared DMA clock */ - clk_register_clkdev(clk, "hclk", "INTL9C60.0.auto"); + drvdata->clk = clk; + platform_set_drvdata(pdev, drvdata); return 0; } diff --git a/include/linux/platform_data/clk-lpss.h b/include/linux/platform_data/clk-lpss.h index 528e73ce46d2..23901992b9dd 100644 --- a/include/linux/platform_data/clk-lpss.h +++ b/include/linux/platform_data/clk-lpss.h @@ -13,6 +13,11 @@ #ifndef __CLK_LPSS_H #define __CLK_LPSS_H +struct lpss_clk_data { + const char *name; + struct clk *clk; +}; + extern int lpt_clk_init(void); #endif /* __CLK_LPSS_H */ -- cgit v1.2.3 From 755ccb9d577b95b43537cfeb36da59140412f858 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 15 May 2013 08:00:25 +0000 Subject: broadcom: add include guards to include/linux/brcmphy.h include/linux/brcmphy.h is currently not protected against double inclusion, add ifdefs guard to fix that. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index b840a4960282..677b4f01b2d0 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -1,3 +1,6 @@ +#ifndef _LINUX_BRCMPHY_H +#define _LINUX_BRCMPHY_H + #define PHY_ID_BCM50610 0x0143bd60 #define PHY_ID_BCM50610M 0x0143bd70 #define PHY_ID_BCM5241 0x0143bc30 @@ -29,3 +32,5 @@ #define PHY_BRCM_CLEAR_RGMII_MODE 0x00004000 #define PHY_BRCM_DIS_TXCRXC_NOENRGY 0x00008000 #define PHY_BCM_FLAGS_VALID 0x80000000 + +#endif /* _LINUX_BRCMPHY_H */ -- cgit v1.2.3 From b0ce3508b25ea6fa10ae3ca254de1d695b521702 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 May 2013 07:34:53 +0000 Subject: bonding: allow TSO being set on bonding master MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some situations, we need to disable TSO on bonding slaves. bonding device automatically unset TSO in bond_fix_features(), and performance is not good because : 1) We consume more cpu cycles. 2) GSO segmentation has some bugs leading to out of order TCP packets if this segmentation is done before virtual device. This particular problem will be addressed in a separate patch. This patch allows TSO being set/unset on the bonding master, so that GSO segmentation is done after bonding layer. Signed-off-by: Eric Dumazet Cc: Michał Mirosław Cc: Jay Vosburgh Cc: Andy Gospodarek Cc: Maciej Żenczykowski Cc: Tom Herbert Cc: Neal Cardwell Cc: Yuchung Cheng Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 1 + include/linux/netdevice.h | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index d0aade04e49a..449ad9bbe45c 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1362,6 +1362,7 @@ static netdev_features_t bond_fix_features(struct net_device *dev, slave->dev->features, mask); } + features = netdev_add_tso_features(features, mask); out: read_unlock(&bond->lock); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a94a5a0ab122..60584b185a0c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2733,6 +2733,17 @@ static inline netdev_features_t netdev_get_wanted_features( } netdev_features_t netdev_increment_features(netdev_features_t all, netdev_features_t one, netdev_features_t mask); + +/* Allow TSO being used on stacked device : + * Performing the GSO segmentation before last device + * is a performance improvement. + */ +static inline netdev_features_t netdev_add_tso_features(netdev_features_t features, + netdev_features_t mask) +{ + return netdev_increment_features(features, NETIF_F_ALL_TSO, mask); +} + int __netdev_update_features(struct net_device *dev); void netdev_update_features(struct net_device *dev); void netdev_change_features(struct net_device *dev); -- cgit v1.2.3 From d4988d4c733ba0b61cb372edd3d1992d26dd10d3 Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Thu, 9 May 2013 21:24:24 +0200 Subject: bcma: add more core IDs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCIe and ARM CR4 cores were found on 14e4:43b1 AKA BCM4352. Reported-by: Gabriel Thörnblad Signed-off-by: Rafał Miłecki Signed-off-by: John W. Linville --- drivers/bcma/scan.c | 2 ++ include/linux/bcma/bcma.h | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/bcma/scan.c b/drivers/bcma/scan.c index bca9c80056fe..8bffa5c9818c 100644 --- a/drivers/bcma/scan.c +++ b/drivers/bcma/scan.c @@ -84,6 +84,8 @@ static const struct bcma_device_id_name bcma_bcm_device_names[] = { { BCMA_CORE_I2S, "I2S" }, { BCMA_CORE_SDR_DDR1_MEM_CTL, "SDR/DDR1 Memory Controller" }, { BCMA_CORE_SHIM, "SHIM" }, + { BCMA_CORE_PCIE2, "PCIe Gen2" }, + { BCMA_CORE_ARM_CR4, "ARM CR4" }, { BCMA_CORE_DEFAULT, "Default" }, }; diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index f14a98a79c9d..2e34db82a643 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -134,7 +134,10 @@ struct bcma_host_ops { #define BCMA_CORE_I2S 0x834 #define BCMA_CORE_SDR_DDR1_MEM_CTL 0x835 /* SDR/DDR1 memory controller core */ #define BCMA_CORE_SHIM 0x837 /* SHIM component in ubus/6362 */ -#define BCMA_CORE_ARM_CR4 0x83e +#define BCMA_CORE_PHY_AC 0x83B +#define BCMA_CORE_PCIE2 0x83C /* PCI Express Gen2 */ +#define BCMA_CORE_USB30_DEV 0x83D +#define BCMA_CORE_ARM_CR4 0x83E #define BCMA_CORE_DEFAULT 0xFFF #define BCMA_MAX_NR_CORES 16 -- cgit v1.2.3 From 7f18d05a1af75142d4856a91ffd2f1d8eb553c12 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Wed, 15 May 2013 20:18:41 +0530 Subject: SERIAL: OMAP: Remove the slave idle handling from the driver UART IP slave idle handling now taken care by runtime pm backend(hwmod layer) so remove the hackery from the driver. As discussed on the list, in future if dma mode needs to be brought back to this driver, UART sysc handling needs to be updated in framework such a way that no-idle/force idle profile can be supported. Given the broken dma mode for OMAP uarts, its very unlikely. Acked-by: Greg Kroah-Hartman Tested-by: Vaibhav Bedia Tested-by: Sourav Poddar Signed-off-by: Rajendra nayak Signed-off-by: Santosh Shilimkar Reviewed-by: Kevin Hilman Tested-by: Kevin Hilman # OMAP4/Panda Signed-off-by: Paul Walmsley --- drivers/tty/serial/omap-serial.c | 23 ----------------------- include/linux/platform_data/serial-omap.h | 2 -- 2 files changed, 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c index 30d4f7a783cd..f0b9f6b52b32 100644 --- a/drivers/tty/serial/omap-serial.c +++ b/drivers/tty/serial/omap-serial.c @@ -202,26 +202,6 @@ static int serial_omap_get_context_loss_count(struct uart_omap_port *up) return pdata->get_context_loss_count(up->dev); } -static void serial_omap_set_forceidle(struct uart_omap_port *up) -{ - struct omap_uart_port_info *pdata = up->dev->platform_data; - - if (!pdata || !pdata->set_forceidle) - return; - - pdata->set_forceidle(up->dev); -} - -static void serial_omap_set_noidle(struct uart_omap_port *up) -{ - struct omap_uart_port_info *pdata = up->dev->platform_data; - - if (!pdata || !pdata->set_noidle) - return; - - pdata->set_noidle(up->dev); -} - static void serial_omap_enable_wakeup(struct uart_omap_port *up, bool enable) { struct omap_uart_port_info *pdata = up->dev->platform_data; @@ -298,8 +278,6 @@ static void serial_omap_stop_tx(struct uart_port *port) serial_out(up, UART_IER, up->ier); } - serial_omap_set_forceidle(up); - pm_runtime_mark_last_busy(up->dev); pm_runtime_put_autosuspend(up->dev); } @@ -364,7 +342,6 @@ static void serial_omap_start_tx(struct uart_port *port) pm_runtime_get_sync(up->dev); serial_omap_enable_ier_thri(up); - serial_omap_set_noidle(up); pm_runtime_mark_last_busy(up->dev); pm_runtime_put_autosuspend(up->dev); } diff --git a/include/linux/platform_data/serial-omap.h b/include/linux/platform_data/serial-omap.h index ff9b0aab5281..c860c1b314c0 100644 --- a/include/linux/platform_data/serial-omap.h +++ b/include/linux/platform_data/serial-omap.h @@ -43,8 +43,6 @@ struct omap_uart_port_info { int DTR_present; int (*get_context_loss_count)(struct device *); - void (*set_forceidle)(struct device *); - void (*set_noidle)(struct device *); void (*enable_wakeup)(struct device *, bool); }; -- cgit v1.2.3 From 2a7851bffb008ff4882eee673da74718997b4265 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 17 May 2013 03:56:10 +0000 Subject: netfilter: add nf_ipv6_ops hook to fix xt_addrtype with IPv6 Quoting https://bugzilla.netfilter.org/show_bug.cgi?id=812: [ ip6tables -m addrtype ] When I tried to use in the nat/PREROUTING it messes up the routing cache even if the rule didn't matched at all. [..] If I remove the --limit-iface-in from the non-working scenario, so just use the -m addrtype --dst-type LOCAL it works! This happens when LOCAL type matching is requested with --limit-iface-in, and the default ipv6 route is via the interface the packet we test arrived on. Because xt_addrtype uses ip6_route_output, the ipv6 routing implementation creates an unwanted cached entry, and the packet won't make it to the real/expected destination. Silently ignoring --limit-iface-in makes the routing work but it breaks rule matching (--dst-type LOCAL with limit-iface-in is supposed to only match if the dst address is configured on the incoming interface; without --limit-iface-in it will match if the address is reachable via lo). The test should call ipv6_chk_addr() instead. However, this would add a link-time dependency on ipv6. There are two possible solutions: 1) Revert the commit that moved ipt_addrtype to xt_addrtype, and put ipv6 specific code into ip6t_addrtype. 2) add new "nf_ipv6_ops" struct to register pointers to ipv6 functions. While the former might seem preferable, Pablo pointed out that there are more xt modules with link-time dependeny issues regarding ipv6, so lets go for 2). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv6.h | 16 ++++++++++++++++ include/net/addrconf.h | 2 +- net/ipv6/addrconf.c | 2 +- net/ipv6/netfilter.c | 7 +++++++ net/netfilter/core.c | 2 ++ net/netfilter/xt_addrtype.c | 27 ++++++++++++++++----------- 6 files changed, 43 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 98ffb54988b6..2d4df6ce043e 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -17,6 +17,22 @@ extern __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, extern int ipv6_netfilter_init(void); extern void ipv6_netfilter_fini(void); + +/* + * Hook functions for ipv6 to allow xt_* modules to be built-in even + * if IPv6 is a module. + */ +struct nf_ipv6_ops { + int (*chk_addr)(struct net *net, const struct in6_addr *addr, + const struct net_device *dev, int strict); +}; + +extern const struct nf_ipv6_ops __rcu *nf_ipv6_ops; +static inline const struct nf_ipv6_ops *nf_get_ipv6_ops(void) +{ + return rcu_dereference(nf_ipv6_ops); +} + #else /* CONFIG_NETFILTER */ static inline int ipv6_netfilter_init(void) { return 0; } static inline void ipv6_netfilter_fini(void) { return; } diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 84a6440f1f19..21f702704f24 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -65,7 +65,7 @@ extern int addrconf_set_dstaddr(struct net *net, extern int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, - struct net_device *dev, + const struct net_device *dev, int strict); #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d1ab6ab29a55..d1b2d8034b54 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1487,7 +1487,7 @@ static int ipv6_count_addresses(struct inet6_dev *idev) } int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, - struct net_device *dev, int strict) + const struct net_device *dev, int strict) { struct inet6_ifaddr *ifp; unsigned int hash = inet6_addr_hash(addr); diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 72836f40b730..95f3f1da0d7f 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -186,6 +187,10 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, return csum; }; +static const struct nf_ipv6_ops ipv6ops = { + .chk_addr = ipv6_chk_addr, +}; + static const struct nf_afinfo nf_ip6_afinfo = { .family = AF_INET6, .checksum = nf_ip6_checksum, @@ -198,6 +203,7 @@ static const struct nf_afinfo nf_ip6_afinfo = { int __init ipv6_netfilter_init(void) { + RCU_INIT_POINTER(nf_ipv6_ops, &ipv6ops); return nf_register_afinfo(&nf_ip6_afinfo); } @@ -206,5 +212,6 @@ int __init ipv6_netfilter_init(void) */ void ipv6_netfilter_fini(void) { + RCU_INIT_POINTER(nf_ipv6_ops, NULL); nf_unregister_afinfo(&nf_ip6_afinfo); } diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 07c865a31a3d..857ca9f35177 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -30,6 +30,8 @@ static DEFINE_MUTEX(afinfo_mutex); const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly; EXPORT_SYMBOL(nf_afinfo); +const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; +EXPORT_SYMBOL_GPL(nf_ipv6_ops); int nf_register_afinfo(const struct nf_afinfo *afinfo) { diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c index 49c5ff7f6dd6..68ff29f60867 100644 --- a/net/netfilter/xt_addrtype.c +++ b/net/netfilter/xt_addrtype.c @@ -22,6 +22,7 @@ #include #endif +#include #include #include @@ -33,12 +34,12 @@ MODULE_ALIAS("ip6t_addrtype"); #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, - const struct in6_addr *addr) + const struct in6_addr *addr, u16 mask) { const struct nf_afinfo *afinfo; struct flowi6 flow; struct rt6_info *rt; - u32 ret; + u32 ret = 0; int route_err; memset(&flow, 0, sizeof(flow)); @@ -49,12 +50,19 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, rcu_read_lock(); afinfo = nf_get_afinfo(NFPROTO_IPV6); - if (afinfo != NULL) + if (afinfo != NULL) { + const struct nf_ipv6_ops *v6ops; + + if (dev && (mask & XT_ADDRTYPE_LOCAL)) { + v6ops = nf_get_ipv6_ops(); + if (v6ops && v6ops->chk_addr(net, addr, dev, true)) + ret = XT_ADDRTYPE_LOCAL; + } route_err = afinfo->route(net, (struct dst_entry **)&rt, - flowi6_to_flowi(&flow), !!dev); - else + flowi6_to_flowi(&flow), false); + } else { route_err = 1; - + } rcu_read_unlock(); if (route_err) @@ -62,15 +70,12 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev, if (rt->rt6i_flags & RTF_REJECT) ret = XT_ADDRTYPE_UNREACHABLE; - else - ret = 0; - if (rt->rt6i_flags & RTF_LOCAL) + if (dev == NULL && rt->rt6i_flags & RTF_LOCAL) ret |= XT_ADDRTYPE_LOCAL; if (rt->rt6i_flags & RTF_ANYCAST) ret |= XT_ADDRTYPE_ANYCAST; - dst_release(&rt->dst); return ret; } @@ -90,7 +95,7 @@ static bool match_type6(struct net *net, const struct net_device *dev, if ((XT_ADDRTYPE_LOCAL | XT_ADDRTYPE_ANYCAST | XT_ADDRTYPE_UNREACHABLE) & mask) - return !!(mask & match_lookup_rt6(net, dev, addr)); + return !!(mask & match_lookup_rt6(net, dev, addr, mask)); return true; } -- cgit v1.2.3 From 7805d000db30a3787a4c969bab6ae4d8a5fd8ce6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 24 May 2013 10:50:24 +0900 Subject: cgroup: fix a subtle bug in descendant pre-order walk When cgroup_next_descendant_pre() initiates a walk, it checks whether the subtree root doesn't have any children and if not returns NULL. Later code assumes that the subtree isn't empty. This is broken because the subtree may become empty inbetween, which can lead to the traversal escaping the subtree by walking to the sibling of the subtree root. There's no reason to have the early exit path. Remove it along with the later assumption that the subtree isn't empty. This simplifies the code a bit and fixes the subtle bug. While at it, fix the comment of cgroup_for_each_descendant_pre() which was incorrectly referring to ->css_offline() instead of ->css_online(). Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Cc: stable@vger.kernel.org --- include/linux/cgroup.h | 2 +- kernel/cgroup.c | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5047355b9a0f..8bda1294c035 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -707,7 +707,7 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos); * * If a subsystem synchronizes against the parent in its ->css_online() and * before starting iterating, and synchronizes against @pos on each - * iteration, any descendant cgroup which finished ->css_offline() is + * iteration, any descendant cgroup which finished ->css_online() is * guaranteed to be visible in the future iterations. * * In other words, the following guarantees that a descendant can't escape diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 38b136553044..31e9ef319070 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2954,11 +2954,8 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, WARN_ON_ONCE(!rcu_read_lock_held()); /* if first iteration, pretend we just visited @cgroup */ - if (!pos) { - if (list_empty(&cgroup->children)) - return NULL; + if (!pos) pos = cgroup; - } /* visit the first child if exists */ next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); @@ -2966,14 +2963,14 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, return next; /* no child, visit my or the closest ancestor's next sibling */ - do { + while (pos != cgroup) { next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); if (&next->sibling != &pos->parent->children) return next; pos = pos->parent; - } while (pos != cgroup); + } return NULL; } -- cgit v1.2.3 From a11650e11093ed57dca78bf16e7836517c599098 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Fri, 24 May 2013 15:55:05 -0700 Subject: rapidio: make enumeration/discovery configurable Systems that use RapidIO fabric may need to implement their own enumeration and discovery methods which are better suitable for needs of a target application. The following set of patches is intended to simplify process of introduction of new RapidIO fabric enumeration/discovery methods. The first patch offers ability to add new RapidIO enumeration/discovery methods using kernel configuration options. This new configuration option mechanism allows to select statically linked or modular enumeration/discovery method(s) from the list of existing methods or use external module(s). This patch also updates the currently existing enumeration/discovery code to be used as a statically linked or modular method. The corresponding configuration option is named "Basic enumeration/discovery" method. This is the only one configuration option available today but new methods are expected to be introduced after adoption of provided patches. The second patch address a long time complaint of RapidIO subsystem users regarding fabric enumeration/discovery start sequence. Existing implementation offers only a boot-time enumeration/discovery start which requires synchronized boot of all endpoints in RapidIO network. While it works for small closed configurations with limited number of endpoints, using this approach in systems with large number of endpoints is quite challenging. To eliminate requirement for synchronized start the second patch introduces RapidIO enumeration/discovery start from user space. For compatibility with the existing RapidIO subsystem implementation, automatic boot time enumeration/discovery start can be configured in by specifying "rio-scan.scan=1" command line parameter if statically linked basic enumeration method is selected. This patch: Rework to implement RapidIO enumeration/discovery method selection combined with ability to use enumeration/discovery as a kernel module. This patch adds ability to introduce new RapidIO enumeration/discovery methods using kernel configuration options. Configuration option mechanism allows to select statically linked or modular enumeration/discovery method from the list of existing methods or use external modules. If a modular enumeration/discovery is selected each RapidIO mport device can have its own method attached to it. The existing enumeration/discovery code was updated to be used as statically linked or modular method. This configuration option is named "Basic enumeration/discovery" method. Several common routines have been moved from rio-scan.c to make them available to other enumeration methods and reduce number of exported symbols. Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Li Yang Cc: Kumar Gala Cc: Andre van Herk Cc: Micha Nelissen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/Kconfig | 20 ++++ drivers/rapidio/Makefile | 3 +- drivers/rapidio/rio-driver.c | 7 ++ drivers/rapidio/rio-scan.c | 166 ++++++++------------------------ drivers/rapidio/rio.c | 222 +++++++++++++++++++++++++++++++++++++++++-- drivers/rapidio/rio.h | 11 ++- include/linux/rio.h | 13 ++- include/linux/rio_drv.h | 1 + 8 files changed, 304 insertions(+), 139 deletions(-) (limited to 'include/linux') diff --git a/drivers/rapidio/Kconfig b/drivers/rapidio/Kconfig index 6194d35ebb97..5ab056494bbe 100644 --- a/drivers/rapidio/Kconfig +++ b/drivers/rapidio/Kconfig @@ -47,4 +47,24 @@ config RAPIDIO_DEBUG If you are unsure about this, say N here. +choice + prompt "Enumeration method" + depends on RAPIDIO + default RAPIDIO_ENUM_BASIC + help + There are different enumeration and discovery mechanisms offered + for RapidIO subsystem. You may select single built-in method or + or any number of methods to be built as modules. + Selecting a built-in method disables use of loadable methods. + + If unsure, select Basic built-in. + +config RAPIDIO_ENUM_BASIC + tristate "Basic" + help + This option includes basic RapidIO fabric enumeration and discovery + mechanism similar to one described in RapidIO specification Annex 1. + +endchoice + source "drivers/rapidio/switches/Kconfig" diff --git a/drivers/rapidio/Makefile b/drivers/rapidio/Makefile index ec3fb8121004..3036702ffe8b 100644 --- a/drivers/rapidio/Makefile +++ b/drivers/rapidio/Makefile @@ -1,7 +1,8 @@ # # Makefile for RapidIO interconnect services # -obj-y += rio.o rio-access.o rio-driver.o rio-scan.o rio-sysfs.o +obj-y += rio.o rio-access.o rio-driver.o rio-sysfs.o +obj-$(CONFIG_RAPIDIO_ENUM_BASIC) += rio-scan.o obj-$(CONFIG_RAPIDIO) += switches/ obj-$(CONFIG_RAPIDIO) += devices/ diff --git a/drivers/rapidio/rio-driver.c b/drivers/rapidio/rio-driver.c index 0f4a53bdaa3c..55850bb21480 100644 --- a/drivers/rapidio/rio-driver.c +++ b/drivers/rapidio/rio-driver.c @@ -164,6 +164,13 @@ void rio_unregister_driver(struct rio_driver *rdrv) driver_unregister(&rdrv->driver); } +void rio_attach_device(struct rio_dev *rdev) +{ + rdev->dev.bus = &rio_bus_type; + rdev->dev.parent = &rio_bus; +} +EXPORT_SYMBOL_GPL(rio_attach_device); + /** * rio_match_bus - Tell if a RIO device structure has a matching RIO driver device id structure * @dev: the standard device structure to match against diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index a965acd3c0e4..7bdc67419cc3 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -37,12 +37,8 @@ #include "rio.h" -LIST_HEAD(rio_devices); - static void rio_init_em(struct rio_dev *rdev); -DEFINE_SPINLOCK(rio_global_list_lock); - static int next_destid = 0; static int next_comptag = 1; @@ -326,127 +322,6 @@ static int rio_is_switch(struct rio_dev *rdev) return 0; } -/** - * rio_switch_init - Sets switch operations for a particular vendor switch - * @rdev: RIO device - * @do_enum: Enumeration/Discovery mode flag - * - * Searches the RIO switch ops table for known switch types. If the vid - * and did match a switch table entry, then call switch initialization - * routine to setup switch-specific routines. - */ -static void rio_switch_init(struct rio_dev *rdev, int do_enum) -{ - struct rio_switch_ops *cur = __start_rio_switch_ops; - struct rio_switch_ops *end = __end_rio_switch_ops; - - while (cur < end) { - if ((cur->vid == rdev->vid) && (cur->did == rdev->did)) { - pr_debug("RIO: calling init routine for %s\n", - rio_name(rdev)); - cur->init_hook(rdev, do_enum); - break; - } - cur++; - } - - if ((cur >= end) && (rdev->pef & RIO_PEF_STD_RT)) { - pr_debug("RIO: adding STD routing ops for %s\n", - rio_name(rdev)); - rdev->rswitch->add_entry = rio_std_route_add_entry; - rdev->rswitch->get_entry = rio_std_route_get_entry; - rdev->rswitch->clr_table = rio_std_route_clr_table; - } - - if (!rdev->rswitch->add_entry || !rdev->rswitch->get_entry) - printk(KERN_ERR "RIO: missing routing ops for %s\n", - rio_name(rdev)); -} - -/** - * rio_add_device- Adds a RIO device to the device model - * @rdev: RIO device - * - * Adds the RIO device to the global device list and adds the RIO - * device to the RIO device list. Creates the generic sysfs nodes - * for an RIO device. - */ -static int rio_add_device(struct rio_dev *rdev) -{ - int err; - - err = device_add(&rdev->dev); - if (err) - return err; - - spin_lock(&rio_global_list_lock); - list_add_tail(&rdev->global_list, &rio_devices); - spin_unlock(&rio_global_list_lock); - - rio_create_sysfs_dev_files(rdev); - - return 0; -} - -/** - * rio_enable_rx_tx_port - enable input receiver and output transmitter of - * given port - * @port: Master port associated with the RIO network - * @local: local=1 select local port otherwise a far device is reached - * @destid: Destination ID of the device to check host bit - * @hopcount: Number of hops to reach the target - * @port_num: Port (-number on switch) to enable on a far end device - * - * Returns 0 or 1 from on General Control Command and Status Register - * (EXT_PTR+0x3C) - */ -inline int rio_enable_rx_tx_port(struct rio_mport *port, - int local, u16 destid, - u8 hopcount, u8 port_num) { -#ifdef CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS - u32 regval; - u32 ext_ftr_ptr; - - /* - * enable rx input tx output port - */ - pr_debug("rio_enable_rx_tx_port(local = %d, destid = %d, hopcount = " - "%d, port_num = %d)\n", local, destid, hopcount, port_num); - - ext_ftr_ptr = rio_mport_get_physefb(port, local, destid, hopcount); - - if (local) { - rio_local_read_config_32(port, ext_ftr_ptr + - RIO_PORT_N_CTL_CSR(0), - ®val); - } else { - if (rio_mport_read_config_32(port, destid, hopcount, - ext_ftr_ptr + RIO_PORT_N_CTL_CSR(port_num), ®val) < 0) - return -EIO; - } - - if (regval & RIO_PORT_N_CTL_P_TYP_SER) { - /* serial */ - regval = regval | RIO_PORT_N_CTL_EN_RX_SER - | RIO_PORT_N_CTL_EN_TX_SER; - } else { - /* parallel */ - regval = regval | RIO_PORT_N_CTL_EN_RX_PAR - | RIO_PORT_N_CTL_EN_TX_PAR; - } - - if (local) { - rio_local_write_config_32(port, ext_ftr_ptr + - RIO_PORT_N_CTL_CSR(0), regval); - } else { - if (rio_mport_write_config_32(port, destid, hopcount, - ext_ftr_ptr + RIO_PORT_N_CTL_CSR(port_num), regval) < 0) - return -EIO; - } -#endif - return 0; -} - /** * rio_setup_device- Allocates and sets up a RIO device * @net: RIO network @@ -587,8 +462,7 @@ static struct rio_dev *rio_setup_device(struct rio_net *net, rdev->destid); } - rdev->dev.bus = &rio_bus_type; - rdev->dev.parent = &rio_bus; + rio_attach_device(rdev); device_initialize(&rdev->dev); rdev->dev.release = rio_release_dev; @@ -1421,3 +1295,41 @@ enum_done: bail: return -EBUSY; } + +static struct rio_scan rio_scan_ops = { + .enumerate = rio_enum_mport, + .discover = rio_disc_mport, +}; + +static bool scan; +module_param(scan, bool, 0); +MODULE_PARM_DESC(scan, "Start RapidIO network enumeration/discovery " + "(default = 0)"); + +/** + * rio_basic_attach: + * + * When this enumeration/discovery method is loaded as a module this function + * registers its specific enumeration and discover routines for all available + * RapidIO mport devices. The "scan" command line parameter controls ability of + * the module to start RapidIO enumeration/discovery automatically. + * + * Returns 0 for success or -EIO if unable to register itself. + * + * This enumeration/discovery method cannot be unloaded and therefore does not + * provide a matching cleanup_module routine. + */ + +static int __init rio_basic_attach(void) +{ + if (rio_register_scan(RIO_MPORT_ANY, &rio_scan_ops)) + return -EIO; + if (scan) + rio_init_mports(); + return 0; +} + +late_initcall(rio_basic_attach); + +MODULE_DESCRIPTION("Basic RapidIO enumeration/discovery"); +MODULE_LICENSE("GPL"); diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index d553b5d13722..6e75dda34799 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -31,7 +31,11 @@ #include "rio.h" +static LIST_HEAD(rio_devices); +static DEFINE_SPINLOCK(rio_global_list_lock); + static LIST_HEAD(rio_mports); +static DEFINE_MUTEX(rio_mport_list_lock); static unsigned char next_portid; static DEFINE_SPINLOCK(rio_mmap_lock); @@ -52,6 +56,32 @@ u16 rio_local_get_device_id(struct rio_mport *port) return (RIO_GET_DID(port->sys_size, result)); } +/** + * rio_add_device- Adds a RIO device to the device model + * @rdev: RIO device + * + * Adds the RIO device to the global device list and adds the RIO + * device to the RIO device list. Creates the generic sysfs nodes + * for an RIO device. + */ +int rio_add_device(struct rio_dev *rdev) +{ + int err; + + err = device_add(&rdev->dev); + if (err) + return err; + + spin_lock(&rio_global_list_lock); + list_add_tail(&rdev->global_list, &rio_devices); + spin_unlock(&rio_global_list_lock); + + rio_create_sysfs_dev_files(rdev); + + return 0; +} +EXPORT_SYMBOL_GPL(rio_add_device); + /** * rio_request_inb_mbox - request inbound mailbox service * @mport: RIO master port from which to allocate the mailbox resource @@ -489,6 +519,7 @@ rio_mport_get_physefb(struct rio_mport *port, int local, return ext_ftr_ptr; } +EXPORT_SYMBOL_GPL(rio_mport_get_physefb); /** * rio_get_comptag - Begin or continue searching for a RIO device by component tag @@ -521,6 +552,7 @@ exit: spin_unlock(&rio_global_list_lock); return rdev; } +EXPORT_SYMBOL_GPL(rio_get_comptag); /** * rio_set_port_lockout - Sets/clears LOCKOUT bit (RIO EM 1.3) for a switch port. @@ -545,6 +577,107 @@ int rio_set_port_lockout(struct rio_dev *rdev, u32 pnum, int lock) regval); return 0; } +EXPORT_SYMBOL_GPL(rio_set_port_lockout); + +/** + * rio_switch_init - Sets switch operations for a particular vendor switch + * @rdev: RIO device + * @do_enum: Enumeration/Discovery mode flag + * + * Searches the RIO switch ops table for known switch types. If the vid + * and did match a switch table entry, then call switch initialization + * routine to setup switch-specific routines. + */ +void rio_switch_init(struct rio_dev *rdev, int do_enum) +{ + struct rio_switch_ops *cur = __start_rio_switch_ops; + struct rio_switch_ops *end = __end_rio_switch_ops; + + while (cur < end) { + if ((cur->vid == rdev->vid) && (cur->did == rdev->did)) { + pr_debug("RIO: calling init routine for %s\n", + rio_name(rdev)); + cur->init_hook(rdev, do_enum); + break; + } + cur++; + } + + if ((cur >= end) && (rdev->pef & RIO_PEF_STD_RT)) { + pr_debug("RIO: adding STD routing ops for %s\n", + rio_name(rdev)); + rdev->rswitch->add_entry = rio_std_route_add_entry; + rdev->rswitch->get_entry = rio_std_route_get_entry; + rdev->rswitch->clr_table = rio_std_route_clr_table; + } + + if (!rdev->rswitch->add_entry || !rdev->rswitch->get_entry) + printk(KERN_ERR "RIO: missing routing ops for %s\n", + rio_name(rdev)); +} +EXPORT_SYMBOL_GPL(rio_switch_init); + +/** + * rio_enable_rx_tx_port - enable input receiver and output transmitter of + * given port + * @port: Master port associated with the RIO network + * @local: local=1 select local port otherwise a far device is reached + * @destid: Destination ID of the device to check host bit + * @hopcount: Number of hops to reach the target + * @port_num: Port (-number on switch) to enable on a far end device + * + * Returns 0 or 1 from on General Control Command and Status Register + * (EXT_PTR+0x3C) + */ +int rio_enable_rx_tx_port(struct rio_mport *port, + int local, u16 destid, + u8 hopcount, u8 port_num) +{ +#ifdef CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS + u32 regval; + u32 ext_ftr_ptr; + + /* + * enable rx input tx output port + */ + pr_debug("rio_enable_rx_tx_port(local = %d, destid = %d, hopcount = " + "%d, port_num = %d)\n", local, destid, hopcount, port_num); + + ext_ftr_ptr = rio_mport_get_physefb(port, local, destid, hopcount); + + if (local) { + rio_local_read_config_32(port, ext_ftr_ptr + + RIO_PORT_N_CTL_CSR(0), + ®val); + } else { + if (rio_mport_read_config_32(port, destid, hopcount, + ext_ftr_ptr + RIO_PORT_N_CTL_CSR(port_num), ®val) < 0) + return -EIO; + } + + if (regval & RIO_PORT_N_CTL_P_TYP_SER) { + /* serial */ + regval = regval | RIO_PORT_N_CTL_EN_RX_SER + | RIO_PORT_N_CTL_EN_TX_SER; + } else { + /* parallel */ + regval = regval | RIO_PORT_N_CTL_EN_RX_PAR + | RIO_PORT_N_CTL_EN_TX_PAR; + } + + if (local) { + rio_local_write_config_32(port, ext_ftr_ptr + + RIO_PORT_N_CTL_CSR(0), regval); + } else { + if (rio_mport_write_config_32(port, destid, hopcount, + ext_ftr_ptr + RIO_PORT_N_CTL_CSR(port_num), regval) < 0) + return -EIO; + } +#endif + return 0; +} +EXPORT_SYMBOL_GPL(rio_enable_rx_tx_port); + /** * rio_chk_dev_route - Validate route to the specified device. @@ -610,6 +743,7 @@ rio_mport_chk_dev_access(struct rio_mport *mport, u16 destid, u8 hopcount) return 0; } +EXPORT_SYMBOL_GPL(rio_mport_chk_dev_access); /** * rio_chk_dev_access - Validate access to the specified device. @@ -941,6 +1075,7 @@ rio_mport_get_efb(struct rio_mport *port, int local, u16 destid, return RIO_GET_BLOCK_ID(reg_val); } } +EXPORT_SYMBOL_GPL(rio_mport_get_efb); /** * rio_mport_get_feature - query for devices' extended features @@ -997,6 +1132,7 @@ rio_mport_get_feature(struct rio_mport * port, int local, u16 destid, return 0; } +EXPORT_SYMBOL_GPL(rio_mport_get_feature); /** * rio_get_asm - Begin or continue searching for a RIO device by vid/did/asm_vid/asm_did @@ -1246,6 +1382,71 @@ EXPORT_SYMBOL_GPL(rio_dma_prep_slave_sg); #endif /* CONFIG_RAPIDIO_DMA_ENGINE */ +/** + * rio_register_scan - enumeration/discovery method registration interface + * @mport_id: mport device ID for which fabric scan routine has to be set + * (RIO_MPORT_ANY = set for all available mports) + * @scan_ops: enumeration/discovery control structure + * + * Assigns enumeration or discovery method to the specified mport device (or all + * available mports if RIO_MPORT_ANY is specified). + * Returns error if the mport already has an enumerator attached to it. + * In case of RIO_MPORT_ANY ignores ports with valid scan routines and returns + * an error if was unable to find at least one available mport. + */ +int rio_register_scan(int mport_id, struct rio_scan *scan_ops) +{ + struct rio_mport *port; + int rc = -EBUSY; + + mutex_lock(&rio_mport_list_lock); + list_for_each_entry(port, &rio_mports, node) { + if (port->id == mport_id || mport_id == RIO_MPORT_ANY) { + if (port->nscan && mport_id == RIO_MPORT_ANY) + continue; + else if (port->nscan) + break; + + port->nscan = scan_ops; + rc = 0; + + if (mport_id != RIO_MPORT_ANY) + break; + } + } + mutex_unlock(&rio_mport_list_lock); + + return rc; +} +EXPORT_SYMBOL_GPL(rio_register_scan); + +/** + * rio_unregister_scan - removes enumeration/discovery method from mport + * @mport_id: mport device ID for which fabric scan routine has to be + * unregistered (RIO_MPORT_ANY = set for all available mports) + * + * Removes enumeration or discovery method assigned to the specified mport + * device (or all available mports if RIO_MPORT_ANY is specified). + */ +int rio_unregister_scan(int mport_id) +{ + struct rio_mport *port; + + mutex_lock(&rio_mport_list_lock); + list_for_each_entry(port, &rio_mports, node) { + if (port->id == mport_id || mport_id == RIO_MPORT_ANY) { + if (port->nscan) + port->nscan = NULL; + if (mport_id != RIO_MPORT_ANY) + break; + } + } + mutex_unlock(&rio_mport_list_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(rio_unregister_scan); + static void rio_fixup_device(struct rio_dev *dev) { } @@ -1274,7 +1475,7 @@ static void disc_work_handler(struct work_struct *_work) work = container_of(_work, struct rio_disc_work, work); pr_debug("RIO: discovery work for mport %d %s\n", work->mport->id, work->mport->name); - rio_disc_mport(work->mport); + work->mport->nscan->discover(work->mport); } int rio_init_mports(void) @@ -1290,12 +1491,15 @@ int rio_init_mports(void) * First, run enumerations and check if we need to perform discovery * on any of the registered mports. */ + mutex_lock(&rio_mport_list_lock); list_for_each_entry(port, &rio_mports, node) { - if (port->host_deviceid >= 0) - rio_enum_mport(port); - else + if (port->host_deviceid >= 0) { + if (port->nscan) + port->nscan->enumerate(port); + } else n++; } + mutex_unlock(&rio_mport_list_lock); if (!n) goto no_disc; @@ -1322,14 +1526,16 @@ int rio_init_mports(void) } n = 0; + mutex_lock(&rio_mport_list_lock); list_for_each_entry(port, &rio_mports, node) { - if (port->host_deviceid < 0) { + if (port->host_deviceid < 0 && port->nscan) { work[n].mport = port; INIT_WORK(&work[n].work, disc_work_handler); queue_work(rio_wq, &work[n].work); n++; } } + mutex_unlock(&rio_mport_list_lock); flush_workqueue(rio_wq); pr_debug("RIO: destroy discovery workqueue\n"); @@ -1342,8 +1548,6 @@ no_disc: return 0; } -device_initcall_sync(rio_init_mports); - static int hdids[RIO_MAX_MPORTS + 1]; static int rio_get_hdid(int index) @@ -1371,7 +1575,10 @@ int rio_register_mport(struct rio_mport *port) port->id = next_portid++; port->host_deviceid = rio_get_hdid(port->id); + port->nscan = NULL; + mutex_lock(&rio_mport_list_lock); list_add_tail(&port->node, &rio_mports); + mutex_unlock(&rio_mport_list_lock); return 0; } @@ -1386,3 +1593,4 @@ EXPORT_SYMBOL_GPL(rio_request_inb_mbox); EXPORT_SYMBOL_GPL(rio_release_inb_mbox); EXPORT_SYMBOL_GPL(rio_request_outb_mbox); EXPORT_SYMBOL_GPL(rio_release_outb_mbox); +EXPORT_SYMBOL_GPL(rio_init_mports); diff --git a/drivers/rapidio/rio.h b/drivers/rapidio/rio.h index b1af414f15e6..0afdf482517e 100644 --- a/drivers/rapidio/rio.h +++ b/drivers/rapidio/rio.h @@ -15,6 +15,7 @@ #include #define RIO_MAX_CHK_RETRY 3 +#define RIO_MPORT_ANY (-1) /* Functions internal to the RIO core code */ @@ -27,8 +28,6 @@ extern u32 rio_mport_get_efb(struct rio_mport *port, int local, u16 destid, extern int rio_mport_chk_dev_access(struct rio_mport *mport, u16 destid, u8 hopcount); extern int rio_create_sysfs_dev_files(struct rio_dev *rdev); -extern int rio_enum_mport(struct rio_mport *mport); -extern int rio_disc_mport(struct rio_mport *mport); extern int rio_std_route_add_entry(struct rio_mport *mport, u16 destid, u8 hopcount, u16 table, u16 route_destid, u8 route_port); @@ -39,10 +38,16 @@ extern int rio_std_route_clr_table(struct rio_mport *mport, u16 destid, u8 hopcount, u16 table); extern int rio_set_port_lockout(struct rio_dev *rdev, u32 pnum, int lock); extern struct rio_dev *rio_get_comptag(u32 comp_tag, struct rio_dev *from); +extern int rio_add_device(struct rio_dev *rdev); +extern void rio_switch_init(struct rio_dev *rdev, int do_enum); +extern int rio_enable_rx_tx_port(struct rio_mport *port, int local, u16 destid, + u8 hopcount, u8 port_num); +extern int rio_register_scan(int mport_id, struct rio_scan *scan_ops); +extern int rio_unregister_scan(int mport_id); +extern void rio_attach_device(struct rio_dev *rdev); /* Structures internal to the RIO core code */ extern struct device_attribute rio_dev_attrs[]; -extern spinlock_t rio_global_list_lock; extern struct rio_switch_ops __start_rio_switch_ops[]; extern struct rio_switch_ops __end_rio_switch_ops[]; diff --git a/include/linux/rio.h b/include/linux/rio.h index a3e784278667..cd3796ee7410 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -83,7 +83,6 @@ extern struct bus_type rio_bus_type; extern struct device rio_bus; -extern struct list_head rio_devices; /* list of all devices */ struct rio_mport; struct rio_dev; @@ -237,6 +236,7 @@ enum rio_phy_type { * @name: Port name string * @priv: Master port private data * @dma: DMA device associated with mport + * @nscan: RapidIO network enumeration/discovery operations */ struct rio_mport { struct list_head dbells; /* list of doorbell events */ @@ -262,6 +262,7 @@ struct rio_mport { #ifdef CONFIG_RAPIDIO_DMA_ENGINE struct dma_device dma; #endif + struct rio_scan *nscan; }; struct rio_id_table { @@ -460,6 +461,16 @@ static inline struct rio_mport *dma_to_mport(struct dma_device *ddev) } #endif /* CONFIG_RAPIDIO_DMA_ENGINE */ +/** + * struct rio_scan - RIO enumeration and discovery operations + * @enumerate: Callback to perform RapidIO fabric enumeration. + * @discover: Callback to perform RapidIO fabric discovery. + */ +struct rio_scan { + int (*enumerate)(struct rio_mport *mport); + int (*discover)(struct rio_mport *mport); +}; + /* Architecture and hardware-specific functions */ extern int rio_register_mport(struct rio_mport *); extern int rio_open_inb_mbox(struct rio_mport *, void *, int, int); diff --git a/include/linux/rio_drv.h b/include/linux/rio_drv.h index b75c05920ab5..5059994fe297 100644 --- a/include/linux/rio_drv.h +++ b/include/linux/rio_drv.h @@ -433,5 +433,6 @@ extern u16 rio_local_get_device_id(struct rio_mport *port); extern struct rio_dev *rio_get_device(u16 vid, u16 did, struct rio_dev *from); extern struct rio_dev *rio_get_asm(u16 vid, u16 did, u16 asm_vid, u16 asm_did, struct rio_dev *from); +extern int rio_init_mports(void); #endif /* LINUX_RIO_DRV_H */ -- cgit v1.2.3 From bc8fcfea18249640f2728c46d70999dcb7e4dc49 Mon Sep 17 00:00:00 2001 From: Alexandre Bounine Date: Fri, 24 May 2013 15:55:06 -0700 Subject: rapidio: add enumeration/discovery start from user space Add RapidIO enumeration/discovery start from user space. User space start allows to defer RapidIO fabric scan until the moment when all participating endpoints are initialized avoiding mandatory synchronized start of all endpoints (which may be challenging in systems with large number of RapidIO endpoints). Signed-off-by: Alexandre Bounine Cc: Matt Porter Cc: Li Yang Cc: Kumar Gala Cc: Andre van Herk Cc: Micha Nelissen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rapidio/rio-driver.c | 1 + drivers/rapidio/rio-scan.c | 24 ++++++++++++++++++++--- drivers/rapidio/rio-sysfs.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ drivers/rapidio/rio.c | 28 +++++++++++++++++++++++++-- drivers/rapidio/rio.h | 2 ++ include/linux/rio.h | 9 +++++++-- 6 files changed, 102 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/rapidio/rio-driver.c b/drivers/rapidio/rio-driver.c index 55850bb21480..a0c875563d76 100644 --- a/drivers/rapidio/rio-driver.c +++ b/drivers/rapidio/rio-driver.c @@ -207,6 +207,7 @@ struct bus_type rio_bus_type = { .name = "rapidio", .match = rio_match_bus, .dev_attrs = rio_dev_attrs, + .bus_attrs = rio_bus_attrs, .probe = rio_device_probe, .remove = rio_device_remove, }; diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index 7bdc67419cc3..4c15dbf81087 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -1134,19 +1134,30 @@ static void rio_pw_enable(struct rio_mport *port, int enable) /** * rio_enum_mport- Start enumeration through a master port * @mport: Master port to send transactions + * @flags: Enumeration control flags * * Starts the enumeration process. If somebody has enumerated our * master port device, then give up. If not and we have an active * link, then start recursive peer enumeration. Returns %0 if * enumeration succeeds or %-EBUSY if enumeration fails. */ -int rio_enum_mport(struct rio_mport *mport) +int rio_enum_mport(struct rio_mport *mport, u32 flags) { struct rio_net *net = NULL; int rc = 0; printk(KERN_INFO "RIO: enumerate master port %d, %s\n", mport->id, mport->name); + + /* + * To avoid multiple start requests (repeat enumeration is not supported + * by this method) check if enumeration/discovery was performed for this + * mport: if mport was added into the list of mports for a net exit + * with error. + */ + if (mport->nnode.next || mport->nnode.prev) + return -EBUSY; + /* If somebody else enumerated our master port device, bail. */ if (rio_enum_host(mport) < 0) { printk(KERN_INFO @@ -1236,14 +1247,16 @@ static void rio_build_route_tables(struct rio_net *net) /** * rio_disc_mport- Start discovery through a master port * @mport: Master port to send transactions + * @flags: discovery control flags * * Starts the discovery process. If we have an active link, - * then wait for the signal that enumeration is complete. + * then wait for the signal that enumeration is complete (if wait + * is allowed). * When enumeration completion is signaled, start recursive * peer discovery. Returns %0 if discovery succeeds or %-EBUSY * on failure. */ -int rio_disc_mport(struct rio_mport *mport) +int rio_disc_mport(struct rio_mport *mport, u32 flags) { struct rio_net *net = NULL; unsigned long to_end; @@ -1253,6 +1266,11 @@ int rio_disc_mport(struct rio_mport *mport) /* If master port has an active link, allocate net and discover peers */ if (rio_mport_is_active(mport)) { + if (rio_enum_complete(mport)) + goto enum_done; + else if (flags & RIO_SCAN_ENUM_NO_WAIT) + return -EAGAIN; + pr_debug("RIO: wait for enumeration to complete...\n"); to_end = jiffies + CONFIG_RAPIDIO_DISC_TIMEOUT * HZ; diff --git a/drivers/rapidio/rio-sysfs.c b/drivers/rapidio/rio-sysfs.c index 4dbe360989be..66d4acd5e18f 100644 --- a/drivers/rapidio/rio-sysfs.c +++ b/drivers/rapidio/rio-sysfs.c @@ -285,3 +285,48 @@ void rio_remove_sysfs_dev_files(struct rio_dev *rdev) rdev->rswitch->sw_sysfs(rdev, RIO_SW_SYSFS_REMOVE); } } + +static ssize_t bus_scan_store(struct bus_type *bus, const char *buf, + size_t count) +{ + long val; + struct rio_mport *port = NULL; + int rc; + + if (kstrtol(buf, 0, &val) < 0) + return -EINVAL; + + if (val == RIO_MPORT_ANY) { + rc = rio_init_mports(); + goto exit; + } + + if (val < 0 || val >= RIO_MAX_MPORTS) + return -EINVAL; + + port = rio_find_mport((int)val); + + if (!port) { + pr_debug("RIO: %s: mport_%d not available\n", + __func__, (int)val); + return -EINVAL; + } + + if (!port->nscan) + return -EINVAL; + + if (port->host_deviceid >= 0) + rc = port->nscan->enumerate(port, 0); + else + rc = port->nscan->discover(port, RIO_SCAN_ENUM_NO_WAIT); +exit: + if (!rc) + rc = count; + + return rc; +} + +struct bus_attribute rio_bus_attrs[] = { + __ATTR(scan, (S_IWUSR|S_IWGRP), NULL, bus_scan_store), + __ATTR_NULL +}; diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c index 6e75dda34799..cb1c08996fbb 100644 --- a/drivers/rapidio/rio.c +++ b/drivers/rapidio/rio.c @@ -1382,6 +1382,30 @@ EXPORT_SYMBOL_GPL(rio_dma_prep_slave_sg); #endif /* CONFIG_RAPIDIO_DMA_ENGINE */ +/** + * rio_find_mport - find RIO mport by its ID + * @mport_id: number (ID) of mport device + * + * Given a RIO mport number, the desired mport is located + * in the global list of mports. If the mport is found, a pointer to its + * data structure is returned. If no mport is found, %NULL is returned. + */ +struct rio_mport *rio_find_mport(int mport_id) +{ + struct rio_mport *port; + + mutex_lock(&rio_mport_list_lock); + list_for_each_entry(port, &rio_mports, node) { + if (port->id == mport_id) + goto found; + } + port = NULL; +found: + mutex_unlock(&rio_mport_list_lock); + + return port; +} + /** * rio_register_scan - enumeration/discovery method registration interface * @mport_id: mport device ID for which fabric scan routine has to be set @@ -1475,7 +1499,7 @@ static void disc_work_handler(struct work_struct *_work) work = container_of(_work, struct rio_disc_work, work); pr_debug("RIO: discovery work for mport %d %s\n", work->mport->id, work->mport->name); - work->mport->nscan->discover(work->mport); + work->mport->nscan->discover(work->mport, 0); } int rio_init_mports(void) @@ -1495,7 +1519,7 @@ int rio_init_mports(void) list_for_each_entry(port, &rio_mports, node) { if (port->host_deviceid >= 0) { if (port->nscan) - port->nscan->enumerate(port); + port->nscan->enumerate(port, 0); } else n++; } diff --git a/drivers/rapidio/rio.h b/drivers/rapidio/rio.h index 0afdf482517e..c14f864dea5c 100644 --- a/drivers/rapidio/rio.h +++ b/drivers/rapidio/rio.h @@ -45,9 +45,11 @@ extern int rio_enable_rx_tx_port(struct rio_mport *port, int local, u16 destid, extern int rio_register_scan(int mport_id, struct rio_scan *scan_ops); extern int rio_unregister_scan(int mport_id); extern void rio_attach_device(struct rio_dev *rdev); +extern struct rio_mport *rio_find_mport(int mport_id); /* Structures internal to the RIO core code */ extern struct device_attribute rio_dev_attrs[]; +extern struct bus_attribute rio_bus_attrs[]; extern struct rio_switch_ops __start_rio_switch_ops[]; extern struct rio_switch_ops __end_rio_switch_ops[]; diff --git a/include/linux/rio.h b/include/linux/rio.h index cd3796ee7410..18e099342e6f 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -265,6 +265,11 @@ struct rio_mport { struct rio_scan *nscan; }; +/* + * Enumeration/discovery control flags + */ +#define RIO_SCAN_ENUM_NO_WAIT 0x00000001 /* Do not wait for enum completed */ + struct rio_id_table { u16 start; /* logical minimal id */ u32 max; /* max number of IDs in table */ @@ -467,8 +472,8 @@ static inline struct rio_mport *dma_to_mport(struct dma_device *ddev) * @discover: Callback to perform RapidIO fabric discovery. */ struct rio_scan { - int (*enumerate)(struct rio_mport *mport); - int (*discover)(struct rio_mport *mport); + int (*enumerate)(struct rio_mport *mport, u32 flags); + int (*discover)(struct rio_mport *mport, u32 flags); }; /* Architecture and hardware-specific functions */ -- cgit v1.2.3 From 4c663cfc523a88d97a8309b04a089c27dc57fd7e Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Fri, 24 May 2013 15:55:09 -0700 Subject: wait: fix false timeouts when using wait_event_timeout() Many callers of the wait_event_timeout() and wait_event_interruptible_timeout() expect that the return value will be positive if the specified condition becomes true before the timeout elapses. However, at the moment this isn't guaranteed. If the wake-up handler is delayed enough, the time remaining until timeout will be calculated as 0 - and passed back as a return value - even if the condition became true before the timeout has passed. Fix this by returning at least 1 if the condition becomes true. This semantic is in line with what wait_for_condition_timeout() does; see commit bb10ed09 ("sched: fix wait_for_completion_timeout() spurious failure under heavy load"). Daniel said "We have 3 instances of this bug in drm/i915. One case even where we switch between the interruptible and not interruptible wait_event_timeout variants, foolishly presuming they have the same semantics. I very much like this." One such bug is reported at https://bugs.freedesktop.org/show_bug.cgi?id=64133 Signed-off-by: Imre Deak Acked-by: Daniel Vetter Acked-by: David Howells Acked-by: Jens Axboe Cc: "Paul E. McKenney" Cc: Dave Jones Cc: Lukas Czerner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index ac38be2692d8..1133695eb067 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -217,6 +217,8 @@ do { \ if (!ret) \ break; \ } \ + if (!ret && (condition)) \ + ret = 1; \ finish_wait(&wq, &__wait); \ } while (0) @@ -233,8 +235,9 @@ do { \ * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * - * The function returns 0 if the @timeout elapsed, and the remaining - * jiffies if the condition evaluated to true before the timeout elapsed. + * The function returns 0 if the @timeout elapsed, or the remaining + * jiffies (at least 1) if the @condition evaluated to %true before + * the @timeout elapsed. */ #define wait_event_timeout(wq, condition, timeout) \ ({ \ @@ -302,6 +305,8 @@ do { \ ret = -ERESTARTSYS; \ break; \ } \ + if (!ret && (condition)) \ + ret = 1; \ finish_wait(&wq, &__wait); \ } while (0) @@ -318,9 +323,10 @@ do { \ * wake_up() has to be called after changing any variable that could * change the result of the wait condition. * - * The function returns 0 if the @timeout elapsed, -ERESTARTSYS if it - * was interrupted by a signal, and the remaining jiffies otherwise - * if the condition evaluated to true before the timeout elapsed. + * Returns: + * 0 if the @timeout elapsed, -%ERESTARTSYS if it was interrupted by + * a signal, or the remaining jiffies (at least 1) if the @condition + * evaluated to %true before the @timeout elapsed. */ #define wait_event_interruptible_timeout(wq, condition, timeout) \ ({ \ -- cgit v1.2.3 From 7450231fb35492951e78a91b833fd935171f4e66 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 24 May 2013 15:55:20 -0700 Subject: linux/kernel.h: fix kernel-doc warning Fix kernel-doc warning in : Warning(include/linux/kernel.h:590): No description found for parameter 'ip' scripts/kernel-doc cannot handle macros, functions, or function prototypes between the function or macro that is being documented and its definition, so move these prototypes above the function that is being documented. Signed-off-by: Randy Dunlap Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e96329ceb28c..e9ef6d6b51d5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -562,6 +562,9 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...); extern __printf(2, 3) int __trace_printk(unsigned long ip, const char *fmt, ...); +extern int __trace_bputs(unsigned long ip, const char *str); +extern int __trace_puts(unsigned long ip, const char *str, int size); + /** * trace_puts - write a string into the ftrace buffer * @str: the string to record @@ -587,8 +590,6 @@ int __trace_printk(unsigned long ip, const char *fmt, ...); * (1 when __trace_bputs is used, strlen(str) when __trace_puts is used) */ -extern int __trace_bputs(unsigned long ip, const char *str); -extern int __trace_puts(unsigned long ip, const char *str, int size); #define trace_puts(str) ({ \ static const char *trace_printk_fmt \ __attribute__((section("__trace_printk_fmt"))) = \ -- cgit v1.2.3 From 26cb63ad11e04047a64309362674bcbbd6a6f246 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 28 May 2013 10:55:48 +0200 Subject: perf: Fix perf mmap bugs Vince reported a problem found by his perf specific trinity fuzzer. Al noticed 2 problems with perf's mmap(): - it has issues against fork() since we use vma->vm_mm for accounting. - it has an rb refcount leak on double mmap(). We fix the issues against fork() by using VM_DONTCOPY; I don't think there's code out there that uses this; we didn't hear about weird accounting problems/crashes. If we do need this to work, the previously proposed VM_PINNED could make this work. Aside from the rb reference leak spotted by Al, Vince's example prog was indeed doing a double mmap() through the use of perf_event_set_output(). This exposes another problem, since we now have 2 events with one buffer, the accounting gets screwy because we account per event. Fix this by making the buffer responsible for its own accounting. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra Cc: Al Viro Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20130528085548.GA12193@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 3 +-- kernel/events/core.c | 37 ++++++++++++++++++++----------------- kernel/events/internal.h | 3 +++ 3 files changed, 24 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f463a46424e2..c5b6dbf9c2fc 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -389,8 +389,7 @@ struct perf_event { /* mmap bits */ struct mutex mmap_mutex; atomic_t mmap_count; - int mmap_locked; - struct user_struct *mmap_user; + struct ring_buffer *rb; struct list_head rb_entry; diff --git a/kernel/events/core.c b/kernel/events/core.c index 9dc297faf7c0..ae752cd4a086 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2917,7 +2917,7 @@ static void free_event_rcu(struct rcu_head *head) kfree(event); } -static void ring_buffer_put(struct ring_buffer *rb); +static bool ring_buffer_put(struct ring_buffer *rb); static void free_event(struct perf_event *event) { @@ -3582,13 +3582,13 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) return rb; } -static void ring_buffer_put(struct ring_buffer *rb) +static bool ring_buffer_put(struct ring_buffer *rb) { struct perf_event *event, *n; unsigned long flags; if (!atomic_dec_and_test(&rb->refcount)) - return; + return false; spin_lock_irqsave(&rb->event_lock, flags); list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { @@ -3598,6 +3598,7 @@ static void ring_buffer_put(struct ring_buffer *rb) spin_unlock_irqrestore(&rb->event_lock, flags); call_rcu(&rb->rcu_head, rb_free_rcu); + return true; } static void perf_mmap_open(struct vm_area_struct *vma) @@ -3612,18 +3613,20 @@ static void perf_mmap_close(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - unsigned long size = perf_data_size(event->rb); - struct user_struct *user = event->mmap_user; struct ring_buffer *rb = event->rb; + struct user_struct *mmap_user = rb->mmap_user; + int mmap_locked = rb->mmap_locked; + unsigned long size = perf_data_size(rb); - atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); - vma->vm_mm->pinned_vm -= event->mmap_locked; rcu_assign_pointer(event->rb, NULL); ring_buffer_detach(event, rb); mutex_unlock(&event->mmap_mutex); - ring_buffer_put(rb); - free_uid(user); + if (ring_buffer_put(rb)) { + atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); + vma->vm_mm->pinned_vm -= mmap_locked; + free_uid(mmap_user); + } } } @@ -3676,9 +3679,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->mmap_mutex); if (event->rb) { - if (event->rb->nr_pages == nr_pages) - atomic_inc(&event->rb->refcount); - else + if (event->rb->nr_pages != nr_pages) ret = -EINVAL; goto unlock; } @@ -3720,12 +3721,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ret = -ENOMEM; goto unlock; } - rcu_assign_pointer(event->rb, rb); + + rb->mmap_locked = extra; + rb->mmap_user = get_current_user(); atomic_long_add(user_extra, &user->locked_vm); - event->mmap_locked = extra; - event->mmap_user = get_current_user(); - vma->vm_mm->pinned_vm += event->mmap_locked; + vma->vm_mm->pinned_vm += extra; + + rcu_assign_pointer(event->rb, rb); perf_event_update_userpage(event); @@ -3734,7 +3737,7 @@ unlock: atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &perf_mmap_vmops; return ret; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index eb675c4d59df..5bc6c8e9b851 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -31,6 +31,9 @@ struct ring_buffer { spinlock_t event_lock; struct list_head event_list; + int mmap_locked; + struct user_struct *mmap_user; + struct perf_event_mmap_page *user_page; void *data_pages[0]; }; -- cgit v1.2.3 From 12bcbe66d7b3cc9f9f86cd02f925666eaa3c2107 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 May 2013 14:38:42 -0400 Subject: rcu: Add _notrace variation of rcu_dereference_raw() and hlist_for_each_entry_rcu() As rcu_dereference_raw() under RCU debug config options can add quite a bit of checks, and that tracing uses rcu_dereference_raw(), these checks happen with the function tracer. The function tracer also happens to trace these debug checks too. This added overhead can livelock the system. Add a new interface to RCU for both rcu_dereference_raw_notrace() as well as hlist_for_each_entry_rcu_notrace() as the hlist iterator uses the rcu_dereference_raw() as well, and is used a bit with the function tracer. Link: http://lkml.kernel.org/r/20130528184209.304356745@goodmis.org Acked-by: Paul E. McKenney Signed-off-by: Steven Rostedt --- include/linux/rculist.h | 20 ++++++++++++++++++++ include/linux/rcupdate.h | 9 +++++++++ 2 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 8089e35d47ac..f4b1001a4676 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -460,6 +460,26 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) +/** + * hlist_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing) + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as hlist_add_head_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + * + * This is the same as hlist_for_each_entry_rcu() except that it does + * not do any RCU debugging or tracing. + */ +#define hlist_for_each_entry_rcu_notrace(pos, head, member) \ + for (pos = hlist_entry_safe (rcu_dereference_raw_notrace(hlist_first_rcu(head)),\ + typeof(*(pos)), member); \ + pos; \ + pos = hlist_entry_safe(rcu_dereference_raw_notrace(hlist_next_rcu(\ + &(pos)->member)), typeof(*(pos)), member)) + /** * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 4ccd68e49b00..ddcc7826d907 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -640,6 +640,15 @@ static inline void rcu_preempt_sleep_check(void) #define rcu_dereference_raw(p) rcu_dereference_check(p, 1) /*@@@ needed? @@@*/ +/* + * The tracing infrastructure traces RCU (we want that), but unfortunately + * some of the RCU checks causes tracing to lock up the system. + * + * The tracing version of rcu_dereference_raw() must not call + * rcu_read_lock_held(). + */ +#define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu) + /** * rcu_access_index() - fetch RCU index with no dereferencing * @p: The index to read -- cgit v1.2.3 From ac4e97abce9b80c020e7113325f49e58b7b15e3f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 30 May 2013 09:19:35 +0200 Subject: scatterlist: sg_set_buf() argument must be in linear mapping Add a check behind CONFIG_DEBUG_SG to verify this. Signed-off-by: Rusty Russell Signed-off-by: Jens Axboe --- include/linux/scatterlist.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 5951e3f38878..26806775b11b 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -111,6 +111,9 @@ static inline struct page *sg_page(struct scatterlist *sg) static inline void sg_set_buf(struct scatterlist *sg, const void *buf, unsigned int buflen) { +#ifdef CONFIG_DEBUG_SG + BUG_ON(!virt_addr_valid(buf)); +#endif sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf)); } -- cgit v1.2.3 From 37448adfc7ce0d6d5892b87aa8d57edde4126f49 Mon Sep 17 00:00:00 2001 From: Lance Ortiz Date: Thu, 30 May 2013 08:25:12 -0600 Subject: aerdrv: Move cper_print_aer() call out of interrupt context The following warning was seen on 3.9 when a corrected PCIe error was being handled by the AER subsystem. WARNING: at .../drivers/pci/search.c:214 pci_get_dev_by_id+0x8a/0x90() This occurred because a call to pci_get_domain_bus_and_slot() was added to cper_print_pcie() to setup for the call to cper_print_aer(). The warning showed up because cper_print_pcie() is called in an interrupt context and pci_get* functions are not supposed to be called in that context. The solution is to move the cper_print_aer() call out of the interrupt context and into aer_recover_work_func() to avoid any warnings when calling pci_get* functions. Signed-off-by: Lance Ortiz Acked-by: Borislav Petkov Acked-by: Rafael J. Wysocki Signed-off-by: Tony Luck --- drivers/acpi/apei/cper.c | 18 ------------------ drivers/acpi/apei/ghes.c | 4 +++- drivers/pci/pcie/aer/aerdrv_core.c | 5 ++++- drivers/pci/pcie/aer/aerdrv_errprint.c | 4 ++-- include/linux/aer.h | 5 +++-- 5 files changed, 12 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/apei/cper.c b/drivers/acpi/apei/cper.c index fefc2ca7cc3e..33dc6a004802 100644 --- a/drivers/acpi/apei/cper.c +++ b/drivers/acpi/apei/cper.c @@ -250,10 +250,6 @@ static const char *cper_pcie_port_type_strs[] = { static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie, const struct acpi_hest_generic_data *gdata) { -#ifdef CONFIG_ACPI_APEI_PCIEAER - struct pci_dev *dev; -#endif - if (pcie->validation_bits & CPER_PCIE_VALID_PORT_TYPE) printk("%s""port_type: %d, %s\n", pfx, pcie->port_type, pcie->port_type < ARRAY_SIZE(cper_pcie_port_type_strs) ? @@ -285,20 +281,6 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie, printk( "%s""bridge: secondary_status: 0x%04x, control: 0x%04x\n", pfx, pcie->bridge.secondary_status, pcie->bridge.control); -#ifdef CONFIG_ACPI_APEI_PCIEAER - dev = pci_get_domain_bus_and_slot(pcie->device_id.segment, - pcie->device_id.bus, pcie->device_id.function); - if (!dev) { - pr_err("PCI AER Cannot get PCI device %04x:%02x:%02x.%d\n", - pcie->device_id.segment, pcie->device_id.bus, - pcie->device_id.slot, pcie->device_id.function); - return; - } - if (pcie->validation_bits & CPER_PCIE_VALID_AER_INFO) - cper_print_aer(pfx, dev, gdata->error_severity, - (struct aer_capability_regs *) pcie->aer_info); - pci_dev_put(dev); -#endif } static const char *apei_estatus_section_flag_strs[] = { diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index d668a8ae602b..403baf4dffc1 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -454,7 +454,9 @@ static void ghes_do_proc(struct ghes *ghes, aer_severity = cper_severity_to_aer(sev); aer_recover_queue(pcie_err->device_id.segment, pcie_err->device_id.bus, - devfn, aer_severity); + devfn, aer_severity, + (struct aer_capability_regs *) + pcie_err->aer_info); } } diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c index 8ec8b4f48560..0f4554e48cc5 100644 --- a/drivers/pci/pcie/aer/aerdrv_core.c +++ b/drivers/pci/pcie/aer/aerdrv_core.c @@ -580,6 +580,7 @@ struct aer_recover_entry u8 devfn; u16 domain; int severity; + struct aer_capability_regs *regs; }; static DEFINE_KFIFO(aer_recover_ring, struct aer_recover_entry, @@ -593,7 +594,7 @@ static DEFINE_SPINLOCK(aer_recover_ring_lock); static DECLARE_WORK(aer_recover_work, aer_recover_work_func); void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn, - int severity) + int severity, struct aer_capability_regs *aer_regs) { unsigned long flags; struct aer_recover_entry entry = { @@ -601,6 +602,7 @@ void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn, .devfn = devfn, .domain = domain, .severity = severity, + .regs = aer_regs, }; spin_lock_irqsave(&aer_recover_ring_lock, flags); @@ -627,6 +629,7 @@ static void aer_recover_work_func(struct work_struct *work) PCI_SLOT(entry.devfn), PCI_FUNC(entry.devfn)); continue; } + cper_print_aer(pdev, entry.severity, entry.regs); do_recovery(pdev, entry.severity); pci_dev_put(pdev); } diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c index 5ab14251839d..2c7c9f5f592c 100644 --- a/drivers/pci/pcie/aer/aerdrv_errprint.c +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c @@ -220,7 +220,7 @@ int cper_severity_to_aer(int cper_severity) } EXPORT_SYMBOL_GPL(cper_severity_to_aer); -void cper_print_aer(const char *prefix, struct pci_dev *dev, int cper_severity, +void cper_print_aer(struct pci_dev *dev, int cper_severity, struct aer_capability_regs *aer) { int aer_severity, layer, agent, status_strs_size, tlp_header_valid = 0; @@ -244,7 +244,7 @@ void cper_print_aer(const char *prefix, struct pci_dev *dev, int cper_severity, agent = AER_GET_AGENT(aer_severity, status); dev_err(&dev->dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, mask); - cper_print_bits(prefix, status, status_strs, status_strs_size); + cper_print_bits("", status, status_strs, status_strs_size); dev_err(&dev->dev, "aer_layer=%s, aer_agent=%s\n", aer_error_layer[layer], aer_agent_string[agent]); if (aer_severity != AER_CORRECTABLE) diff --git a/include/linux/aer.h b/include/linux/aer.h index ec10e1b24c1c..737f90ab4b62 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -49,10 +49,11 @@ static inline int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev) } #endif -extern void cper_print_aer(const char *prefix, struct pci_dev *dev, +extern void cper_print_aer(struct pci_dev *dev, int cper_severity, struct aer_capability_regs *aer); extern int cper_severity_to_aer(int cper_severity); extern void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn, - int severity); + int severity, + struct aer_capability_regs *aer_regs); #endif //_AER_H_ -- cgit v1.2.3 From 45eacc692771bd2b1ea3d384e6345cab3da10861 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 15 May 2013 22:16:32 +0200 Subject: vtime: Use consistent clocks among nohz accounting While computing the cputime delta of dynticks CPUs, we are mixing up clocks of differents natures: * local_clock() which takes care of unstable clock sources and fix these if needed. * sched_clock() which is the weaker version of local_clock(). It doesn't compute any fixup in case of unstable source. If the clock source is stable, those two clocks are the same and we can safely compute the difference against two random points. Otherwise it results in random deltas as sched_clock() can randomly drift away, back or forward, from local_clock(). As a consequence, some strange behaviour with unstable tsc has been observed such as non progressing constant zero cputime. (The 'top' command showing no load). Fix this by only using local_clock(), or its irq safe/remote equivalent, in vtime code. Reported-by: Mike Galbraith Suggested-by: Mike Galbraith Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- include/linux/vtime.h | 4 ++-- kernel/sched/core.c | 2 +- kernel/sched/cputime.c | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/vtime.h b/include/linux/vtime.h index 71a5782d8c59..b1dd2db80076 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -34,7 +34,7 @@ static inline void vtime_user_exit(struct task_struct *tsk) } extern void vtime_guest_enter(struct task_struct *tsk); extern void vtime_guest_exit(struct task_struct *tsk); -extern void vtime_init_idle(struct task_struct *tsk); +extern void vtime_init_idle(struct task_struct *tsk, int cpu); #else static inline void vtime_account_irq_exit(struct task_struct *tsk) { @@ -45,7 +45,7 @@ static inline void vtime_user_enter(struct task_struct *tsk) { } static inline void vtime_user_exit(struct task_struct *tsk) { } static inline void vtime_guest_enter(struct task_struct *tsk) { } static inline void vtime_guest_exit(struct task_struct *tsk) { } -static inline void vtime_init_idle(struct task_struct *tsk) { } +static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { } #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 58453b8272fd..e1a27f918723 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4745,7 +4745,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); - vtime_init_idle(idle); + vtime_init_idle(idle, cpu); #if defined(CONFIG_SMP) sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index cc2dc3eea8a3..b5ccba22603b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -747,17 +747,17 @@ void arch_vtime_task_switch(struct task_struct *prev) write_seqlock(¤t->vtime_seqlock); current->vtime_snap_whence = VTIME_SYS; - current->vtime_snap = sched_clock(); + current->vtime_snap = sched_clock_cpu(smp_processor_id()); write_sequnlock(¤t->vtime_seqlock); } -void vtime_init_idle(struct task_struct *t) +void vtime_init_idle(struct task_struct *t, int cpu) { unsigned long flags; write_seqlock_irqsave(&t->vtime_seqlock, flags); t->vtime_snap_whence = VTIME_SYS; - t->vtime_snap = sched_clock(); + t->vtime_snap = sched_clock_cpu(cpu); write_sequnlock_irqrestore(&t->vtime_seqlock, flags); } -- cgit v1.2.3 From 521921bad1192fb1b8f9b6a5aa673635848b8b5f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 May 2013 01:21:38 +0200 Subject: kvm: Move guest entry/exit APIs to context_tracking The kvm_host.h header file doesn't handle well inclusion when archs don't support KVM. This results in build crashes for such archs when they want to implement context tracking because this subsystem includes kvm_host.h in order to implement the guest_enter/exit APIs but it doesn't handle KVM off case. To fix this, move the guest_enter()/guest_exit() declarations and generic implementation to the context tracking headers. These generic APIs actually belong to this subsystem, besides other domains boundary tracking like user_enter() et al. KVM now properly becomes a user of this library, not the other buggy way around. Reported-by: Kevin Hilman Reviewed-by: Kevin Hilman Tested-by: Kevin Hilman Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Kevin Hilman Cc: Marcelo Tosatti Cc: Gleb Natapov Signed-off-by: Ingo Molnar --- include/linux/context_tracking.h | 35 +++++++++++++++++++++++++++++++++++ include/linux/kvm_host.h | 37 +------------------------------------ kernel/context_tracking.c | 1 - 3 files changed, 36 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 365f4a61bf04..fc09d7b0dacf 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -3,6 +3,7 @@ #include #include +#include #include struct context_tracking { @@ -19,6 +20,26 @@ struct context_tracking { } state; }; +static inline void __guest_enter(void) +{ + /* + * This is running in ioctl context so we can avoid + * the call to vtime_account() with its unnecessary idle check. + */ + vtime_account_system(current); + current->flags |= PF_VCPU; +} + +static inline void __guest_exit(void) +{ + /* + * This is running in ioctl context so we can avoid + * the call to vtime_account() with its unnecessary idle check. + */ + vtime_account_system(current); + current->flags &= ~PF_VCPU; +} + #ifdef CONFIG_CONTEXT_TRACKING DECLARE_PER_CPU(struct context_tracking, context_tracking); @@ -35,6 +56,9 @@ static inline bool context_tracking_active(void) extern void user_enter(void); extern void user_exit(void); +extern void guest_enter(void); +extern void guest_exit(void); + static inline enum ctx_state exception_enter(void) { enum ctx_state prev_ctx; @@ -57,6 +81,17 @@ extern void context_tracking_task_switch(struct task_struct *prev, static inline bool context_tracking_in_user(void) { return false; } static inline void user_enter(void) { } static inline void user_exit(void) { } + +static inline void guest_enter(void) +{ + __guest_enter(); +} + +static inline void guest_exit(void) +{ + __guest_exit(); +} + static inline enum ctx_state exception_enter(void) { return 0; } static inline void exception_exit(enum ctx_state prev_ctx) { } static inline void context_tracking_task_switch(struct task_struct *prev, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f0eea07d2c2b..8db53cfaccdb 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -760,42 +761,6 @@ static inline int kvm_iommu_unmap_guest(struct kvm *kvm) } #endif -static inline void __guest_enter(void) -{ - /* - * This is running in ioctl context so we can avoid - * the call to vtime_account() with its unnecessary idle check. - */ - vtime_account_system(current); - current->flags |= PF_VCPU; -} - -static inline void __guest_exit(void) -{ - /* - * This is running in ioctl context so we can avoid - * the call to vtime_account() with its unnecessary idle check. - */ - vtime_account_system(current); - current->flags &= ~PF_VCPU; -} - -#ifdef CONFIG_CONTEXT_TRACKING -extern void guest_enter(void); -extern void guest_exit(void); - -#else /* !CONFIG_CONTEXT_TRACKING */ -static inline void guest_enter(void) -{ - __guest_enter(); -} - -static inline void guest_exit(void) -{ - __guest_exit(); -} -#endif /* !CONFIG_CONTEXT_TRACKING */ - static inline void kvm_guest_enter(void) { unsigned long flags; diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 65349f07b878..85bdde1137eb 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -15,7 +15,6 @@ */ #include -#include #include #include #include -- cgit v1.2.3 From 1e2bd517c108816220f262d7954b697af03b5f9c Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Thu, 30 May 2013 06:45:27 +0000 Subject: udp6: Fix udp fragmentation for tunnel traffic. udp6 over GRE tunnel does not work after to GRE tso changes. GRE tso handler passes inner packet but keeps track of outer header start in SKB_GSO_CB(skb)->mac_offset. udp6 fragment need to take care of outer header, which start at the mac_offset, while adding fragment header. This bug is introduced by commit 68c3316311 (GRE: Add TCP segmentation offload for GRE). Reported-by: Dmitry Kravkov Signed-off-by: Pravin B Shelar Tested-by: Dmitry Kravkov Signed-off-by: David S. Miller --- include/linux/skbuff.h | 15 +++++++++++++++ net/ipv6/udp_offload.c | 20 ++++++++++++-------- 2 files changed, 27 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2e0ced1af3b1..9c676eae3968 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2852,6 +2852,21 @@ static inline int skb_tnl_header_len(const struct sk_buff *inner_skb) SKB_GSO_CB(inner_skb)->mac_offset; } +static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra) +{ + int new_headroom, headroom; + int ret; + + headroom = skb_headroom(skb); + ret = pskb_expand_head(skb, extra, 0, GFP_ATOMIC); + if (ret) + return ret; + + new_headroom = skb_headroom(skb); + SKB_GSO_CB(skb)->mac_offset += (new_headroom - headroom); + return 0; +} + static inline bool skb_is_gso(const struct sk_buff *skb) { return skb_shinfo(skb)->gso_size; diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 3bb3a891a424..d3cfaf9c7a08 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -46,11 +46,12 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, unsigned int mss; unsigned int unfrag_ip6hlen, unfrag_len; struct frag_hdr *fptr; - u8 *mac_start, *prevhdr; + u8 *packet_start, *prevhdr; u8 nexthdr; u8 frag_hdr_sz = sizeof(struct frag_hdr); int offset; __wsum csum; + int tnl_hlen; mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) @@ -83,9 +84,11 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, skb->ip_summed = CHECKSUM_NONE; /* Check if there is enough headroom to insert fragment header. */ - if ((skb_mac_header(skb) < skb->head + frag_hdr_sz) && - pskb_expand_head(skb, frag_hdr_sz, 0, GFP_ATOMIC)) - goto out; + tnl_hlen = skb_tnl_header_len(skb); + if (skb_headroom(skb) < (tnl_hlen + frag_hdr_sz)) { + if (gso_pskb_expand_head(skb, tnl_hlen + frag_hdr_sz)) + goto out; + } /* Find the unfragmentable header and shift it left by frag_hdr_sz * bytes to insert fragment header. @@ -93,11 +96,12 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr); nexthdr = *prevhdr; *prevhdr = NEXTHDR_FRAGMENT; - unfrag_len = skb_network_header(skb) - skb_mac_header(skb) + - unfrag_ip6hlen; - mac_start = skb_mac_header(skb); - memmove(mac_start-frag_hdr_sz, mac_start, unfrag_len); + unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) + + unfrag_ip6hlen + tnl_hlen; + packet_start = (u8 *) skb->head + SKB_GSO_CB(skb)->mac_offset; + memmove(packet_start-frag_hdr_sz, packet_start, unfrag_len); + SKB_GSO_CB(skb)->mac_offset -= frag_hdr_sz; skb->mac_header -= frag_hdr_sz; skb->network_header -= frag_hdr_sz; -- cgit v1.2.3 From 6d7581e62f8be462440d7b22c6361f7c9fa4902b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 29 May 2013 05:02:56 +0000 Subject: list: introduce list_first_entry_or_null non-rcu variant of list_first_or_null_rcu Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/list.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 6a1f8df9144b..b83e5657365a 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -361,6 +361,17 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) +/** + * list_first_entry_or_null - get the first element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + * + * Note that if the list is empty, it returns NULL. + */ +#define list_first_entry_or_null(ptr, type, member) \ + (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL) + /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. -- cgit v1.2.3 From c87a124a5d5e8cf8e21c4363c3372bcaf53ea190 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 May 2013 09:06:27 +0000 Subject: net: force a reload of first item in hlist_nulls_for_each_entry_rcu Roman Gushchin discovered that udp4_lib_lookup2() was not reloading first item in the rcu protected list, in case the loop was restarted. This produced soft lockups as in https://lkml.org/lkml/2013/4/16/37 rcu_dereference(X)/ACCESS_ONCE(X) seem to not work as intended if X is ptr->field : In some cases, gcc caches the value or ptr->field in a register. Use a barrier() to disallow such caching, as documented in Documentation/atomic_ops.txt line 114 Thanks a lot to Roman for providing analysis and numerous patches. Diagnosed-by: Roman Gushchin Signed-off-by: Eric Dumazet Reported-by: Boris Zhmurov Signed-off-by: Roman Gushchin Acked-by: Paul E. McKenney Signed-off-by: David S. Miller --- include/linux/rculist_nulls.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index 2ae13714828b..1c33dd7da4a7 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -105,9 +105,14 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, * @head: the head for your list. * @member: the name of the hlist_nulls_node within the struct. * + * The barrier() is needed to make sure compiler doesn't cache first element [1], + * as this loop can be restarted [2] + * [1] Documentation/atomic_ops.txt around line 114 + * [2] Documentation/RCU/rculist_nulls.txt around line 146 */ #define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ - for (pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ + for (({barrier();}), \ + pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ (!is_a_nulls(pos)) && \ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos))) -- cgit v1.2.3 From a7526eb5d06b0084ef12d7b168d008fcf516caab Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 5 Jun 2013 19:38:26 +0000 Subject: net: Unbreak compat_sys_{send,recv}msg I broke them in this commit: commit 1be374a0518a288147c6a7398792583200a67261 Author: Andy Lutomirski Date: Wed May 22 14:07:44 2013 -0700 net: Block MSG_CMSG_COMPAT in send(m)msg and recv(m)msg This patch adds __sys_sendmsg and __sys_sendmsg as common helpers that accept MSG_CMSG_COMPAT and blocks MSG_CMSG_COMPAT at the syscall entrypoints. It also reverts some unnecessary checks in sys_socketcall. Apparently I was suffering from underscore blindness the first time around. Signed-off-by: Andy Lutomirski Tested-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/socket.h | 3 +++ net/compat.c | 13 +++++++-- net/socket.c | 72 +++++++++++++++++++++++--------------------------- 3 files changed, 47 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 33bf2dfab19d..b10ce4b341ea 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -320,6 +320,9 @@ extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); struct timespec; +/* The __sys_...msg variants allow MSG_CMSG_COMPAT */ +extern long __sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags); +extern long __sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags); extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags, struct timespec *timeout); extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, diff --git a/net/compat.c b/net/compat.c index 79ae88485001..f0a1ba6c8086 100644 --- a/net/compat.c +++ b/net/compat.c @@ -734,19 +734,25 @@ static unsigned char nas[21] = { asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned int flags) { - return sys_sendmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + return __sys_sendmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); } asmlinkage long compat_sys_sendmmsg(int fd, struct compat_mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags) { + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; return __sys_sendmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT); } asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, unsigned int flags) { - return sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + return __sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); } asmlinkage long compat_sys_recv(int fd, void __user *buf, size_t len, unsigned int flags) @@ -768,6 +774,9 @@ asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, int datagrams; struct timespec ktspec; + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + if (COMPAT_USE_64BIT_TIME) return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, flags | MSG_CMSG_COMPAT, diff --git a/net/socket.c b/net/socket.c index 9ff6366fee13..4ca1526db756 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1956,7 +1956,7 @@ struct used_address { unsigned int name_len; }; -static int __sys_sendmsg(struct socket *sock, struct msghdr __user *msg, +static int ___sys_sendmsg(struct socket *sock, struct msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, struct used_address *used_address) { @@ -2071,26 +2071,30 @@ out: * BSD sendmsg interface */ -SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned int, flags) +long __sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags) { int fput_needed, err; struct msghdr msg_sys; struct socket *sock; - if (flags & MSG_CMSG_COMPAT) - return -EINVAL; - sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; - err = __sys_sendmsg(sock, msg, &msg_sys, flags, NULL); + err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL); fput_light(sock->file, fput_needed); out: return err; } +SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned int, flags) +{ + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + return __sys_sendmsg(fd, msg, flags); +} + /* * Linux sendmmsg interface */ @@ -2121,15 +2125,16 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, while (datagrams < vlen) { if (MSG_CMSG_COMPAT & flags) { - err = __sys_sendmsg(sock, (struct msghdr __user *)compat_entry, - &msg_sys, flags, &used_address); + err = ___sys_sendmsg(sock, (struct msghdr __user *)compat_entry, + &msg_sys, flags, &used_address); if (err < 0) break; err = __put_user(err, &compat_entry->msg_len); ++compat_entry; } else { - err = __sys_sendmsg(sock, (struct msghdr __user *)entry, - &msg_sys, flags, &used_address); + err = ___sys_sendmsg(sock, + (struct msghdr __user *)entry, + &msg_sys, flags, &used_address); if (err < 0) break; err = put_user(err, &entry->msg_len); @@ -2158,7 +2163,7 @@ SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg, return __sys_sendmmsg(fd, mmsg, vlen, flags); } -static int __sys_recvmsg(struct socket *sock, struct msghdr __user *msg, +static int ___sys_recvmsg(struct socket *sock, struct msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, int nosec) { struct compat_msghdr __user *msg_compat = @@ -2250,27 +2255,31 @@ out: * BSD recvmsg interface */ -SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg, - unsigned int, flags) +long __sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags) { int fput_needed, err; struct msghdr msg_sys; struct socket *sock; - if (flags & MSG_CMSG_COMPAT) - return -EINVAL; - sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; - err = __sys_recvmsg(sock, msg, &msg_sys, flags, 0); + err = ___sys_recvmsg(sock, msg, &msg_sys, flags, 0); fput_light(sock->file, fput_needed); out: return err; } +SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg, + unsigned int, flags) +{ + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + return __sys_recvmsg(fd, msg, flags); +} + /* * Linux recvmmsg interface */ @@ -2308,17 +2317,18 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, * No need to ask LSM for more than the first datagram. */ if (MSG_CMSG_COMPAT & flags) { - err = __sys_recvmsg(sock, (struct msghdr __user *)compat_entry, - &msg_sys, flags & ~MSG_WAITFORONE, - datagrams); + err = ___sys_recvmsg(sock, (struct msghdr __user *)compat_entry, + &msg_sys, flags & ~MSG_WAITFORONE, + datagrams); if (err < 0) break; err = __put_user(err, &compat_entry->msg_len); ++compat_entry; } else { - err = __sys_recvmsg(sock, (struct msghdr __user *)entry, - &msg_sys, flags & ~MSG_WAITFORONE, - datagrams); + err = ___sys_recvmsg(sock, + (struct msghdr __user *)entry, + &msg_sys, flags & ~MSG_WAITFORONE, + datagrams); if (err < 0) break; err = put_user(err, &entry->msg_len); @@ -2505,31 +2515,15 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) (int __user *)a[4]); break; case SYS_SENDMSG: - if (a[2] & MSG_CMSG_COMPAT) { - err = -EINVAL; - break; - } err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]); break; case SYS_SENDMMSG: - if (a[3] & MSG_CMSG_COMPAT) { - err = -EINVAL; - break; - } err = sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3]); break; case SYS_RECVMSG: - if (a[2] & MSG_CMSG_COMPAT) { - err = -EINVAL; - break; - } err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]); break; case SYS_RECVMMSG: - if (a[3] & MSG_CMSG_COMPAT) { - err = -EINVAL; - break; - } err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3], (struct timespec __user *)a[4]); break; -- cgit v1.2.3 From d62840995a99c9766803d54e9d7923f247a1c1db Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 22 May 2013 02:41:36 -0700 Subject: trace: Allow idle-safe tracepoints to be called from irq __DECLARE_TRACE_RCU() currently creates an _rcuidle() tracepoint which may safely be invoked from what RCU considers to be an idle CPU. However, these _rcuidle() tracepoints may -not- be invoked from the handler of an irq taken from idle, because rcu_idle_enter() zeroes RCU's nesting-level counter, so that the rcu_irq_exit() returning to idle will trigger a WARN_ON_ONCE(). This commit therefore substitutes rcu_irq_enter() for rcu_idle_exit() and rcu_irq_exit() for rcu_idle_enter() in order to make the _rcuidle() tracepoints usable from irq handlers as well as from process context. Reported-by: Dave Jones Signed-off-by: Paul E. McKenney Cc: Steven Rostedt --- include/linux/tracepoint.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 2f322c38bd4d..f8e084d0fc77 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -145,8 +145,8 @@ static inline void tracepoint_synchronize_unregister(void) TP_PROTO(data_proto), \ TP_ARGS(data_args), \ TP_CONDITION(cond), \ - rcu_idle_exit(), \ - rcu_idle_enter()); \ + rcu_irq_enter(), \ + rcu_irq_exit()); \ } #else #define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) -- cgit v1.2.3 From ed13998c319b050fc9abdb73915859dfdbe1fb38 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 5 Jun 2013 15:30:55 +0200 Subject: sock_diag: fix filter code sent to userspace Filters need to be translated to real BPF code for userland, like SO_GETFILTER. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/linux/filter.h | 1 + net/core/filter.c | 2 +- net/core/sock_diag.c | 9 +++++++-- 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index c050dcc322a4..f65f5a69db8f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -46,6 +46,7 @@ extern int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); extern int sk_detach_filter(struct sock *sk); extern int sk_chk_filter(struct sock_filter *filter, unsigned int flen); extern int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned len); +extern void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to); #ifdef CONFIG_BPF_JIT #include diff --git a/net/core/filter.c b/net/core/filter.c index dad2a178f9f8..6438f29ff266 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -778,7 +778,7 @@ int sk_detach_filter(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_detach_filter); -static void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to) +void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to) { static const u16 decodes[] = { [BPF_S_ALU_ADD_K] = BPF_ALU|BPF_ADD|BPF_K, diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index d5bef0b0f639..a0e9cf6379de 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -73,8 +73,13 @@ int sock_diag_put_filterinfo(struct user_namespace *user_ns, struct sock *sk, goto out; } - if (filter) - memcpy(nla_data(attr), filter->insns, len); + if (filter) { + struct sock_filter *fb = (struct sock_filter *)nla_data(attr); + int i; + + for (i = 0; i < filter->len; i++, fb++) + sk_decode_filter(&filter->insns[i], fb); + } out: rcu_read_unlock(); -- cgit v1.2.3 From b79462a8b9f9a452edc20c64a70a89ba3b0a6a88 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 8 Jun 2013 15:00:55 +0200 Subject: team: fix checks in team_get_first_port_txable_rcu() should be checked if "cur" is txable, not "port". Introduced by commit 6e88e1357c "team: use function team_port_txable() for determing enabled and up port" Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/if_team.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_team.h b/include/linux/if_team.h index 4474557904f6..16fae6436d0e 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -249,12 +249,12 @@ team_get_first_port_txable_rcu(struct team *team, struct team_port *port) return port; cur = port; list_for_each_entry_continue_rcu(cur, &team->port_list, list) - if (team_port_txable(port)) + if (team_port_txable(cur)) return cur; list_for_each_entry_rcu(cur, &team->port_list, list) { if (cur == port) break; - if (team_port_txable(port)) + if (team_port_txable(cur)) return cur; } return NULL; -- cgit v1.2.3 From 16e53dbf10a2d7e228709a7286310e629ede5e45 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 12 Jun 2013 14:04:36 -0700 Subject: CPU hotplug: provide a generic helper to disable/enable CPU hotplug There are instances in the kernel where we would like to disable CPU hotplug (from sysfs) during some important operation. Today the freezer code depends on this and the code to do it was kinda tailor-made for that. Restructure the code and make it generic enough to be useful for other usecases too. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Robin Holt Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Russ Anderson Cc: Robin Holt Cc: Russell King Cc: Guan Xuetao Cc: Shawn Guo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpu.h | 4 ++++ kernel/cpu.c | 55 ++++++++++++++++++++++------------------------------- 2 files changed, 27 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index c6f6e0839b61..9f3c7e81270a 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -175,6 +175,8 @@ extern struct bus_type cpu_subsys; extern void get_online_cpus(void); extern void put_online_cpus(void); +extern void cpu_hotplug_disable(void); +extern void cpu_hotplug_enable(void); #define hotcpu_notifier(fn, pri) cpu_notifier(fn, pri) #define register_hotcpu_notifier(nb) register_cpu_notifier(nb) #define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb) @@ -198,6 +200,8 @@ static inline void cpu_hotplug_driver_unlock(void) #define get_online_cpus() do { } while (0) #define put_online_cpus() do { } while (0) +#define cpu_hotplug_disable() do { } while (0) +#define cpu_hotplug_enable() do { } while (0) #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0) /* These aren't inline functions due to a GCC bug. */ #define register_hotcpu_notifier(nb) ({ (void)(nb); 0; }) diff --git a/kernel/cpu.c b/kernel/cpu.c index b5e4ab2d427e..198a38883e64 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -133,6 +133,27 @@ static void cpu_hotplug_done(void) mutex_unlock(&cpu_hotplug.lock); } +/* + * Wait for currently running CPU hotplug operations to complete (if any) and + * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects + * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the + * hotplug path before performing hotplug operations. So acquiring that lock + * guarantees mutual exclusion from any currently running hotplug operations. + */ +void cpu_hotplug_disable(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 1; + cpu_maps_update_done(); +} + +void cpu_hotplug_enable(void) +{ + cpu_maps_update_begin(); + cpu_hotplug_disabled = 0; + cpu_maps_update_done(); +} + #else /* #if CONFIG_HOTPLUG_CPU */ static void cpu_hotplug_begin(void) {} static void cpu_hotplug_done(void) {} @@ -540,36 +561,6 @@ static int __init alloc_frozen_cpus(void) } core_initcall(alloc_frozen_cpus); -/* - * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU - * hotplug when tasks are about to be frozen. Also, don't allow the freezer - * to continue until any currently running CPU hotplug operation gets - * completed. - * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the - * 'cpu_add_remove_lock'. And this same lock is also taken by the regular - * CPU hotplug path and released only after it is complete. Thus, we - * (and hence the freezer) will block here until any currently running CPU - * hotplug operation gets completed. - */ -void cpu_hotplug_disable_before_freeze(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 1; - cpu_maps_update_done(); -} - - -/* - * When tasks have been thawed, re-enable regular CPU hotplug (which had been - * disabled while beginning to freeze tasks). - */ -void cpu_hotplug_enable_after_thaw(void) -{ - cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; - cpu_maps_update_done(); -} - /* * When callbacks for CPU hotplug notifications are being executed, we must * ensure that the state of the system with respect to the tasks being frozen @@ -589,12 +580,12 @@ cpu_hotplug_pm_callback(struct notifier_block *nb, case PM_SUSPEND_PREPARE: case PM_HIBERNATION_PREPARE: - cpu_hotplug_disable_before_freeze(); + cpu_hotplug_disable(); break; case PM_POST_SUSPEND: case PM_POST_HIBERNATION: - cpu_hotplug_enable_after_thaw(); + cpu_hotplug_enable(); break; default: -- cgit v1.2.3 From 637241a900cbd982f744d44646b48a273d609b34 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 12 Jun 2013 14:04:39 -0700 Subject: kmsg: honor dmesg_restrict sysctl on /dev/kmsg The dmesg_restrict sysctl currently covers the syslog method for access dmesg, however /dev/kmsg isn't covered by the same protections. Most people haven't noticed because util-linux dmesg(1) defaults to using the syslog method for access in older versions. With util-linux dmesg(1) defaults to reading directly from /dev/kmsg. To fix /dev/kmsg, let's compare the existing interfaces and what they allow: - /proc/kmsg allows: - open (SYSLOG_ACTION_OPEN) if CAP_SYSLOG since it uses a destructive single-reader interface (SYSLOG_ACTION_READ). - everything, after an open. - syslog syscall allows: - anything, if CAP_SYSLOG. - SYSLOG_ACTION_READ_ALL and SYSLOG_ACTION_SIZE_BUFFER, if dmesg_restrict==0. - nothing else (EPERM). The use-cases were: - dmesg(1) needs to do non-destructive SYSLOG_ACTION_READ_ALLs. - sysklog(1) needs to open /proc/kmsg, drop privs, and still issue the destructive SYSLOG_ACTION_READs. AIUI, dmesg(1) is moving to /dev/kmsg, and systemd-journald doesn't clear the ring buffer. Based on the comments in devkmsg_llseek, it sounds like actions besides reading aren't going to be supported by /dev/kmsg (i.e. SYSLOG_ACTION_CLEAR), so we have a strict subset of the non-destructive syslog syscall actions. To this end, move the check as Josh had done, but also rename the constants to reflect their new uses (SYSLOG_FROM_CALL becomes SYSLOG_FROM_READER, and SYSLOG_FROM_FILE becomes SYSLOG_FROM_PROC). SYSLOG_FROM_READER allows non-destructive actions, and SYSLOG_FROM_PROC allows destructive actions after a capabilities-constrained SYSLOG_ACTION_OPEN check. - /dev/kmsg allows: - open if CAP_SYSLOG or dmesg_restrict==0 - reading/polling, after open Addresses https://bugzilla.redhat.com/show_bug.cgi?id=903192 [akpm@linux-foundation.org: use pr_warn_once()] Signed-off-by: Kees Cook Reported-by: Christian Kujau Tested-by: Josh Boyer Cc: Kay Sievers Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kmsg.c | 10 +++--- include/linux/syslog.h | 4 +-- kernel/printk.c | 91 +++++++++++++++++++++++++++----------------------- 3 files changed, 57 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index bd4b5a740ff1..bdfabdaefdce 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -21,12 +21,12 @@ extern wait_queue_head_t log_wait; static int kmsg_open(struct inode * inode, struct file * file) { - return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC); } static int kmsg_release(struct inode * inode, struct file * file) { - (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE); + (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_PROC); return 0; } @@ -34,15 +34,15 @@ static ssize_t kmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { if ((file->f_flags & O_NONBLOCK) && - !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) + !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) return -EAGAIN; - return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC); } static unsigned int kmsg_poll(struct file *file, poll_table *wait) { poll_wait(file, &log_wait, wait); - if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) + if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) return POLLIN | POLLRDNORM; return 0; } diff --git a/include/linux/syslog.h b/include/linux/syslog.h index 38911391a139..98a3153c0f96 100644 --- a/include/linux/syslog.h +++ b/include/linux/syslog.h @@ -44,8 +44,8 @@ /* Return size of the log buffer */ #define SYSLOG_ACTION_SIZE_BUFFER 10 -#define SYSLOG_FROM_CALL 0 -#define SYSLOG_FROM_FILE 1 +#define SYSLOG_FROM_READER 0 +#define SYSLOG_FROM_PROC 1 int do_syslog(int type, char __user *buf, int count, bool from_file); diff --git a/kernel/printk.c b/kernel/printk.c index fa36e1494420..8212c1aef125 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -363,6 +363,53 @@ static void log_store(int facility, int level, log_next_seq++; } +#ifdef CONFIG_SECURITY_DMESG_RESTRICT +int dmesg_restrict = 1; +#else +int dmesg_restrict; +#endif + +static int syslog_action_restricted(int type) +{ + if (dmesg_restrict) + return 1; + /* + * Unless restricted, we allow "read all" and "get buffer size" + * for everybody. + */ + return type != SYSLOG_ACTION_READ_ALL && + type != SYSLOG_ACTION_SIZE_BUFFER; +} + +static int check_syslog_permissions(int type, bool from_file) +{ + /* + * If this is from /proc/kmsg and we've already opened it, then we've + * already done the capabilities checks at open time. + */ + if (from_file && type != SYSLOG_ACTION_OPEN) + return 0; + + if (syslog_action_restricted(type)) { + if (capable(CAP_SYSLOG)) + return 0; + /* + * For historical reasons, accept CAP_SYS_ADMIN too, with + * a warning. + */ + if (capable(CAP_SYS_ADMIN)) { + pr_warn_once("%s (%d): Attempt to access syslog with " + "CAP_SYS_ADMIN but no CAP_SYSLOG " + "(deprecated).\n", + current->comm, task_pid_nr(current)); + return 0; + } + return -EPERM; + } + return security_syslog(type); +} + + /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { u64 seq; @@ -620,7 +667,8 @@ static int devkmsg_open(struct inode *inode, struct file *file) if ((file->f_flags & O_ACCMODE) == O_WRONLY) return 0; - err = security_syslog(SYSLOG_ACTION_READ_ALL); + err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL, + SYSLOG_FROM_READER); if (err) return err; @@ -813,45 +861,6 @@ static inline void boot_delay_msec(int level) } #endif -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif - -static int syslog_action_restricted(int type) -{ - if (dmesg_restrict) - return 1; - /* Unless restricted, we allow "read all" and "get buffer size" for everybody */ - return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; -} - -static int check_syslog_permissions(int type, bool from_file) -{ - /* - * If this is from /proc/kmsg and we've already opened it, then we've - * already done the capabilities checks at open time. - */ - if (from_file && type != SYSLOG_ACTION_OPEN) - return 0; - - if (syslog_action_restricted(type)) { - if (capable(CAP_SYSLOG)) - return 0; - /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ - if (capable(CAP_SYS_ADMIN)) { - printk_once(KERN_WARNING "%s (%d): " - "Attempt to access syslog with CAP_SYS_ADMIN " - "but no CAP_SYSLOG (deprecated).\n", - current->comm, task_pid_nr(current)); - return 0; - } - return -EPERM; - } - return 0; -} - #if defined(CONFIG_PRINTK_TIME) static bool printk_time = 1; #else @@ -1249,7 +1258,7 @@ out: SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { - return do_syslog(type, buf, len, SYSLOG_FROM_CALL); + return do_syslog(type, buf, len, SYSLOG_FROM_READER); } /* -- cgit v1.2.3 From 30dad30922ccc733cfdbfe232090cf674dc374dc Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 12 Jun 2013 14:05:04 -0700 Subject: mm: migration: add migrate_entry_wait_huge() When we have a page fault for the address which is backed by a hugepage under migration, the kernel can't wait correctly and do busy looping on hugepage fault until the migration finishes. As a result, users who try to kick hugepage migration (via soft offlining, for example) occasionally experience long delay or soft lockup. This is because pte_offset_map_lock() can't get a correct migration entry or a correct page table lock for hugepage. This patch introduces migration_entry_wait_huge() to solve this. Signed-off-by: Naoya Horiguchi Reviewed-by: Rik van Riel Reviewed-by: Wanpeng Li Reviewed-by: Michal Hocko Cc: Mel Gorman Cc: Andi Kleen Cc: KOSAKI Motohiro Cc: [2.6.35+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swapops.h | 3 +++ mm/hugetlb.c | 2 +- mm/migrate.c | 23 ++++++++++++++++++----- 3 files changed, 22 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 47ead515c811..c5fd30d2a415 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -137,6 +137,7 @@ static inline void make_migration_entry_read(swp_entry_t *entry) extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); +extern void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte); #else #define make_migration_entry(page, write) swp_entry(0, 0) @@ -148,6 +149,8 @@ static inline int is_migration_entry(swp_entry_t swp) static inline void make_migration_entry_read(swp_entry_t *entryp) { } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } +static inline void migration_entry_wait_huge(struct mm_struct *mm, + pte_t *pte) { } static inline int is_write_migration_entry(swp_entry_t entry) { return 0; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f8feeeca6686..e2bfbf73a551 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2839,7 +2839,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { - migration_entry_wait(mm, (pmd_t *)ptep, address); + migration_entry_wait_huge(mm, ptep); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | diff --git a/mm/migrate.c b/mm/migrate.c index b1f57501de9c..6f0c24438bba 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -200,15 +200,14 @@ static void remove_migration_ptes(struct page *old, struct page *new) * get to the page and wait until migration is finished. * When we return from this function the fault will be retried. */ -void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, - unsigned long address) +static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, + spinlock_t *ptl) { - pte_t *ptep, pte; - spinlock_t *ptl; + pte_t pte; swp_entry_t entry; struct page *page; - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + spin_lock(ptl); pte = *ptep; if (!is_swap_pte(pte)) goto out; @@ -236,6 +235,20 @@ out: pte_unmap_unlock(ptep, ptl); } +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + spinlock_t *ptl = pte_lockptr(mm, pmd); + pte_t *ptep = pte_offset_map(pmd, address); + __migration_entry_wait(mm, ptep, ptl); +} + +void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte) +{ + spinlock_t *ptl = &(mm)->page_table_lock; + __migration_entry_wait(mm, pte, ptl); +} + #ifdef CONFIG_BLOCK /* Returns true if all buffers are successfully locked */ static bool buffer_migrate_lock_buffers(struct buffer_head *head, -- cgit v1.2.3 From c2853c8df57f49620d26f317d7d43347c29bfc2e Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Wed, 12 Jun 2013 14:05:10 -0700 Subject: include/linux/math64.h: add div64_ul() There is div64_long() to handle the s64/long division, but no mocro do u64/ul division. It is necessary in some scenarios, so add this function. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Alex Shi Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/math64.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/math64.h b/include/linux/math64.h index b8ba85544721..2913b86eb12a 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -6,7 +6,8 @@ #if BITS_PER_LONG == 64 -#define div64_long(x,y) div64_s64((x),(y)) +#define div64_long(x, y) div64_s64((x), (y)) +#define div64_ul(x, y) div64_u64((x), (y)) /** * div_u64_rem - unsigned 64bit divide with 32bit divisor with remainder @@ -47,7 +48,8 @@ static inline s64 div64_s64(s64 dividend, s64 divisor) #elif BITS_PER_LONG == 32 -#define div64_long(x,y) div_s64((x),(y)) +#define div64_long(x, y) div_s64((x), (y)) +#define div64_ul(x, y) div_u64((x), (y)) #ifndef div_u64_rem static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) -- cgit v1.2.3 From f21afc25f9ed45b8ffe200d0f071b0caec3ed2ef Mon Sep 17 00:00:00 2001 From: David Daney Date: Fri, 14 Jun 2013 11:13:59 -0700 Subject: smp.h: Use local_irq_{save,restore}() in !SMP version of on_each_cpu(). Thanks to commit f91eb62f71b3 ("init: scream bloody murder if interrupts are enabled too early"), "bloody murder" is now being screamed. With a MIPS OCTEON config, we use on_each_cpu() in our irq_chip.irq_bus_sync_unlock() function. This gets called in early as a result of the time_init() call. Because the !SMP version of on_each_cpu() unconditionally enables irqs, we get: WARNING: at init/main.c:560 start_kernel+0x250/0x410() Interrupts were enabled early CPU: 0 PID: 0 Comm: swapper Not tainted 3.10.0-rc5-Cavium-Octeon+ #801 Call Trace: show_stack+0x68/0x80 warn_slowpath_common+0x78/0xb0 warn_slowpath_fmt+0x38/0x48 start_kernel+0x250/0x410 Suggested fix: Do what we already do in the SMP version of on_each_cpu(), and use local_irq_save/local_irq_restore. Because we need a flags variable, make it a static inline to avoid name space issues. [ Change from v1: Convert on_each_cpu to a static inline function, add #include to avoid build breakage on some files. on_each_cpu_mask() and on_each_cpu_cond() suffer the same problem as on_each_cpu(), but they are not causing !SMP bugs for me, so I will defer changing them to a less urgent patch. ] Signed-off-by: David Daney Cc: Ralf Baechle Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index e6564c1dc552..c8488763277f 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -11,6 +11,7 @@ #include #include #include +#include extern void cpu_idle(void); @@ -139,13 +140,17 @@ static inline int up_smp_call_function(smp_call_func_t func, void *info) } #define smp_call_function(func, info, wait) \ (up_smp_call_function(func, info)) -#define on_each_cpu(func,info,wait) \ - ({ \ - local_irq_disable(); \ - func(info); \ - local_irq_enable(); \ - 0; \ - }) + +static inline int on_each_cpu(smp_call_func_t func, void *info, int wait) +{ + unsigned long flags; + + local_irq_save(flags); + func(info); + local_irq_restore(flags); + return 0; +} + /* * Note we still need to test the mask even for UP * because we actually can get an empty mask from -- cgit v1.2.3 From 29bb9e5a75684106a37593ad75ec75ff8312731b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 May 2013 15:23:40 -0400 Subject: tracing/context-tracking: Add preempt_schedule_context() for tracing Dave Jones hit the following bug report: =============================== [ INFO: suspicious RCU usage. ] 3.10.0-rc2+ #1 Not tainted ------------------------------- include/linux/rcupdate.h:771 rcu_read_lock() used illegally while idle! other info that might help us debug this: RCU used illegally from idle CPU! rcu_scheduler_active = 1, debug_locks = 0 RCU used illegally from extended quiescent state! 2 locks held by cc1/63645: #0: (&rq->lock){-.-.-.}, at: [] __schedule+0xed/0x9b0 #1: (rcu_read_lock){.+.+..}, at: [] cpuacct_charge+0x5/0x1f0 CPU: 1 PID: 63645 Comm: cc1 Not tainted 3.10.0-rc2+ #1 [loadavg: 40.57 27.55 13.39 25/277 64369] Hardware name: Gigabyte Technology Co., Ltd. GA-MA78GM-S2H/GA-MA78GM-S2H, BIOS F12a 04/23/2010 0000000000000000 ffff88010f78fcf8 ffffffff816ae383 ffff88010f78fd28 ffffffff810b698d ffff88011c092548 000000000023d073 ffff88011c092500 0000000000000001 ffff88010f78fd60 ffffffff8109d7c5 ffffffff8109d645 Call Trace: [] dump_stack+0x19/0x1b [] lockdep_rcu_suspicious+0xfd/0x130 [] cpuacct_charge+0x185/0x1f0 [] ? cpuacct_charge+0x5/0x1f0 [] update_curr+0xec/0x240 [] put_prev_task_fair+0x228/0x480 [] __schedule+0x161/0x9b0 [] preempt_schedule+0x51/0x80 [] ? __cond_resched_softirq+0x60/0x60 [] ? retint_careful+0x12/0x2e [] ftrace_ops_control_func+0x1dc/0x210 [] ftrace_call+0x5/0x2f [] ? retint_careful+0xb/0x2e [] ? schedule_user+0x5/0x70 [] ? schedule_user+0x5/0x70 [] ? retint_careful+0x12/0x2e ------------[ cut here ]------------ What happened was that the function tracer traced the schedule_user() code that tells RCU that the system is coming back from userspace, and to add the CPU back to the RCU monitoring. Because the function tracer does a preempt_disable/enable_notrace() calls the preempt_enable_notrace() checks the NEED_RESCHED flag. If it is set, then preempt_schedule() is called. But this is called before the user_exit() function can inform the kernel that the CPU is no longer in user mode and needs to be accounted for by RCU. The fix is to create a new preempt_schedule_context() that checks if the kernel is still in user mode and if so to switch it to kernel mode before calling schedule. It also switches back to user mode coming back from schedule in need be. The only user of this currently is the preempt_enable_notrace(), which is only used by the tracing subsystem. Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1369423420.6828.226.camel@gandalf.local.home Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 18 +++++++++++++++++- kernel/context_tracking.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 87a03c746f17..f5d4723cdb3d 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -33,9 +33,25 @@ do { \ preempt_schedule(); \ } while (0) +#ifdef CONFIG_CONTEXT_TRACKING + +void preempt_schedule_context(void); + +#define preempt_check_resched_context() \ +do { \ + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ + preempt_schedule_context(); \ +} while (0) +#else + +#define preempt_check_resched_context() preempt_check_resched() + +#endif /* CONFIG_CONTEXT_TRACKING */ + #else /* !CONFIG_PREEMPT */ #define preempt_check_resched() do { } while (0) +#define preempt_check_resched_context() do { } while (0) #endif /* CONFIG_PREEMPT */ @@ -88,7 +104,7 @@ do { \ do { \ preempt_enable_no_resched_notrace(); \ barrier(); \ - preempt_check_resched(); \ + preempt_check_resched_context(); \ } while (0) #else /* !CONFIG_PREEMPT_COUNT */ diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 65349f07b878..66677003e223 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -71,6 +71,46 @@ void user_enter(void) local_irq_restore(flags); } +#ifdef CONFIG_PREEMPT +/** + * preempt_schedule_context - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. + * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. + */ +void __sched notrace preempt_schedule_context(void) +{ + struct thread_info *ti = current_thread_info(); + enum ctx_state prev_ctx; + + if (likely(ti->preempt_count || irqs_disabled())) + return; + + /* + * Need to disable preemption in case user_exit() is traced + * and the tracer calls preempt_enable_notrace() causing + * an infinite recursion. + */ + preempt_disable_notrace(); + prev_ctx = exception_enter(); + preempt_enable_no_resched_notrace(); + + preempt_schedule(); + + preempt_disable_notrace(); + exception_exit(prev_ctx); + preempt_enable_notrace(); +} +EXPORT_SYMBOL_GPL(preempt_schedule_context); +#endif /* CONFIG_PREEMPT */ /** * user_exit - Inform the context tracking that the CPU is -- cgit v1.2.3 From 7995bd287134f6c8f80d94bebe7396f05a9bc42b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 20 Jun 2013 18:58:36 +0400 Subject: splice: don't pass the address of ->f_pos to methods Signed-off-by: Al Viro --- fs/internal.h | 6 ++++++ fs/read_write.c | 24 ++++++++++++++++-------- fs/splice.c | 31 ++++++++++++++++++------------- include/linux/fs.h | 2 -- include/linux/splice.h | 1 + 5 files changed, 41 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/internal.h b/fs/internal.h index eaa75f75b625..68121584ae37 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -131,6 +131,12 @@ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); */ extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); +/* + * splice.c + */ +extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags); + /* * pipe.c */ diff --git a/fs/read_write.c b/fs/read_write.c index 03430008704e..2cefa417be34 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1064,6 +1064,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, struct fd in, out; struct inode *in_inode, *out_inode; loff_t pos; + loff_t out_pos; ssize_t retval; int fl; @@ -1077,12 +1078,14 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (!(in.file->f_mode & FMODE_READ)) goto fput_in; retval = -ESPIPE; - if (!ppos) - ppos = &in.file->f_pos; - else + if (!ppos) { + pos = in.file->f_pos; + } else { + pos = *ppos; if (!(in.file->f_mode & FMODE_PREAD)) goto fput_in; - retval = rw_verify_area(READ, in.file, ppos, count); + } + retval = rw_verify_area(READ, in.file, &pos, count); if (retval < 0) goto fput_in; count = retval; @@ -1099,7 +1102,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, retval = -EINVAL; in_inode = file_inode(in.file); out_inode = file_inode(out.file); - retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count); + out_pos = out.file->f_pos; + retval = rw_verify_area(WRITE, out.file, &out_pos, count); if (retval < 0) goto fput_out; count = retval; @@ -1107,7 +1111,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); - pos = *ppos; if (unlikely(pos + count > max)) { retval = -EOVERFLOW; if (pos >= max) @@ -1126,18 +1129,23 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (in.file->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif - retval = do_splice_direct(in.file, ppos, out.file, count, fl); + retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); if (retval > 0) { add_rchar(current, retval); add_wchar(current, retval); fsnotify_access(in.file); fsnotify_modify(out.file); + out.file->f_pos = out_pos; + if (ppos) + *ppos = pos; + else + in.file->f_pos = pos; } inc_syscr(current); inc_syscw(current); - if (*ppos > max) + if (pos > max) retval = -EOVERFLOW; fput_out: diff --git a/fs/splice.c b/fs/splice.c index e6b25598c8c4..9eca476227d5 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1274,7 +1274,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, { struct file *file = sd->u.file; - return do_splice_from(pipe, file, &file->f_pos, sd->total_len, + return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); } @@ -1294,7 +1294,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, * */ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - size_t len, unsigned int flags) + loff_t *opos, size_t len, unsigned int flags) { struct splice_desc sd = { .len = len, @@ -1302,6 +1302,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, .flags = flags, .pos = *ppos, .u.file = out, + .opos = opos, }; long ret; @@ -1325,7 +1326,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; - loff_t offset, *off; + loff_t offset; long ret; ipipe = get_pipe_info(in); @@ -1356,13 +1357,15 @@ static long do_splice(struct file *in, loff_t __user *off_in, return -EINVAL; if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; - off = &offset; - } else - off = &out->f_pos; + } else { + offset = out->f_pos; + } - ret = do_splice_from(ipipe, out, off, len, flags); + ret = do_splice_from(ipipe, out, &offset, len, flags); - if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) + if (!off_out) + out->f_pos = offset; + else if (copy_to_user(off_out, &offset, sizeof(loff_t))) ret = -EFAULT; return ret; @@ -1376,13 +1379,15 @@ static long do_splice(struct file *in, loff_t __user *off_in, return -EINVAL; if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; - off = &offset; - } else - off = &in->f_pos; + } else { + offset = in->f_pos; + } - ret = do_splice_to(in, off, opipe, len, flags); + ret = do_splice_to(in, &offset, opipe, len, flags); - if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) + if (!off_in) + in->f_pos = offset; + else if (copy_to_user(off_in, &offset, sizeof(loff_t))) ret = -EFAULT; return ret; diff --git a/include/linux/fs.h b/include/linux/fs.h index 43db02e9c9fa..65c2be22b601 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2414,8 +2414,6 @@ extern ssize_t generic_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, loff_t *, size_t len, unsigned int flags); -extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - size_t len, unsigned int flags); extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); diff --git a/include/linux/splice.h b/include/linux/splice.h index 09a545a7dfa3..74575cbf2d6f 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -35,6 +35,7 @@ struct splice_desc { void *data; /* cookie */ } u; loff_t pos; /* file position */ + loff_t *opos; /* sendfile: output position */ size_t num_spliced; /* number of bytes already spliced */ bool need_wakeup; /* need to wake up writer */ }; -- cgit v1.2.3