From 4a47cbae04844f0c5e2365aa6c217b61850bb832 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Feb 2020 14:44:38 +0100 Subject: dma-direct: improve swiotlb error reporting Untangle the way how dma_direct_map_page calls into swiotlb to be able to properly report errors where the swiotlb DMA address overflows the mask separately from overflows in the !swiotlb case. This means that siotlb_map now has to do a little more work that duplicates dma_direct_map_page, but doing so greatly simplifies the calling convention. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index cde3dc18e21a..046bb94bd4d6 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -64,6 +64,9 @@ extern void swiotlb_tbl_sync_single(struct device *hwdev, size_t size, enum dma_data_direction dir, enum dma_sync_target target); +dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys, + size_t size, enum dma_data_direction dir, unsigned long attrs); + #ifdef CONFIG_SWIOTLB extern enum swiotlb_force swiotlb_force; extern phys_addr_t io_tlb_start, io_tlb_end; @@ -73,8 +76,6 @@ static inline bool is_swiotlb_buffer(phys_addr_t paddr) return paddr >= io_tlb_start && paddr < io_tlb_end; } -bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, - size_t size, enum dma_data_direction dir, unsigned long attrs); void __init swiotlb_exit(void); unsigned int swiotlb_max_segment(void); size_t swiotlb_max_mapping_size(struct device *dev); @@ -85,12 +86,6 @@ static inline bool is_swiotlb_buffer(phys_addr_t paddr) { return false; } -static inline bool swiotlb_map(struct device *dev, phys_addr_t *phys, - dma_addr_t *dma_addr, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - return false; -} static inline void swiotlb_exit(void) { } -- cgit v1.2.3 From e7598fac323aad0e502415edeffd567315994dd6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 10 Feb 2020 10:36:56 +0100 Subject: iommu/vt-d: Fix compile warning from intel-svm.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The intel_svm_is_pasid_valid() needs to be marked inline, otherwise it causes the compile warning below: CC [M] drivers/dma/idxd/cdev.o In file included from drivers/dma/idxd/cdev.c:9:0: ./include/linux/intel-svm.h:125:12: warning: ‘intel_svm_is_pasid_valid’ defined but not used [-Wunused-function] static int intel_svm_is_pasid_valid(struct device *dev, int pasid) ^~~~~~~~~~~~~~~~~~~~~~~~ Reported-by: Borislav Petkov Fixes: 15060aba71711 ('iommu/vt-d: Helper function to query if a pasid has any active users') Signed-off-by: Joerg Roedel --- include/linux/intel-svm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h index 94f047a8a845..d7c403d0dd27 100644 --- a/include/linux/intel-svm.h +++ b/include/linux/intel-svm.h @@ -122,7 +122,7 @@ static inline int intel_svm_unbind_mm(struct device *dev, int pasid) BUG(); } -static int intel_svm_is_pasid_valid(struct device *dev, int pasid) +static inline int intel_svm_is_pasid_valid(struct device *dev, int pasid) { return -EINVAL; } -- cgit v1.2.3 From 73f8bda9b5dc1c69df2bc55c0cbb24461a6391a9 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 3 Feb 2020 16:38:28 +0100 Subject: USB: core: add endpoint-blacklist quirk Add a new device quirk that can be used to blacklist endpoints. Since commit 3e4f8e21c4f2 ("USB: core: fix check for duplicate endpoints") USB core ignores any duplicate endpoints found during descriptor parsing. In order to handle devices where the first interfaces with duplicate endpoints are the ones that should have their endpoints ignored, we need to add a blacklist. Tested-by: edes Cc: stable Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20200203153830.26394-2-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 11 +++++++++++ drivers/usb/core/quirks.c | 32 ++++++++++++++++++++++++++++++++ drivers/usb/core/usb.h | 3 +++ include/linux/usb/quirks.h | 3 +++ 4 files changed, 49 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index 26bc05e48d8a..7df22bcefa9d 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -256,6 +256,7 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, struct usb_host_interface *ifp, int num_ep, unsigned char *buffer, int size) { + struct usb_device *udev = to_usb_device(ddev); unsigned char *buffer0 = buffer; struct usb_endpoint_descriptor *d; struct usb_host_endpoint *endpoint; @@ -297,6 +298,16 @@ static int usb_parse_endpoint(struct device *ddev, int cfgno, goto skip_to_next_endpoint_or_interface_descriptor; } + /* Ignore blacklisted endpoints */ + if (udev->quirks & USB_QUIRK_ENDPOINT_BLACKLIST) { + if (usb_endpoint_is_blacklisted(udev, ifp, d)) { + dev_warn(ddev, "config %d interface %d altsetting %d has a blacklisted endpoint with address 0x%X, skipping\n", + cfgno, inum, asnum, + d->bEndpointAddress); + goto skip_to_next_endpoint_or_interface_descriptor; + } + } + endpoint = &ifp->endpoint[ifp->desc.bNumEndpoints]; ++ifp->desc.bNumEndpoints; diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 6b6413073584..56c8dffaf5f5 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -472,6 +472,38 @@ static const struct usb_device_id usb_amd_resume_quirk_list[] = { { } /* terminating entry must be last */ }; +/* + * Entries for blacklisted endpoints that should be ignored when parsing + * configuration descriptors. + * + * Matched for devices with USB_QUIRK_ENDPOINT_BLACKLIST. + */ +static const struct usb_device_id usb_endpoint_blacklist[] = { + { } +}; + +bool usb_endpoint_is_blacklisted(struct usb_device *udev, + struct usb_host_interface *intf, + struct usb_endpoint_descriptor *epd) +{ + const struct usb_device_id *id; + unsigned int address; + + for (id = usb_endpoint_blacklist; id->match_flags; ++id) { + if (!usb_match_device(udev, id)) + continue; + + if (!usb_match_one_id_intf(udev, intf, id)) + continue; + + address = id->driver_info; + if (address == epd->bEndpointAddress) + return true; + } + + return false; +} + static bool usb_match_any_interface(struct usb_device *udev, const struct usb_device_id *id) { diff --git a/drivers/usb/core/usb.h b/drivers/usb/core/usb.h index cf4783cf661a..3ad0ee57e859 100644 --- a/drivers/usb/core/usb.h +++ b/drivers/usb/core/usb.h @@ -37,6 +37,9 @@ extern void usb_authorize_interface(struct usb_interface *); extern void usb_detect_quirks(struct usb_device *udev); extern void usb_detect_interface_quirks(struct usb_device *udev); extern void usb_release_quirk_list(void); +extern bool usb_endpoint_is_blacklisted(struct usb_device *udev, + struct usb_host_interface *intf, + struct usb_endpoint_descriptor *epd); extern int usb_remove_device(struct usb_device *udev); extern int usb_get_device_descriptor(struct usb_device *dev, diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h index a1be64c9940f..22c1f579afe3 100644 --- a/include/linux/usb/quirks.h +++ b/include/linux/usb/quirks.h @@ -69,4 +69,7 @@ /* Hub needs extra delay after resetting its port. */ #define USB_QUIRK_HUB_SLOW_RESET BIT(14) +/* device has blacklisted endpoints */ +#define USB_QUIRK_ENDPOINT_BLACKLIST BIT(15) + #endif /* __LINUX_USB_QUIRKS_H */ -- cgit v1.2.3 From 0c5aae59270fb1f827acce182786094c9ccf598e Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 10 Feb 2020 15:57:30 +0100 Subject: serdev: ttyport: restore client ops on deregistration The serdev tty-port controller driver should reset the tty-port client operations also on deregistration to avoid a NULL-pointer dereference in case the port is later re-registered as a normal tty device. Note that this can only happen with tty drivers such as 8250 which have statically allocated port structures that can end up being reused and where a later registration would not register a serdev controller (e.g. due to registration errors or if the devicetree has been changed in between). Specifically, this can be an issue for any statically defined ports that would be registered by 8250 core when an 8250 driver is being unbound. Fixes: bed35c6dfa6a ("serdev: add a tty port controller driver") Cc: stable # 4.11 Reported-by: Loic Poulain Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20200210145730.22762-1-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serdev/serdev-ttyport.c | 6 ++---- drivers/tty/tty_port.c | 5 +++-- include/linux/tty.h | 2 ++ 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serdev/serdev-ttyport.c b/drivers/tty/serdev/serdev-ttyport.c index d1cdd2ab8b4c..d367803e2044 100644 --- a/drivers/tty/serdev/serdev-ttyport.c +++ b/drivers/tty/serdev/serdev-ttyport.c @@ -265,7 +265,6 @@ struct device *serdev_tty_port_register(struct tty_port *port, struct device *parent, struct tty_driver *drv, int idx) { - const struct tty_port_client_operations *old_ops; struct serdev_controller *ctrl; struct serport *serport; int ret; @@ -284,7 +283,6 @@ struct device *serdev_tty_port_register(struct tty_port *port, ctrl->ops = &ctrl_ops; - old_ops = port->client_ops; port->client_ops = &client_ops; port->client_data = ctrl; @@ -297,7 +295,7 @@ struct device *serdev_tty_port_register(struct tty_port *port, err_reset_data: port->client_data = NULL; - port->client_ops = old_ops; + port->client_ops = &tty_port_default_client_ops; serdev_controller_put(ctrl); return ERR_PTR(ret); @@ -312,8 +310,8 @@ int serdev_tty_port_unregister(struct tty_port *port) return -ENODEV; serdev_controller_remove(ctrl); - port->client_ops = NULL; port->client_data = NULL; + port->client_ops = &tty_port_default_client_ops; serdev_controller_put(ctrl); return 0; diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c index 044c3cbdcfa4..ea80bf872f54 100644 --- a/drivers/tty/tty_port.c +++ b/drivers/tty/tty_port.c @@ -52,10 +52,11 @@ static void tty_port_default_wakeup(struct tty_port *port) } } -static const struct tty_port_client_operations default_client_ops = { +const struct tty_port_client_operations tty_port_default_client_ops = { .receive_buf = tty_port_default_receive_buf, .write_wakeup = tty_port_default_wakeup, }; +EXPORT_SYMBOL_GPL(tty_port_default_client_ops); void tty_port_init(struct tty_port *port) { @@ -68,7 +69,7 @@ void tty_port_init(struct tty_port *port) spin_lock_init(&port->lock); port->close_delay = (50 * HZ) / 100; port->closing_wait = (3000 * HZ) / 100; - port->client_ops = &default_client_ops; + port->client_ops = &tty_port_default_client_ops; kref_init(&port->kref); } EXPORT_SYMBOL(tty_port_init); diff --git a/include/linux/tty.h b/include/linux/tty.h index bfa4e2ee94a9..bd5fe0e907e8 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -225,6 +225,8 @@ struct tty_port_client_operations { void (*write_wakeup)(struct tty_port *port); }; +extern const struct tty_port_client_operations tty_port_default_client_ops; + struct tty_port { struct tty_bufhead buf; /* Locked internally */ struct tty_struct *tty; /* Back pointer */ -- cgit v1.2.3 From 84a4062632462c4320704fcdf8e99e89e94c0aba Mon Sep 17 00:00:00 2001 From: Johan Korsnes Date: Fri, 17 Jan 2020 13:08:36 +0100 Subject: HID: core: increase HID report buffer size to 8KiB We have a HID touch device that reports its opens and shorts test results in HID buffers of size 8184 bytes. The maximum size of the HID buffer is currently set to 4096 bytes, causing probe of this device to fail. With this patch we increase the maximum size of the HID buffer to 8192 bytes, making device probe and acquisition of said buffers succeed. Signed-off-by: Johan Korsnes Cc: Alan Stern Cc: Armando Visconti Cc: Jiri Kosina Signed-off-by: Jiri Kosina --- include/linux/hid.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hid.h b/include/linux/hid.h index cd41f209043f..875f71132b14 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -492,7 +492,7 @@ struct hid_report_enum { }; #define HID_MIN_BUFFER_SIZE 64 /* make sure there is at least a packet size of space */ -#define HID_MAX_BUFFER_SIZE 4096 /* 4kb */ +#define HID_MAX_BUFFER_SIZE 8192 /* 8kb */ #define HID_CONTROL_FIFO_SIZE 256 /* to init devices with >100 reports */ #define HID_OUTPUT_FIFO_SIZE 64 -- cgit v1.2.3 From 7151affeef8d527f50b4b68a871fd28bd660023f Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sat, 15 Feb 2020 10:50:21 +0000 Subject: net: export netdev_next_lower_dev_rcu() netdev_next_lower_dev_rcu() will be used to implement a function, which is to walk all lower interfaces. There are already functions that they walk their lower interface. (netdev_walk_all_lower_dev_rcu, netdev_walk_all_lower_dev()). But, there would be cases that couldn't be covered by given netdev_walk_all_lower_dev_{rcu}() function. So, some modules would want to implement own function, which is to walk all lower interfaces. In the next patch, netdev_next_lower_dev_rcu() will be used. In addition, this patch removes two unused prototypes in netdevice.h. Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++---- net/core/dev.c | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9f1f633235f6..6c3f7032e8d9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -72,6 +72,8 @@ void netdev_set_default_ethtool_ops(struct net_device *dev, #define NET_RX_SUCCESS 0 /* keep 'em coming, baby */ #define NET_RX_DROP 1 /* packet dropped */ +#define MAX_NEST_DEV 8 + /* * Transmit return codes: transmit return codes originate from three different * namespaces: @@ -4389,11 +4391,8 @@ void *netdev_lower_get_next(struct net_device *dev, ldev; \ ldev = netdev_lower_get_next(dev, &(iter))) -struct net_device *netdev_all_lower_get_next(struct net_device *dev, +struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, struct list_head **iter); -struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev, - struct list_head **iter); - int netdev_walk_all_lower_dev(struct net_device *dev, int (*fn)(struct net_device *lower_dev, void *data), diff --git a/net/core/dev.c b/net/core/dev.c index b6d13f3f1e5a..2577ebfed293 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -146,7 +146,6 @@ #include "net-sysfs.h" #define MAX_GRO_SKBS 8 -#define MAX_NEST_DEV 8 /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) @@ -7207,8 +7206,8 @@ static int __netdev_walk_all_lower_dev(struct net_device *dev, return 0; } -static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, - struct list_head **iter) +struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, + struct list_head **iter) { struct netdev_adjacent *lower; @@ -7220,6 +7219,7 @@ static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev, return lower->dev; } +EXPORT_SYMBOL(netdev_next_lower_dev_rcu); static u8 __netdev_upper_depth(struct net_device *dev) { -- cgit v1.2.3 From d2f273f0a9205257b91af1d3d461ee29688c2f24 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 15 Feb 2020 15:34:07 -0800 Subject: skbuff.h: fix all kernel-doc warnings Fix all kernel-doc warnings in . Fixes these warnings: ../include/linux/skbuff.h:890: warning: Function parameter or member 'list' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'dev_scratch' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'ip_defrag_offset' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'skb_mstamp_ns' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member '__cloned_offset' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'head_frag' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member '__pkt_type_offset' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'encapsulation' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'encap_hdr_csum' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'csum_valid' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member '__pkt_vlan_present_offset' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'vlan_present' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'csum_complete_sw' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'csum_level' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'inner_protocol_type' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'remcsum_offload' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'sender_cpu' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'reserved_tailroom' not described in 'sk_buff' ../include/linux/skbuff.h:890: warning: Function parameter or member 'inner_ipproto' not described in 'sk_buff' Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- include/linux/skbuff.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ca8806b69388..5b50278c4bc8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -611,9 +611,15 @@ typedef unsigned char *sk_buff_data_t; * @next: Next buffer in list * @prev: Previous buffer in list * @tstamp: Time we arrived/left + * @skb_mstamp_ns: (aka @tstamp) earliest departure time; start point + * for retransmit timer * @rbnode: RB tree node, alternative to next/prev for netem/tcp + * @list: queue head * @sk: Socket we are owned by + * @ip_defrag_offset: (aka @sk) alternate use of @sk, used in + * fragmentation management * @dev: Device we arrived on/are leaving by + * @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL * @cb: Control buffer. Free for use by every layer. Put private vars here * @_skb_refdst: destination entry (with norefcount bit) * @sp: the security path, used for xfrm @@ -632,6 +638,9 @@ typedef unsigned char *sk_buff_data_t; * @pkt_type: Packet class * @fclone: skbuff clone status * @ipvs_property: skbuff is owned by ipvs + * @inner_protocol_type: whether the inner protocol is + * ENCAP_TYPE_ETHER or ENCAP_TYPE_IPPROTO + * @remcsum_offload: remote checksum offload is enabled * @offload_fwd_mark: Packet was L2-forwarded in hardware * @offload_l3_fwd_mark: Packet was L3-forwarded in hardware * @tc_skip_classify: do not classify packet. set by IFB device @@ -650,6 +659,8 @@ typedef unsigned char *sk_buff_data_t; * @tc_index: Traffic control index * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices + * @head_frag: skb was allocated from page fragments, + * not allocated by kmalloc() or vmalloc(). * @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves * @active_extensions: active extensions (skb_ext_id types) * @ndisc_nodetype: router type (from link layer) @@ -660,15 +671,28 @@ typedef unsigned char *sk_buff_data_t; * @wifi_acked_valid: wifi_acked was set * @wifi_acked: whether frame was acked on wifi or not * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS + * @encapsulation: indicates the inner headers in the skbuff are valid + * @encap_hdr_csum: software checksum is needed + * @csum_valid: checksum is already valid * @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL + * @csum_complete_sw: checksum was completed by software + * @csum_level: indicates the number of consecutive checksums found in + * the packet minus one that have been verified as + * CHECKSUM_UNNECESSARY (max 3) * @dst_pending_confirm: need to confirm neighbour * @decrypted: Decrypted SKB * @napi_id: id of the NAPI struct this skb came from + * @sender_cpu: (aka @napi_id) source CPU in XPS * @secmark: security marking * @mark: Generic packet mark + * @reserved_tailroom: (aka @mark) number of bytes of free space available + * at the tail of an sk_buff + * @vlan_present: VLAN tag is present * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information * @inner_protocol: Protocol (encapsulation) + * @inner_ipproto: (aka @inner_protocol) stores ipproto when + * skb->inner_protocol_type == ENCAP_TYPE_IPPROTO; * @inner_transport_header: Inner transport layer header (encapsulation) * @inner_network_header: Network layer header (encapsulation) * @inner_mac_header: Link layer header (encapsulation) @@ -750,7 +774,9 @@ struct sk_buff { #endif #define CLONED_OFFSET() offsetof(struct sk_buff, __cloned_offset) + /* private: */ __u8 __cloned_offset[0]; + /* public: */ __u8 cloned:1, nohdr:1, fclone:2, @@ -775,7 +801,9 @@ struct sk_buff { #endif #define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset) + /* private: */ __u8 __pkt_type_offset[0]; + /* public: */ __u8 pkt_type:3; __u8 ignore_df:1; __u8 nf_trace:1; @@ -798,7 +826,9 @@ struct sk_buff { #define PKT_VLAN_PRESENT_BIT 0 #endif #define PKT_VLAN_PRESENT_OFFSET() offsetof(struct sk_buff, __pkt_vlan_present_offset) + /* private: */ __u8 __pkt_vlan_present_offset[0]; + /* public: */ __u8 vlan_present:1; __u8 csum_complete_sw:1; __u8 csum_level:2; -- cgit v1.2.3 From 6a757c07e51f80ac34325fcd558490d2d1439e1b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Feb 2020 17:37:07 +0100 Subject: netfilter: conntrack: allow insertion of clashing entries This patch further relaxes the need to drop an skb due to a clash with an existing conntrack entry. Current clash resolution handles the case where the clash occurs between two identical entries (distinct nf_conn objects with same tuples), i.e.: Original Reply existing: 10.2.3.4:42 -> 10.8.8.8:53 10.2.3.4:42 <- 10.0.0.6:5353 clashing: 10.2.3.4:42 -> 10.8.8.8:53 10.2.3.4:42 <- 10.0.0.6:5353 ... existing handling will discard the unconfirmed clashing entry and makes skb->_nfct point to the existing one. The skb can then be processed normally just as if the clash would not have existed in the first place. For other clashes, the skb needs to be dropped. This frequently happens with DNS resolvers that send A and AAAA queries back-to-back when NAT rules are present that cause packets to get different DNAT transformations applied, for example: -m statistics --mode random ... -j DNAT --dnat-to 10.0.0.6:5353 -m statistics --mode random ... -j DNAT --dnat-to 10.0.0.7:5353 In this case the A or AAAA query is dropped which incurs a costly delay during name resolution. This patch also allows this collision type: Original Reply existing: 10.2.3.4:42 -> 10.8.8.8:53 10.2.3.4:42 <- 10.0.0.6:5353 clashing: 10.2.3.4:42 -> 10.8.8.8:53 10.2.3.4:42 <- 10.0.0.7:5353 In this case, clash is in original direction -- the reply direction is still unique. The change makes it so that when the 2nd colliding packet is received, the clashing conntrack is tagged with new IPS_NAT_CLASH_BIT, gets a fixed 1 second timeout and is inserted in the reply direction only. The entry is hidden from 'conntrack -L', it will time out quickly and it can be early dropped because it will never progress to the ASSURED state. To avoid special-casing the delete code path to special case the ORIGINAL hlist_nulls node, a new helper, "hlist_nulls_add_fake", is added so hlist_nulls_del() will work. Example: CPU A: CPU B: 1. 10.2.3.4:42 -> 10.8.8.8:53 (A) 2. 10.2.3.4:42 -> 10.8.8.8:53 (AAAA) 3. Apply DNAT, reply changed to 10.0.0.6 4. 10.2.3.4:42 -> 10.8.8.8:53 (AAAA) 5. Apply DNAT, reply changed to 10.0.0.7 6. confirm/commit to conntrack table, no collisions 7. commit clashing entry Reply comes in: 10.2.3.4:42 <- 10.0.0.6:5353 (A) -> Finds a conntrack, DNAT is reversed & packet forwarded to 10.2.3.4:42 10.2.3.4:42 <- 10.0.0.7:5353 (AAAA) -> Finds a conntrack, DNAT is reversed & packet forwarded to 10.2.3.4:42 The conntrack entry is deleted from table, as it has the NAT_CLASH bit set. In case of a retransmit from ORIGINAL dir, all further packets will get the DNAT transformation to 10.0.0.6. I tried to come up with other solutions but they all have worse problems. Alternatives considered were: 1. Confirm ct entries at allocation time, not in postrouting. a. will cause uneccesarry work when the skb that creates the conntrack is dropped by ruleset. b. in case nat is applied, ct entry would need to be moved in the table, which requires another spinlock pair to be taken. c. breaks the 'unconfirmed entry is private to cpu' assumption: we would need to guard all nfct->ext allocation requests with ct->lock spinlock. 2. Make the unconfirmed list a hash table instead of a pcpu list. Shares drawback c) of the first alternative. 3. Document this is expected and force users to rearrange their ruleset (e.g. by using "-m cluster" instead of "-m statistics"). nft has the 'jhash' expression which can be used instead of 'numgen'. Major drawback: doesn't fix what I consider a bug, not very realistic and I believe its reasonable to have the existing rulesets to 'just work'. 4. Document this is expected and force users to steer problematic packets to the same CPU -- this would serialize the "allocate new conntrack entry/nat table evaluation/perform nat/confirm entry", so no race can occur. Similar drawback to 3. Another advantage of this patch compared to 1) and 2) is that there are no changes to the hot path; things are handled in the udp tracker and the clash resolution path. Cc: rcu@vger.kernel.org Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Jozsef Kadlecsik Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/rculist_nulls.h | 7 ++ include/uapi/linux/netfilter/nf_conntrack_common.h | 12 +++- net/netfilter/nf_conntrack_core.c | 76 +++++++++++++++++++++- net/netfilter/nf_conntrack_proto_udp.c | 20 ++++-- 4 files changed, 108 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index e5b752027a03..9670b54b484a 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -145,6 +145,13 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, } } +/* after that hlist_nulls_del will work */ +static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n) +{ + n->pprev = &n->next; + n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL); +} + /** * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 336014bf8868..b6f0bb1dc799 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -97,6 +97,15 @@ enum ip_conntrack_status { IPS_UNTRACKED_BIT = 12, IPS_UNTRACKED = (1 << IPS_UNTRACKED_BIT), +#ifdef __KERNEL__ + /* Re-purposed for in-kernel use: + * Tags a conntrack entry that clashed with an existing entry + * on insert. + */ + IPS_NAT_CLASH_BIT = IPS_UNTRACKED_BIT, + IPS_NAT_CLASH = IPS_UNTRACKED, +#endif + /* Conntrack got a helper explicitly attached via CT target. */ IPS_HELPER_BIT = 13, IPS_HELPER = (1 << IPS_HELPER_BIT), @@ -110,7 +119,8 @@ enum ip_conntrack_status { */ IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK | IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING | - IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_OFFLOAD), + IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_UNTRACKED | + IPS_OFFLOAD), __IPS_MAX_BIT = 15, }; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 3f069eb0f0fc..1927fc296f95 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -940,11 +940,71 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb, return NF_DROP; } +/** + * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry + * + * @skb: skb that causes the collision + * @repl_idx: hash slot for reply direction + * + * Called when origin or reply direction had a clash. + * The skb can be handled without packet drop provided the reply direction + * is unique or there the existing entry has the identical tuple in both + * directions. + * + * Caller must hold conntrack table locks to prevent concurrent updates. + * + * Returns NF_DROP if the clash could not be handled. + */ +static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) +{ + struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb); + const struct nf_conntrack_zone *zone; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; + struct net *net; + + zone = nf_ct_zone(loser_ct); + net = nf_ct_net(loser_ct); + + /* Reply direction must never result in a clash, unless both origin + * and reply tuples are identical. + */ + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) { + if (nf_ct_key_equal(h, + &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple, + zone, net)) + return __nf_ct_resolve_clash(skb, h); + } + + /* We want the clashing entry to go away real soon: 1 second timeout. */ + loser_ct->timeout = nfct_time_stamp + HZ; + + /* IPS_NAT_CLASH removes the entry automatically on the first + * reply. Also prevents UDP tracker from moving the entry to + * ASSURED state, i.e. the entry can always be evicted under + * pressure. + */ + loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH; + + __nf_conntrack_insert_prepare(loser_ct); + + /* fake add for ORIGINAL dir: we want lookups to only find the entry + * already in the table. This also hides the clashing entry from + * ctnetlink iteration, i.e. conntrack -L won't show them. + */ + hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); + + hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode, + &nf_conntrack_hash[repl_idx]); + return NF_ACCEPT; +} + /** * nf_ct_resolve_clash - attempt to handle clash without packet drop * * @skb: skb that causes the clash * @h: tuplehash of the clashing entry already in table + * @hash_reply: hash slot for reply direction * * A conntrack entry can be inserted to the connection tracking table * if there is no existing entry with an identical tuple. @@ -963,10 +1023,18 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb, * exactly the same, only the to-be-confirmed conntrack entry is discarded * and @skb is associated with the conntrack entry already in the table. * + * Failing that, the new, unconfirmed conntrack is still added to the table + * provided that the collision only occurs in the ORIGINAL direction. + * The new entry will be added after the existing one in the hash list, + * so packets in the ORIGINAL direction will continue to match the existing + * entry. The new entry will also have a fixed timeout so it expires -- + * due to the collision, it will not see bidirectional traffic. + * * Returns NF_DROP if the clash could not be resolved. */ static __cold noinline int -nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h) +nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h, + u32 reply_hash) { /* This is the conntrack entry already in hashes that won race. */ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); @@ -987,6 +1055,10 @@ nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h) if (ret == NF_ACCEPT) return ret; + ret = nf_ct_resolve_clash_harder(skb, reply_hash); + if (ret == NF_ACCEPT) + return ret; + drop: nf_ct_add_to_dying_list(loser_ct); NF_CT_STAT_INC(net, drop); @@ -1101,7 +1173,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) return NF_ACCEPT; out: - ret = nf_ct_resolve_clash(skb, h); + ret = nf_ct_resolve_clash(skb, h, reply_hash); dying: nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 7365b43f8f98..760ca2422816 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -81,6 +81,18 @@ static bool udp_error(struct sk_buff *skb, return false; } +static void nf_conntrack_udp_refresh_unreplied(struct nf_conn *ct, + struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + u32 extra_jiffies) +{ + if (unlikely(ctinfo == IP_CT_ESTABLISHED_REPLY && + ct->status & IPS_NAT_CLASH)) + nf_ct_kill(ct); + else + nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies); +} + /* Returns verdict for packet, and may modify conntracktype */ int nf_conntrack_udp_packet(struct nf_conn *ct, struct sk_buff *skb, @@ -116,8 +128,8 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { - nf_ct_refresh_acct(ct, ctinfo, skb, - timeouts[UDP_CT_UNREPLIED]); + nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo, + timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } @@ -198,8 +210,8 @@ int nf_conntrack_udplite_packet(struct nf_conn *ct, if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { - nf_ct_refresh_acct(ct, ctinfo, skb, - timeouts[UDP_CT_UNREPLIED]); + nf_conntrack_udp_refresh_unreplied(ct, skb, ctinfo, + timeouts[UDP_CT_UNREPLIED]); } return NF_ACCEPT; } -- cgit v1.2.3 From 13a7e459a41a56d788ab33d825c6205379bbb711 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Tue, 14 Jan 2020 09:27:27 +0200 Subject: net/mlx5: DR, Handle reformat capability over sw-steering tables On flow table creation, send the relevant flags according to what the FW currently supports. When FW doesn't support reformat option over SW-steering managed table, the driver shouldn't pass this. Fixes: 988fd6b32d07 ("net/mlx5: DR, Pass table flags at creation to lower layer") Signed-off-by: Erez Shitrit Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c | 9 +++++++-- include/linux/mlx5/mlx5_ifc.h | 5 ++++- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c index 3abfc8125926..c2027192e21e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/fs_dr.c @@ -66,15 +66,20 @@ static int mlx5_cmd_dr_create_flow_table(struct mlx5_flow_root_namespace *ns, struct mlx5_flow_table *next_ft) { struct mlx5dr_table *tbl; + u32 flags; int err; if (mlx5_dr_is_fw_table(ft->flags)) return mlx5_fs_cmd_get_fw_cmds()->create_flow_table(ns, ft, log_size, next_ft); + flags = ft->flags; + /* turn off encap/decap if not supported for sw-str by fw */ + if (!MLX5_CAP_FLOWTABLE(ns->dev, sw_owner_reformat_supported)) + flags = ft->flags & ~(MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT | + MLX5_FLOW_TABLE_TUNNEL_EN_DECAP); - tbl = mlx5dr_table_create(ns->fs_dr_domain.dr_domain, - ft->level, ft->flags); + tbl = mlx5dr_table_create(ns->fs_dr_domain.dr_domain, ft->level, flags); if (!tbl) { mlx5_core_err(ns->dev, "Failed creating dr flow_table\n"); return -EINVAL; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ff8c9d527bb4..bfdf41537cf1 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -688,7 +688,10 @@ struct mlx5_ifc_flow_table_nic_cap_bits { u8 nic_rx_multi_path_tirs[0x1]; u8 nic_rx_multi_path_tirs_fts[0x1]; u8 allow_sniffer_and_nic_rx_shared_tir[0x1]; - u8 reserved_at_3[0x1d]; + u8 reserved_at_3[0x4]; + u8 sw_owner_reformat_supported[0x1]; + u8 reserved_at_8[0x18]; + u8 encap_general_header[0x1]; u8 reserved_at_21[0xa]; u8 log_max_packet_reformat_context[0x5]; -- cgit v1.2.3 From 85c46b78da58398be1c5166f55063c0512decd39 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 20 Feb 2020 21:18:42 +0900 Subject: bootconfig: Add bootconfig magic word for indicating bootconfig explicitly Add bootconfig magic word to the end of bootconfig on initrd image for indicating explicitly the bootconfig is there. Also tools/bootconfig treats wrong size or wrong checksum or parse error as an error, because if there is a bootconfig magic word, there must be a bootconfig. The bootconfig magic word is "#BOOTCONFIG\n", 12 bytes word. Thus the block image of the initrd file with bootconfig is as follows. [Initrd][bootconfig][size][csum][#BOOTCONFIG\n] Link: http://lkml.kernel.org/r/158220112263.26565.3944814205960612841.stgit@devnote2 Suggested-by: Steven Rostedt Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- Documentation/admin-guide/bootconfig.rst | 10 +++++--- include/linux/bootconfig.h | 3 +++ init/Kconfig | 2 +- init/main.c | 6 ++++- tools/bootconfig/main.c | 43 ++++++++++++++++++++++++-------- tools/bootconfig/test-bootconfig.sh | 2 +- 6 files changed, 49 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/bootconfig.rst b/Documentation/admin-guide/bootconfig.rst index b342a6796392..5e7609936507 100644 --- a/Documentation/admin-guide/bootconfig.rst +++ b/Documentation/admin-guide/bootconfig.rst @@ -102,9 +102,13 @@ Boot Kernel With a Boot Config ============================== Since the boot configuration file is loaded with initrd, it will be added -to the end of the initrd (initramfs) image file. The Linux kernel decodes -the last part of the initrd image in memory to get the boot configuration -data. +to the end of the initrd (initramfs) image file with size, checksum and +12-byte magic word as below. + +[initrd][bootconfig][size(u32)][checksum(u32)][#BOOTCONFIG\n] + +The Linux kernel decodes the last part of the initrd image in memory to +get the boot configuration data. Because of this "piggyback" method, there is no need to change or update the boot loader and the kernel image itself. diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index 7e18c939663e..d11e183fcb54 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -10,6 +10,9 @@ #include #include +#define BOOTCONFIG_MAGIC "#BOOTCONFIG\n" +#define BOOTCONFIG_MAGIC_LEN 12 + /* XBC tree node */ struct xbc_node { u16 next; diff --git a/init/Kconfig b/init/Kconfig index f586878410d2..a84e7aa89a29 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1222,7 +1222,7 @@ config BOOT_CONFIG Extra boot config allows system admin to pass a config file as complemental extension of kernel cmdline when booting. The boot config file must be attached at the end of initramfs - with checksum and size. + with checksum, size and magic word. See for details. If unsure, say Y. diff --git a/init/main.c b/init/main.c index d96cc5f65022..2fe8dec93e68 100644 --- a/init/main.c +++ b/init/main.c @@ -374,7 +374,11 @@ static void __init setup_boot_config(const char *cmdline) if (!initrd_end) goto not_found; - hdr = (u32 *)(initrd_end - 8); + data = (char *)initrd_end - BOOTCONFIG_MAGIC_LEN; + if (memcmp(data, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN)) + goto not_found; + + hdr = (u32 *)(data - 8); size = hdr[0]; csum = hdr[1]; diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index e18eeb070562..742271f019a9 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -131,15 +131,26 @@ int load_xbc_from_initrd(int fd, char **buf) struct stat stat; int ret; u32 size = 0, csum = 0, rcsum; + char magic[BOOTCONFIG_MAGIC_LEN]; ret = fstat(fd, &stat); if (ret < 0) return -errno; - if (stat.st_size < 8) + if (stat.st_size < 8 + BOOTCONFIG_MAGIC_LEN) return 0; - if (lseek(fd, -8, SEEK_END) < 0) { + if (lseek(fd, -BOOTCONFIG_MAGIC_LEN, SEEK_END) < 0) { + pr_err("Failed to lseek: %d\n", -errno); + return -errno; + } + if (read(fd, magic, BOOTCONFIG_MAGIC_LEN) < 0) + return -errno; + /* Check the bootconfig magic bytes */ + if (memcmp(magic, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN) != 0) + return 0; + + if (lseek(fd, -(8 + BOOTCONFIG_MAGIC_LEN), SEEK_END) < 0) { pr_err("Failed to lseek: %d\n", -errno); return -errno; } @@ -150,11 +161,14 @@ int load_xbc_from_initrd(int fd, char **buf) if (read(fd, &csum, sizeof(u32)) < 0) return -errno; - /* Wrong size, maybe no boot config here */ - if (stat.st_size < size + 8) - return 0; + /* Wrong size error */ + if (stat.st_size < size + 8 + BOOTCONFIG_MAGIC_LEN) { + pr_err("bootconfig size is too big\n"); + return -E2BIG; + } - if (lseek(fd, stat.st_size - 8 - size, SEEK_SET) < 0) { + if (lseek(fd, stat.st_size - (size + 8 + BOOTCONFIG_MAGIC_LEN), + SEEK_SET) < 0) { pr_err("Failed to lseek: %d\n", -errno); return -errno; } @@ -163,17 +177,17 @@ int load_xbc_from_initrd(int fd, char **buf) if (ret < 0) return ret; - /* Wrong Checksum, maybe no boot config here */ + /* Wrong Checksum */ rcsum = checksum((unsigned char *)*buf, size); if (csum != rcsum) { pr_err("checksum error: %d != %d\n", csum, rcsum); - return 0; + return -EINVAL; } ret = xbc_init(*buf); - /* Wrong data, maybe no boot config here */ + /* Wrong data */ if (ret < 0) - return 0; + return ret; return size; } @@ -226,7 +240,8 @@ int delete_xbc(const char *path) } else if (size > 0) { ret = fstat(fd, &stat); if (!ret) - ret = ftruncate(fd, stat.st_size - size - 8); + ret = ftruncate(fd, stat.st_size + - size - 8 - BOOTCONFIG_MAGIC_LEN); if (ret) ret = -errno; } /* Ignore if there is no boot config in initrd */ @@ -295,6 +310,12 @@ int apply_xbc(const char *path, const char *xbc_path) pr_err("Failed to apply a boot config: %d\n", ret); return ret; } + /* Write a magic word of the bootconfig */ + ret = write(fd, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN); + if (ret < 0) { + pr_err("Failed to apply a boot config magic: %d\n", ret); + return ret; + } close(fd); free(data); diff --git a/tools/bootconfig/test-bootconfig.sh b/tools/bootconfig/test-bootconfig.sh index 1de06de328e2..adafb7c50940 100755 --- a/tools/bootconfig/test-bootconfig.sh +++ b/tools/bootconfig/test-bootconfig.sh @@ -49,7 +49,7 @@ xpass $BOOTCONF -a $TEMPCONF $INITRD new_size=$(stat -c %s $INITRD) echo "File size check" -xpass test $new_size -eq $(expr $bconf_size + $initrd_size + 9) +xpass test $new_size -eq $(expr $bconf_size + $initrd_size + 9 + 12) echo "Apply command repeat test" xpass $BOOTCONF -a $TEMPCONF $INITRD -- cgit v1.2.3 From 2546287c5fb363a0165933ae2181c92f03e701d0 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Fri, 21 Feb 2020 10:07:25 +0800 Subject: genirq/irqdomain: Make sure all irq domain flags are distinct This was noticed when printing debugfs for MSIs on my ARM64 server. The new dstate IRQD_MSI_NOMASK_QUIRK came out surprisingly while it should only be the x86 stuff for the time being... The new MSI quirk flag uses the same bit as IRQ_DOMAIN_NAME_ALLOCATED which is oddly defined as bit 6 for no good reason. Switch it to the non used bit 1. Fixes: 6f1a4891a592 ("x86/apic/msi: Plug non-maskable MSI affinity race") Signed-off-by: Zenghui Yu Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20200221020725.2038-1-yuzenghui@huawei.com --- include/linux/irqdomain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index b2d47571ab67..8d062e86d954 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -192,7 +192,7 @@ enum { IRQ_DOMAIN_FLAG_HIERARCHY = (1 << 0), /* Irq domain name was allocated in __irq_domain_add() */ - IRQ_DOMAIN_NAME_ALLOCATED = (1 << 6), + IRQ_DOMAIN_NAME_ALLOCATED = (1 << 1), /* Irq domain is an IPI domain with virq per cpu */ IRQ_DOMAIN_FLAG_IPI_PER_CPU = (1 << 2), -- cgit v1.2.3 From 595abbaff5db121428247a2e6ab368734472e101 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 20 Feb 2020 20:03:50 -0800 Subject: y2038: remove ktime to/from timespec/timeval conversion A couple of helpers are now obsolete and can be removed, so drivers can no longer start using them and instead use y2038-safe interfaces. Link: http://lkml.kernel.org/r/20200110154232.4104492-2-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Thomas Gleixner Cc: Deepa Dinamani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ktime.h | 37 ------------------------------------- 1 file changed, 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index b2bb44f87f5a..d1fb05135665 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -66,33 +66,15 @@ static inline ktime_t ktime_set(const s64 secs, const unsigned long nsecs) */ #define ktime_sub_ns(kt, nsval) ((kt) - (nsval)) -/* convert a timespec to ktime_t format: */ -static inline ktime_t timespec_to_ktime(struct timespec ts) -{ - return ktime_set(ts.tv_sec, ts.tv_nsec); -} - /* convert a timespec64 to ktime_t format: */ static inline ktime_t timespec64_to_ktime(struct timespec64 ts) { return ktime_set(ts.tv_sec, ts.tv_nsec); } -/* convert a timeval to ktime_t format: */ -static inline ktime_t timeval_to_ktime(struct timeval tv) -{ - return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC); -} - -/* Map the ktime_t to timespec conversion to ns_to_timespec function */ -#define ktime_to_timespec(kt) ns_to_timespec((kt)) - /* Map the ktime_t to timespec conversion to ns_to_timespec function */ #define ktime_to_timespec64(kt) ns_to_timespec64((kt)) -/* Map the ktime_t to timeval conversion to ns_to_timeval function */ -#define ktime_to_timeval(kt) ns_to_timeval((kt)) - /* Convert ktime_t to nanoseconds */ static inline s64 ktime_to_ns(const ktime_t kt) { @@ -215,25 +197,6 @@ static inline ktime_t ktime_sub_ms(const ktime_t kt, const u64 msec) extern ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs); -/** - * ktime_to_timespec_cond - convert a ktime_t variable to timespec - * format only if the variable contains data - * @kt: the ktime_t variable to convert - * @ts: the timespec variable to store the result in - * - * Return: %true if there was a successful conversion, %false if kt was 0. - */ -static inline __must_check bool ktime_to_timespec_cond(const ktime_t kt, - struct timespec *ts) -{ - if (kt) { - *ts = ktime_to_timespec(kt); - return true; - } else { - return false; - } -} - /** * ktime_to_timespec64_cond - convert a ktime_t variable to timespec64 * format only if the variable contains data -- cgit v1.2.3 From 412c53a680a97cb1ae2c0ab60230e193bee86387 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 20 Feb 2020 20:03:54 -0800 Subject: y2038: remove unused time32 interfaces No users remain, so kill these off before we grow new ones. Link: http://lkml.kernel.org/r/20200110154232.4104492-3-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Thomas Gleixner Cc: Deepa Dinamani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compat.h | 29 -------- include/linux/time32.h | 154 +----------------------------------------- include/linux/timekeeping32.h | 32 --------- include/linux/types.h | 5 -- kernel/compat.c | 64 ------------------ kernel/time/time.c | 43 ------------ 6 files changed, 1 insertion(+), 326 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 11083d84eb23..df2475be134a 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -248,15 +248,6 @@ typedef struct compat_siginfo { } _sifields; } compat_siginfo_t; -/* - * These functions operate on 32- or 64-bit specs depending on - * COMPAT_USE_64BIT_TIME, hence the void user pointer arguments. - */ -extern int compat_get_timespec(struct timespec *, const void __user *); -extern int compat_put_timespec(const struct timespec *, void __user *); -extern int compat_get_timeval(struct timeval *, const void __user *); -extern int compat_put_timeval(const struct timeval *, void __user *); - struct compat_iovec { compat_uptr_t iov_base; compat_size_t iov_len; @@ -416,26 +407,6 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *to, const kernel_siginf int get_compat_sigevent(struct sigevent *event, const struct compat_sigevent __user *u_event); -static inline int old_timeval32_compare(struct old_timeval32 *lhs, - struct old_timeval32 *rhs) -{ - if (lhs->tv_sec < rhs->tv_sec) - return -1; - if (lhs->tv_sec > rhs->tv_sec) - return 1; - return lhs->tv_usec - rhs->tv_usec; -} - -static inline int old_timespec32_compare(struct old_timespec32 *lhs, - struct old_timespec32 *rhs) -{ - if (lhs->tv_sec < rhs->tv_sec) - return -1; - if (lhs->tv_sec > rhs->tv_sec) - return 1; - return lhs->tv_nsec - rhs->tv_nsec; -} - extern int get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat); /* diff --git a/include/linux/time32.h b/include/linux/time32.h index cad4c3186002..cf9320cd2d0b 100644 --- a/include/linux/time32.h +++ b/include/linux/time32.h @@ -12,8 +12,6 @@ #include #include -#define TIME_T_MAX (__kernel_old_time_t)((1UL << ((sizeof(__kernel_old_time_t) << 3) - 1)) - 1) - typedef s32 old_time32_t; struct old_timespec32 { @@ -73,162 +71,12 @@ struct __kernel_timex; int get_old_timex32(struct __kernel_timex *, const struct old_timex32 __user *); int put_old_timex32(struct old_timex32 __user *, const struct __kernel_timex *); -#if __BITS_PER_LONG == 64 - -/* timespec64 is defined as timespec here */ -static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) -{ - return *(const struct timespec *)&ts64; -} - -static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) -{ - return *(const struct timespec64 *)&ts; -} - -#else -static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) -{ - struct timespec ret; - - ret.tv_sec = (time_t)ts64.tv_sec; - ret.tv_nsec = ts64.tv_nsec; - return ret; -} - -static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) -{ - struct timespec64 ret; - - ret.tv_sec = ts.tv_sec; - ret.tv_nsec = ts.tv_nsec; - return ret; -} -#endif - -static inline int timespec_equal(const struct timespec *a, - const struct timespec *b) -{ - return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec); -} - -/* - * lhs < rhs: return <0 - * lhs == rhs: return 0 - * lhs > rhs: return >0 - */ -static inline int timespec_compare(const struct timespec *lhs, const struct timespec *rhs) -{ - if (lhs->tv_sec < rhs->tv_sec) - return -1; - if (lhs->tv_sec > rhs->tv_sec) - return 1; - return lhs->tv_nsec - rhs->tv_nsec; -} - -/* - * Returns true if the timespec is norm, false if denorm: - */ -static inline bool timespec_valid(const struct timespec *ts) -{ - /* Dates before 1970 are bogus */ - if (ts->tv_sec < 0) - return false; - /* Can't have more nanoseconds then a second */ - if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) - return false; - return true; -} - -/** - * timespec_to_ns - Convert timespec to nanoseconds - * @ts: pointer to the timespec variable to be converted - * - * Returns the scalar nanosecond representation of the timespec - * parameter. - */ -static inline s64 timespec_to_ns(const struct timespec *ts) -{ - return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; -} - /** - * ns_to_timespec - Convert nanoseconds to timespec - * @nsec: the nanoseconds value to be converted - * - * Returns the timespec representation of the nsec parameter. - */ -extern struct timespec ns_to_timespec(const s64 nsec); - -/** - * timespec_add_ns - Adds nanoseconds to a timespec - * @a: pointer to timespec to be incremented - * @ns: unsigned nanoseconds value to be added - * - * This must always be inlined because its used from the x86-64 vdso, - * which cannot call other kernel functions. - */ -static __always_inline void timespec_add_ns(struct timespec *a, u64 ns) -{ - a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns); - a->tv_nsec = ns; -} - -static inline unsigned long mktime(const unsigned int year, - const unsigned int mon, const unsigned int day, - const unsigned int hour, const unsigned int min, - const unsigned int sec) -{ - return mktime64(year, mon, day, hour, min, sec); -} - -static inline bool timeval_valid(const struct timeval *tv) -{ - /* Dates before 1970 are bogus */ - if (tv->tv_sec < 0) - return false; - - /* Can't have more microseconds then a second */ - if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) - return false; - - return true; -} - -/** - * timeval_to_ns - Convert timeval to nanoseconds - * @ts: pointer to the timeval variable to be converted - * - * Returns the scalar nanosecond representation of the timeval - * parameter. - */ -static inline s64 timeval_to_ns(const struct timeval *tv) -{ - return ((s64) tv->tv_sec * NSEC_PER_SEC) + - tv->tv_usec * NSEC_PER_USEC; -} - -/** - * ns_to_timeval - Convert nanoseconds to timeval + * ns_to_kernel_old_timeval - Convert nanoseconds to timeval * @nsec: the nanoseconds value to be converted * * Returns the timeval representation of the nsec parameter. */ -extern struct timeval ns_to_timeval(const s64 nsec); extern struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec); -/* - * Old names for the 32-bit time_t interfaces, these will be removed - * when everything uses the new names. - */ -#define compat_time_t old_time32_t -#define compat_timeval old_timeval32 -#define compat_timespec old_timespec32 -#define compat_itimerspec old_itimerspec32 -#define ns_to_compat_timeval ns_to_old_timeval32 -#define get_compat_itimerspec64 get_old_itimerspec32 -#define put_compat_itimerspec64 put_old_itimerspec32 -#define compat_get_timespec64 get_old_timespec32 -#define compat_put_timespec64 put_old_timespec32 - #endif diff --git a/include/linux/timekeeping32.h b/include/linux/timekeeping32.h index cc59cc9e0e84..266017fc9ee9 100644 --- a/include/linux/timekeeping32.h +++ b/include/linux/timekeeping32.h @@ -11,36 +11,4 @@ static inline unsigned long get_seconds(void) return ktime_get_real_seconds(); } -static inline void getnstimeofday(struct timespec *ts) -{ - struct timespec64 ts64; - - ktime_get_real_ts64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - -static inline void ktime_get_ts(struct timespec *ts) -{ - struct timespec64 ts64; - - ktime_get_ts64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - -static inline void getrawmonotonic(struct timespec *ts) -{ - struct timespec64 ts64; - - ktime_get_raw_ts64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - -static inline void getboottime(struct timespec *ts) -{ - struct timespec64 ts64; - - getboottime64(&ts64); - *ts = timespec64_to_timespec(ts64); -} - #endif diff --git a/include/linux/types.h b/include/linux/types.h index eb870ad42919..d3021c879179 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -65,11 +65,6 @@ typedef __kernel_ssize_t ssize_t; typedef __kernel_ptrdiff_t ptrdiff_t; #endif -#ifndef _TIME_T -#define _TIME_T -typedef __kernel_old_time_t time_t; -#endif - #ifndef _CLOCK_T #define _CLOCK_T typedef __kernel_clock_t clock_t; diff --git a/kernel/compat.c b/kernel/compat.c index 95005f849c68..843dd17e6078 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -26,70 +26,6 @@ #include -static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv) -{ - return (!access_ok(ctv, sizeof(*ctv)) || - __get_user(tv->tv_sec, &ctv->tv_sec) || - __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; -} - -static int __compat_put_timeval(const struct timeval *tv, struct old_timeval32 __user *ctv) -{ - return (!access_ok(ctv, sizeof(*ctv)) || - __put_user(tv->tv_sec, &ctv->tv_sec) || - __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; -} - -static int __compat_get_timespec(struct timespec *ts, const struct old_timespec32 __user *cts) -{ - return (!access_ok(cts, sizeof(*cts)) || - __get_user(ts->tv_sec, &cts->tv_sec) || - __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; -} - -static int __compat_put_timespec(const struct timespec *ts, struct old_timespec32 __user *cts) -{ - return (!access_ok(cts, sizeof(*cts)) || - __put_user(ts->tv_sec, &cts->tv_sec) || - __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; -} - -int compat_get_timeval(struct timeval *tv, const void __user *utv) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0; - else - return __compat_get_timeval(tv, utv); -} -EXPORT_SYMBOL_GPL(compat_get_timeval); - -int compat_put_timeval(const struct timeval *tv, void __user *utv) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0; - else - return __compat_put_timeval(tv, utv); -} -EXPORT_SYMBOL_GPL(compat_put_timeval); - -int compat_get_timespec(struct timespec *ts, const void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_get_timespec(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_get_timespec); - -int compat_put_timespec(const struct timespec *ts, void __user *uts) -{ - if (COMPAT_USE_64BIT_TIME) - return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0; - else - return __compat_put_timespec(ts, uts); -} -EXPORT_SYMBOL_GPL(compat_put_timespec); - #ifdef __ARCH_WANT_SYS_SIGPROCMASK /* diff --git a/kernel/time/time.c b/kernel/time/time.c index cdd7386115ff..3985b2b32d08 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -449,49 +449,6 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, } EXPORT_SYMBOL(mktime64); -/** - * ns_to_timespec - Convert nanoseconds to timespec - * @nsec: the nanoseconds value to be converted - * - * Returns the timespec representation of the nsec parameter. - */ -struct timespec ns_to_timespec(const s64 nsec) -{ - struct timespec ts; - s32 rem; - - if (!nsec) - return (struct timespec) {0, 0}; - - ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); - if (unlikely(rem < 0)) { - ts.tv_sec--; - rem += NSEC_PER_SEC; - } - ts.tv_nsec = rem; - - return ts; -} -EXPORT_SYMBOL(ns_to_timespec); - -/** - * ns_to_timeval - Convert nanoseconds to timeval - * @nsec: the nanoseconds value to be converted - * - * Returns the timeval representation of the nsec parameter. - */ -struct timeval ns_to_timeval(const s64 nsec) -{ - struct timespec ts = ns_to_timespec(nsec); - struct timeval tv; - - tv.tv_sec = ts.tv_sec; - tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; - - return tv; -} -EXPORT_SYMBOL(ns_to_timeval); - struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec) { struct timespec64 ts = ns_to_timespec64(nsec); -- cgit v1.2.3 From f66ee0410b1c3481ee75e5db9b34547b4d582465 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Tue, 11 Feb 2020 23:20:43 +0100 Subject: netfilter: ipset: Fix "INFO: rcu detected stall in hash_xxx" reports In the case of huge hash:* types of sets, due to the single spinlock of a set the processing of the whole set under spinlock protection could take too long. There were four places where the whole hash table of the set was processed from bucket to bucket under holding the spinlock: - During resizing a set, the original set was locked to exclude kernel side add/del element operations (userspace add/del is excluded by the nfnetlink mutex). The original set is actually just read during the resize, so the spinlocking is replaced with rcu locking of regions. However, thus there can be parallel kernel side add/del of entries. In order not to loose those operations a backlog is added and replayed after the successful resize. - Garbage collection of timed out entries was also protected by the spinlock. In order not to lock too long, region locking is introduced and a single region is processed in one gc go. Also, the simple timer based gc running is replaced with a workqueue based solution. The internal book-keeping (number of elements, size of extensions) is moved to region level due to the region locking. - Adding elements: when the max number of the elements is reached, the gc was called to evict the timed out entries. The new approach is that the gc is called just for the matching region, assuming that if the region (proportionally) seems to be full, then the whole set does. We could scan the other regions to check every entry under rcu locking, but for huge sets it'd mean a slowdown at adding elements. - Listing the set header data: when the set was defined with timeout support, the garbage collector was called to clean up timed out entries to get the correct element numbers and set size values. Now the set is scanned to check non-timed out entries, without actually calling the gc for the whole set. Thanks to Florian Westphal for helping me to solve the SOFTIRQ-safe -> SOFTIRQ-unsafe lock order issues during working on the patch. Reported-by: syzbot+4b0e9d4ff3cf117837e5@syzkaller.appspotmail.com Reported-by: syzbot+c27b8d5010f45c666ed1@syzkaller.appspotmail.com Reported-by: syzbot+68a806795ac89df3aa1c@syzkaller.appspotmail.com Fixes: 23c42a403a9c ("netfilter: ipset: Introduction of new commands and protocol version 7") Signed-off-by: Jozsef Kadlecsik --- include/linux/netfilter/ipset/ip_set.h | 11 +- net/netfilter/ipset/ip_set_core.c | 34 +- net/netfilter/ipset/ip_set_hash_gen.h | 633 +++++++++++++++++++++++---------- 3 files changed, 472 insertions(+), 206 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 908d38dbcb91..5448c8b443db 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -121,6 +121,7 @@ struct ip_set_ext { u32 timeout; u8 packets_op; u8 bytes_op; + bool target; }; struct ip_set; @@ -187,6 +188,14 @@ struct ip_set_type_variant { /* Return true if "b" set is the same as "a" * according to the create set parameters */ bool (*same_set)(const struct ip_set *a, const struct ip_set *b); + /* Region-locking is used */ + bool region_lock; +}; + +struct ip_set_region { + spinlock_t lock; /* Region lock */ + size_t ext_size; /* Size of the dynamic extensions */ + u32 elements; /* Number of elements vs timeout */ }; /* The core set type structure */ @@ -501,7 +510,7 @@ ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo, } #define IP_SET_INIT_KEXT(skb, opt, set) \ - { .bytes = (skb)->len, .packets = 1, \ + { .bytes = (skb)->len, .packets = 1, .target = true,\ .timeout = ip_set_adt_opt_timeout(opt, set) } #define IP_SET_INIT_UEXT(set) \ diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 69c107f9ba8d..8dd17589217d 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -723,6 +723,20 @@ ip_set_rcu_get(struct net *net, ip_set_id_t index) return set; } +static inline void +ip_set_lock(struct ip_set *set) +{ + if (!set->variant->region_lock) + spin_lock_bh(&set->lock); +} + +static inline void +ip_set_unlock(struct ip_set *set) +{ + if (!set->variant->region_lock) + spin_unlock_bh(&set->lock); +} + int ip_set_test(ip_set_id_t index, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt) @@ -744,9 +758,9 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb, if (ret == -EAGAIN) { /* Type requests element to be completed */ pr_debug("element must be completed, ADD is triggered\n"); - spin_lock_bh(&set->lock); + ip_set_lock(set); set->variant->kadt(set, skb, par, IPSET_ADD, opt); - spin_unlock_bh(&set->lock); + ip_set_unlock(set); ret = 1; } else { /* --return-nomatch: invert matched element */ @@ -775,9 +789,9 @@ ip_set_add(ip_set_id_t index, const struct sk_buff *skb, !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; - spin_lock_bh(&set->lock); + ip_set_lock(set); ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt); - spin_unlock_bh(&set->lock); + ip_set_unlock(set); return ret; } @@ -797,9 +811,9 @@ ip_set_del(ip_set_id_t index, const struct sk_buff *skb, !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) return -IPSET_ERR_TYPE_MISMATCH; - spin_lock_bh(&set->lock); + ip_set_lock(set); ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt); - spin_unlock_bh(&set->lock); + ip_set_unlock(set); return ret; } @@ -1264,9 +1278,9 @@ ip_set_flush_set(struct ip_set *set) { pr_debug("set: %s\n", set->name); - spin_lock_bh(&set->lock); + ip_set_lock(set); set->variant->flush(set); - spin_unlock_bh(&set->lock); + ip_set_unlock(set); } static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb, @@ -1713,9 +1727,9 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, bool eexist = flags & IPSET_FLAG_EXIST, retried = false; do { - spin_lock_bh(&set->lock); + ip_set_lock(set); ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); - spin_unlock_bh(&set->lock); + ip_set_unlock(set); retried = true; } while (ret == -EAGAIN && set->variant->resize && diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 7480ce55b5c8..71e93eac0831 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -7,13 +7,21 @@ #include #include #include +#include #include -#define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c) -#define ipset_dereference_protected(p, set) \ - __ipset_dereference_protected(p, lockdep_is_held(&(set)->lock)) - -#define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1) +#define __ipset_dereference(p) \ + rcu_dereference_protected(p, 1) +#define ipset_dereference_nfnl(p) \ + rcu_dereference_protected(p, \ + lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) +#define ipset_dereference_set(p, set) \ + rcu_dereference_protected(p, \ + lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \ + lockdep_is_held(&(set)->lock)) +#define ipset_dereference_bh_nfnl(p) \ + rcu_dereference_bh_check(p, \ + lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) /* Hashing which uses arrays to resolve clashing. The hash table is resized * (doubled) when searching becomes too long. @@ -72,11 +80,35 @@ struct hbucket { __aligned(__alignof__(u64)); }; +/* Region size for locking == 2^HTABLE_REGION_BITS */ +#define HTABLE_REGION_BITS 10 +#define ahash_numof_locks(htable_bits) \ + ((htable_bits) < HTABLE_REGION_BITS ? 1 \ + : jhash_size((htable_bits) - HTABLE_REGION_BITS)) +#define ahash_sizeof_regions(htable_bits) \ + (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region)) +#define ahash_region(n, htable_bits) \ + ((n) % ahash_numof_locks(htable_bits)) +#define ahash_bucket_start(h, htable_bits) \ + ((htable_bits) < HTABLE_REGION_BITS ? 0 \ + : (h) * jhash_size(HTABLE_REGION_BITS)) +#define ahash_bucket_end(h, htable_bits) \ + ((htable_bits) < HTABLE_REGION_BITS ? jhash_size(htable_bits) \ + : ((h) + 1) * jhash_size(HTABLE_REGION_BITS)) + +struct htable_gc { + struct delayed_work dwork; + struct ip_set *set; /* Set the gc belongs to */ + u32 region; /* Last gc run position */ +}; + /* The hash table: the table size stored here in order to make resizing easy */ struct htable { atomic_t ref; /* References for resizing */ - atomic_t uref; /* References for dumping */ + atomic_t uref; /* References for dumping and gc */ u8 htable_bits; /* size of hash table == 2^htable_bits */ + u32 maxelem; /* Maxelem per region */ + struct ip_set_region *hregion; /* Region locks and ext sizes */ struct hbucket __rcu *bucket[0]; /* hashtable buckets */ }; @@ -162,6 +194,10 @@ htable_bits(u32 hashsize) #define NLEN 0 #endif /* IP_SET_HASH_WITH_NETS */ +#define SET_ELEM_EXPIRED(set, d) \ + (SET_WITH_TIMEOUT(set) && \ + ip_set_timeout_expired(ext_timeout(d, set))) + #endif /* _IP_SET_HASH_GEN_H */ #ifndef MTYPE @@ -205,10 +241,12 @@ htable_bits(u32 hashsize) #undef mtype_test_cidrs #undef mtype_test #undef mtype_uref -#undef mtype_expire #undef mtype_resize +#undef mtype_ext_size +#undef mtype_resize_ad #undef mtype_head #undef mtype_list +#undef mtype_gc_do #undef mtype_gc #undef mtype_gc_init #undef mtype_variant @@ -247,10 +285,12 @@ htable_bits(u32 hashsize) #define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs) #define mtype_test IPSET_TOKEN(MTYPE, _test) #define mtype_uref IPSET_TOKEN(MTYPE, _uref) -#define mtype_expire IPSET_TOKEN(MTYPE, _expire) #define mtype_resize IPSET_TOKEN(MTYPE, _resize) +#define mtype_ext_size IPSET_TOKEN(MTYPE, _ext_size) +#define mtype_resize_ad IPSET_TOKEN(MTYPE, _resize_ad) #define mtype_head IPSET_TOKEN(MTYPE, _head) #define mtype_list IPSET_TOKEN(MTYPE, _list) +#define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) #define mtype_variant IPSET_TOKEN(MTYPE, _variant) @@ -275,8 +315,7 @@ htable_bits(u32 hashsize) /* The generic hash structure */ struct htype { struct htable __rcu *table; /* the hash table */ - struct timer_list gc; /* garbage collection when timeout enabled */ - struct ip_set *set; /* attached to this ip_set */ + struct htable_gc gc; /* gc workqueue */ u32 maxelem; /* max elements in the hash */ u32 initval; /* random jhash init value */ #ifdef IP_SET_HASH_WITH_MARKMASK @@ -288,21 +327,33 @@ struct htype { #ifdef IP_SET_HASH_WITH_NETMASK u8 netmask; /* netmask value for subnets to store */ #endif + struct list_head ad; /* Resize add|del backlist */ struct mtype_elem next; /* temporary storage for uadd */ #ifdef IP_SET_HASH_WITH_NETS struct net_prefixes nets[NLEN]; /* book-keeping of prefixes */ #endif }; +/* ADD|DEL entries saved during resize */ +struct mtype_resize_ad { + struct list_head list; + enum ipset_adt ad; /* ADD|DEL element */ + struct mtype_elem d; /* Element value */ + struct ip_set_ext ext; /* Extensions for ADD */ + struct ip_set_ext mext; /* Target extensions for ADD */ + u32 flags; /* Flags for ADD */ +}; + #ifdef IP_SET_HASH_WITH_NETS /* Network cidr size book keeping when the hash stores different * sized networks. cidr == real cidr + 1 to support /0. */ static void -mtype_add_cidr(struct htype *h, u8 cidr, u8 n) +mtype_add_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n) { int i, j; + spin_lock_bh(&set->lock); /* Add in increasing prefix order, so larger cidr first */ for (i = 0, j = -1; i < NLEN && h->nets[i].cidr[n]; i++) { if (j != -1) { @@ -311,7 +362,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 n) j = i; } else if (h->nets[i].cidr[n] == cidr) { h->nets[CIDR_POS(cidr)].nets[n]++; - return; + goto unlock; } } if (j != -1) { @@ -320,24 +371,29 @@ mtype_add_cidr(struct htype *h, u8 cidr, u8 n) } h->nets[i].cidr[n] = cidr; h->nets[CIDR_POS(cidr)].nets[n] = 1; +unlock: + spin_unlock_bh(&set->lock); } static void -mtype_del_cidr(struct htype *h, u8 cidr, u8 n) +mtype_del_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n) { u8 i, j, net_end = NLEN - 1; + spin_lock_bh(&set->lock); for (i = 0; i < NLEN; i++) { if (h->nets[i].cidr[n] != cidr) continue; h->nets[CIDR_POS(cidr)].nets[n]--; if (h->nets[CIDR_POS(cidr)].nets[n] > 0) - return; + goto unlock; for (j = i; j < net_end && h->nets[j].cidr[n]; j++) h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; h->nets[j].cidr[n] = 0; - return; + goto unlock; } +unlock: + spin_unlock_bh(&set->lock); } #endif @@ -345,7 +401,7 @@ mtype_del_cidr(struct htype *h, u8 cidr, u8 n) static size_t mtype_ahash_memsize(const struct htype *h, const struct htable *t) { - return sizeof(*h) + sizeof(*t); + return sizeof(*h) + sizeof(*t) + ahash_sizeof_regions(t->htable_bits); } /* Get the ith element from the array block n */ @@ -369,24 +425,29 @@ mtype_flush(struct ip_set *set) struct htype *h = set->data; struct htable *t; struct hbucket *n; - u32 i; - - t = ipset_dereference_protected(h->table, set); - for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = __ipset_dereference_protected(hbucket(t, i), 1); - if (!n) - continue; - if (set->extensions & IPSET_EXT_DESTROY) - mtype_ext_cleanup(set, n); - /* FIXME: use slab cache */ - rcu_assign_pointer(hbucket(t, i), NULL); - kfree_rcu(n, rcu); + u32 r, i; + + t = ipset_dereference_nfnl(h->table); + for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) { + spin_lock_bh(&t->hregion[r].lock); + for (i = ahash_bucket_start(r, t->htable_bits); + i < ahash_bucket_end(r, t->htable_bits); i++) { + n = __ipset_dereference(hbucket(t, i)); + if (!n) + continue; + if (set->extensions & IPSET_EXT_DESTROY) + mtype_ext_cleanup(set, n); + /* FIXME: use slab cache */ + rcu_assign_pointer(hbucket(t, i), NULL); + kfree_rcu(n, rcu); + } + t->hregion[r].ext_size = 0; + t->hregion[r].elements = 0; + spin_unlock_bh(&t->hregion[r].lock); } #ifdef IP_SET_HASH_WITH_NETS memset(h->nets, 0, sizeof(h->nets)); #endif - set->elements = 0; - set->ext_size = 0; } /* Destroy the hashtable part of the set */ @@ -397,7 +458,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) u32 i; for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = __ipset_dereference_protected(hbucket(t, i), 1); + n = __ipset_dereference(hbucket(t, i)); if (!n) continue; if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) @@ -406,6 +467,7 @@ mtype_ahash_destroy(struct ip_set *set, struct htable *t, bool ext_destroy) kfree(n); } + ip_set_free(t->hregion); ip_set_free(t); } @@ -414,28 +476,21 @@ static void mtype_destroy(struct ip_set *set) { struct htype *h = set->data; + struct list_head *l, *lt; if (SET_WITH_TIMEOUT(set)) - del_timer_sync(&h->gc); + cancel_delayed_work_sync(&h->gc.dwork); - mtype_ahash_destroy(set, - __ipset_dereference_protected(h->table, 1), true); + mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true); + list_for_each_safe(l, lt, &h->ad) { + list_del(l); + kfree(l); + } kfree(h); set->data = NULL; } -static void -mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t)) -{ - struct htype *h = set->data; - - timer_setup(&h->gc, gc, 0); - mod_timer(&h->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ); - pr_debug("gc initialized, run in every %u\n", - IPSET_GC_PERIOD(set->timeout)); -} - static bool mtype_same_set(const struct ip_set *a, const struct ip_set *b) { @@ -454,11 +509,9 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b) a->extensions == b->extensions; } -/* Delete expired elements from the hashtable */ static void -mtype_expire(struct ip_set *set, struct htype *h) +mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r) { - struct htable *t; struct hbucket *n, *tmp; struct mtype_elem *data; u32 i, j, d; @@ -466,10 +519,12 @@ mtype_expire(struct ip_set *set, struct htype *h) #ifdef IP_SET_HASH_WITH_NETS u8 k; #endif + u8 htable_bits = t->htable_bits; - t = ipset_dereference_protected(h->table, set); - for (i = 0; i < jhash_size(t->htable_bits); i++) { - n = __ipset_dereference_protected(hbucket(t, i), 1); + spin_lock_bh(&t->hregion[r].lock); + for (i = ahash_bucket_start(r, htable_bits); + i < ahash_bucket_end(r, htable_bits); i++) { + n = __ipset_dereference(hbucket(t, i)); if (!n) continue; for (j = 0, d = 0; j < n->pos; j++) { @@ -485,58 +540,100 @@ mtype_expire(struct ip_set *set, struct htype *h) smp_mb__after_atomic(); #ifdef IP_SET_HASH_WITH_NETS for (k = 0; k < IPSET_NET_COUNT; k++) - mtype_del_cidr(h, + mtype_del_cidr(set, h, NCIDR_PUT(DCIDR_GET(data->cidr, k)), k); #endif + t->hregion[r].elements--; ip_set_ext_destroy(set, data); - set->elements--; d++; } if (d >= AHASH_INIT_SIZE) { if (d >= n->size) { + t->hregion[r].ext_size -= + ext_size(n->size, dsize); rcu_assign_pointer(hbucket(t, i), NULL); kfree_rcu(n, rcu); continue; } tmp = kzalloc(sizeof(*tmp) + - (n->size - AHASH_INIT_SIZE) * dsize, - GFP_ATOMIC); + (n->size - AHASH_INIT_SIZE) * dsize, + GFP_ATOMIC); if (!tmp) - /* Still try to delete expired elements */ + /* Still try to delete expired elements. */ continue; tmp->size = n->size - AHASH_INIT_SIZE; for (j = 0, d = 0; j < n->pos; j++) { if (!test_bit(j, n->used)) continue; data = ahash_data(n, j, dsize); - memcpy(tmp->value + d * dsize, data, dsize); + memcpy(tmp->value + d * dsize, + data, dsize); set_bit(d, tmp->used); d++; } tmp->pos = d; - set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize); + t->hregion[r].ext_size -= + ext_size(AHASH_INIT_SIZE, dsize); rcu_assign_pointer(hbucket(t, i), tmp); kfree_rcu(n, rcu); } } + spin_unlock_bh(&t->hregion[r].lock); } static void -mtype_gc(struct timer_list *t) +mtype_gc(struct work_struct *work) { - struct htype *h = from_timer(h, t, gc); - struct ip_set *set = h->set; + struct htable_gc *gc; + struct ip_set *set; + struct htype *h; + struct htable *t; + u32 r, numof_locks; + unsigned int next_run; + + gc = container_of(work, struct htable_gc, dwork.work); + set = gc->set; + h = set->data; - pr_debug("called\n"); spin_lock_bh(&set->lock); - mtype_expire(set, h); + t = ipset_dereference_set(h->table, set); + atomic_inc(&t->uref); + numof_locks = ahash_numof_locks(t->htable_bits); + r = gc->region++; + if (r >= numof_locks) { + r = gc->region = 0; + } + next_run = (IPSET_GC_PERIOD(set->timeout) * HZ) / numof_locks; + if (next_run < HZ/10) + next_run = HZ/10; spin_unlock_bh(&set->lock); - h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; - add_timer(&h->gc); + mtype_gc_do(set, h, t, r); + + if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { + pr_debug("Table destroy after resize by expire: %p\n", t); + mtype_ahash_destroy(set, t, false); + } + + queue_delayed_work(system_power_efficient_wq, &gc->dwork, next_run); + +} + +static void +mtype_gc_init(struct htable_gc *gc) +{ + INIT_DEFERRABLE_WORK(&gc->dwork, mtype_gc); + queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ); } +static int +mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags); +static int +mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, + struct ip_set_ext *mext, u32 flags); + /* Resize a hash: create a new hash table with doubling the hashsize * and inserting the elements to it. Repeat until we succeed or * fail due to memory pressures. @@ -547,7 +644,7 @@ mtype_resize(struct ip_set *set, bool retried) struct htype *h = set->data; struct htable *t, *orig; u8 htable_bits; - size_t extsize, dsize = set->dsize; + size_t dsize = set->dsize; #ifdef IP_SET_HASH_WITH_NETS u8 flags; struct mtype_elem *tmp; @@ -555,7 +652,9 @@ mtype_resize(struct ip_set *set, bool retried) struct mtype_elem *data; struct mtype_elem *d; struct hbucket *n, *m; - u32 i, j, key; + struct list_head *l, *lt; + struct mtype_resize_ad *x; + u32 i, j, r, nr, key; int ret; #ifdef IP_SET_HASH_WITH_NETS @@ -563,10 +662,8 @@ mtype_resize(struct ip_set *set, bool retried) if (!tmp) return -ENOMEM; #endif - rcu_read_lock_bh(); - orig = rcu_dereference_bh_nfnl(h->table); + orig = ipset_dereference_bh_nfnl(h->table); htable_bits = orig->htable_bits; - rcu_read_unlock_bh(); retry: ret = 0; @@ -583,88 +680,124 @@ retry: ret = -ENOMEM; goto out; } + t->hregion = ip_set_alloc(ahash_sizeof_regions(htable_bits)); + if (!t->hregion) { + kfree(t); + ret = -ENOMEM; + goto out; + } t->htable_bits = htable_bits; + t->maxelem = h->maxelem / ahash_numof_locks(htable_bits); + for (i = 0; i < ahash_numof_locks(htable_bits); i++) + spin_lock_init(&t->hregion[i].lock); - spin_lock_bh(&set->lock); - orig = __ipset_dereference_protected(h->table, 1); - /* There can't be another parallel resizing, but dumping is possible */ + /* There can't be another parallel resizing, + * but dumping, gc, kernel side add/del are possible + */ + orig = ipset_dereference_bh_nfnl(h->table); atomic_set(&orig->ref, 1); atomic_inc(&orig->uref); - extsize = 0; pr_debug("attempt to resize set %s from %u to %u, t %p\n", set->name, orig->htable_bits, htable_bits, orig); - for (i = 0; i < jhash_size(orig->htable_bits); i++) { - n = __ipset_dereference_protected(hbucket(orig, i), 1); - if (!n) - continue; - for (j = 0; j < n->pos; j++) { - if (!test_bit(j, n->used)) + for (r = 0; r < ahash_numof_locks(orig->htable_bits); r++) { + /* Expire may replace a hbucket with another one */ + rcu_read_lock_bh(); + for (i = ahash_bucket_start(r, orig->htable_bits); + i < ahash_bucket_end(r, orig->htable_bits); i++) { + n = __ipset_dereference(hbucket(orig, i)); + if (!n) continue; - data = ahash_data(n, j, dsize); + for (j = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) + continue; + data = ahash_data(n, j, dsize); + if (SET_ELEM_EXPIRED(set, data)) + continue; #ifdef IP_SET_HASH_WITH_NETS - /* We have readers running parallel with us, - * so the live data cannot be modified. - */ - flags = 0; - memcpy(tmp, data, dsize); - data = tmp; - mtype_data_reset_flags(data, &flags); + /* We have readers running parallel with us, + * so the live data cannot be modified. + */ + flags = 0; + memcpy(tmp, data, dsize); + data = tmp; + mtype_data_reset_flags(data, &flags); #endif - key = HKEY(data, h->initval, htable_bits); - m = __ipset_dereference_protected(hbucket(t, key), 1); - if (!m) { - m = kzalloc(sizeof(*m) + + key = HKEY(data, h->initval, htable_bits); + m = __ipset_dereference(hbucket(t, key)); + nr = ahash_region(key, htable_bits); + if (!m) { + m = kzalloc(sizeof(*m) + AHASH_INIT_SIZE * dsize, GFP_ATOMIC); - if (!m) { - ret = -ENOMEM; - goto cleanup; - } - m->size = AHASH_INIT_SIZE; - extsize += ext_size(AHASH_INIT_SIZE, dsize); - RCU_INIT_POINTER(hbucket(t, key), m); - } else if (m->pos >= m->size) { - struct hbucket *ht; - - if (m->size >= AHASH_MAX(h)) { - ret = -EAGAIN; - } else { - ht = kzalloc(sizeof(*ht) + + if (!m) { + ret = -ENOMEM; + goto cleanup; + } + m->size = AHASH_INIT_SIZE; + t->hregion[nr].ext_size += + ext_size(AHASH_INIT_SIZE, + dsize); + RCU_INIT_POINTER(hbucket(t, key), m); + } else if (m->pos >= m->size) { + struct hbucket *ht; + + if (m->size >= AHASH_MAX(h)) { + ret = -EAGAIN; + } else { + ht = kzalloc(sizeof(*ht) + (m->size + AHASH_INIT_SIZE) * dsize, GFP_ATOMIC); - if (!ht) - ret = -ENOMEM; + if (!ht) + ret = -ENOMEM; + } + if (ret < 0) + goto cleanup; + memcpy(ht, m, sizeof(struct hbucket) + + m->size * dsize); + ht->size = m->size + AHASH_INIT_SIZE; + t->hregion[nr].ext_size += + ext_size(AHASH_INIT_SIZE, + dsize); + kfree(m); + m = ht; + RCU_INIT_POINTER(hbucket(t, key), ht); } - if (ret < 0) - goto cleanup; - memcpy(ht, m, sizeof(struct hbucket) + - m->size * dsize); - ht->size = m->size + AHASH_INIT_SIZE; - extsize += ext_size(AHASH_INIT_SIZE, dsize); - kfree(m); - m = ht; - RCU_INIT_POINTER(hbucket(t, key), ht); - } - d = ahash_data(m, m->pos, dsize); - memcpy(d, data, dsize); - set_bit(m->pos++, m->used); + d = ahash_data(m, m->pos, dsize); + memcpy(d, data, dsize); + set_bit(m->pos++, m->used); + t->hregion[nr].elements++; #ifdef IP_SET_HASH_WITH_NETS - mtype_data_reset_flags(d, &flags); + mtype_data_reset_flags(d, &flags); #endif + } } + rcu_read_unlock_bh(); } - rcu_assign_pointer(h->table, t); - set->ext_size = extsize; - spin_unlock_bh(&set->lock); + /* There can't be any other writer. */ + rcu_assign_pointer(h->table, t); /* Give time to other readers of the set */ synchronize_rcu(); pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name, orig->htable_bits, orig, t->htable_bits, t); - /* If there's nobody else dumping the table, destroy it */ + /* Add/delete elements processed by the SET target during resize. + * Kernel-side add cannot trigger a resize and userspace actions + * are serialized by the mutex. + */ + list_for_each_safe(l, lt, &h->ad) { + x = list_entry(l, struct mtype_resize_ad, list); + if (x->ad == IPSET_ADD) { + mtype_add(set, &x->d, &x->ext, &x->mext, x->flags); + } else { + mtype_del(set, &x->d, NULL, NULL, 0); + } + list_del(l); + kfree(l); + } + /* If there's nobody else using the table, destroy it */ if (atomic_dec_and_test(&orig->uref)) { pr_debug("Table destroy by resize %p\n", orig); mtype_ahash_destroy(set, orig, false); @@ -677,15 +810,44 @@ out: return ret; cleanup: + rcu_read_unlock_bh(); atomic_set(&orig->ref, 0); atomic_dec(&orig->uref); - spin_unlock_bh(&set->lock); mtype_ahash_destroy(set, t, false); if (ret == -EAGAIN) goto retry; goto out; } +/* Get the current number of elements and ext_size in the set */ +static void +mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size) +{ + struct htype *h = set->data; + const struct htable *t; + u32 i, j, r; + struct hbucket *n; + struct mtype_elem *data; + + t = rcu_dereference_bh(h->table); + for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) { + for (i = ahash_bucket_start(r, t->htable_bits); + i < ahash_bucket_end(r, t->htable_bits); i++) { + n = rcu_dereference_bh(hbucket(t, i)); + if (!n) + continue; + for (j = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) + continue; + data = ahash_data(n, j, set->dsize); + if (!SET_ELEM_EXPIRED(set, data)) + (*elements)++; + } + } + *ext_size += t->hregion[r].ext_size; + } +} + /* Add an element to a hash and update the internal counters when succeeded, * otherwise report the proper error code. */ @@ -698,32 +860,49 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, const struct mtype_elem *d = value; struct mtype_elem *data; struct hbucket *n, *old = ERR_PTR(-ENOENT); - int i, j = -1; + int i, j = -1, ret; bool flag_exist = flags & IPSET_FLAG_EXIST; bool deleted = false, forceadd = false, reuse = false; - u32 key, multi = 0; + u32 r, key, multi = 0, elements, maxelem; - if (set->elements >= h->maxelem) { - if (SET_WITH_TIMEOUT(set)) - /* FIXME: when set is full, we slow down here */ - mtype_expire(set, h); - if (set->elements >= h->maxelem && SET_WITH_FORCEADD(set)) + rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); + key = HKEY(value, h->initval, t->htable_bits); + r = ahash_region(key, t->htable_bits); + atomic_inc(&t->uref); + elements = t->hregion[r].elements; + maxelem = t->maxelem; + if (elements >= maxelem) { + u32 e; + if (SET_WITH_TIMEOUT(set)) { + rcu_read_unlock_bh(); + mtype_gc_do(set, h, t, r); + rcu_read_lock_bh(); + } + maxelem = h->maxelem; + elements = 0; + for (e = 0; e < ahash_numof_locks(t->htable_bits); e++) + elements += t->hregion[e].elements; + if (elements >= maxelem && SET_WITH_FORCEADD(set)) forceadd = true; } + rcu_read_unlock_bh(); - t = ipset_dereference_protected(h->table, set); - key = HKEY(value, h->initval, t->htable_bits); - n = __ipset_dereference_protected(hbucket(t, key), 1); + spin_lock_bh(&t->hregion[r].lock); + n = rcu_dereference_bh(hbucket(t, key)); if (!n) { - if (forceadd || set->elements >= h->maxelem) + if (forceadd || elements >= maxelem) goto set_full; old = NULL; n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize, GFP_ATOMIC); - if (!n) - return -ENOMEM; + if (!n) { + ret = -ENOMEM; + goto unlock; + } n->size = AHASH_INIT_SIZE; - set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize); + t->hregion[r].ext_size += + ext_size(AHASH_INIT_SIZE, set->dsize); goto copy_elem; } for (i = 0; i < n->pos; i++) { @@ -737,19 +916,16 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, } data = ahash_data(n, i, set->dsize); if (mtype_data_equal(data, d, &multi)) { - if (flag_exist || - (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(data, set)))) { + if (flag_exist || SET_ELEM_EXPIRED(set, data)) { /* Just the extensions could be overwritten */ j = i; goto overwrite_extensions; } - return -IPSET_ERR_EXIST; + ret = -IPSET_ERR_EXIST; + goto unlock; } /* Reuse first timed out entry */ - if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(data, set)) && - j == -1) { + if (SET_ELEM_EXPIRED(set, data) && j == -1) { j = i; reuse = true; } @@ -759,16 +935,16 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (!deleted) { #ifdef IP_SET_HASH_WITH_NETS for (i = 0; i < IPSET_NET_COUNT; i++) - mtype_del_cidr(h, + mtype_del_cidr(set, h, NCIDR_PUT(DCIDR_GET(data->cidr, i)), i); #endif ip_set_ext_destroy(set, data); - set->elements--; + t->hregion[r].elements--; } goto copy_data; } - if (set->elements >= h->maxelem) + if (elements >= maxelem) goto set_full; /* Create a new slot */ if (n->pos >= n->size) { @@ -776,28 +952,32 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, if (n->size >= AHASH_MAX(h)) { /* Trigger rehashing */ mtype_data_next(&h->next, d); - return -EAGAIN; + ret = -EAGAIN; + goto resize; } old = n; n = kzalloc(sizeof(*n) + (old->size + AHASH_INIT_SIZE) * set->dsize, GFP_ATOMIC); - if (!n) - return -ENOMEM; + if (!n) { + ret = -ENOMEM; + goto unlock; + } memcpy(n, old, sizeof(struct hbucket) + old->size * set->dsize); n->size = old->size + AHASH_INIT_SIZE; - set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize); + t->hregion[r].ext_size += + ext_size(AHASH_INIT_SIZE, set->dsize); } copy_elem: j = n->pos++; data = ahash_data(n, j, set->dsize); copy_data: - set->elements++; + t->hregion[r].elements++; #ifdef IP_SET_HASH_WITH_NETS for (i = 0; i < IPSET_NET_COUNT; i++) - mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i); + mtype_add_cidr(set, h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i); #endif memcpy(data, d, sizeof(struct mtype_elem)); overwrite_extensions: @@ -820,13 +1000,41 @@ overwrite_extensions: if (old) kfree_rcu(old, rcu); } + ret = 0; +resize: + spin_unlock_bh(&t->hregion[r].lock); + if (atomic_read(&t->ref) && ext->target) { + /* Resize is in process and kernel side add, save values */ + struct mtype_resize_ad *x; + + x = kzalloc(sizeof(struct mtype_resize_ad), GFP_ATOMIC); + if (!x) + /* Don't bother */ + goto out; + x->ad = IPSET_ADD; + memcpy(&x->d, value, sizeof(struct mtype_elem)); + memcpy(&x->ext, ext, sizeof(struct ip_set_ext)); + memcpy(&x->mext, mext, sizeof(struct ip_set_ext)); + x->flags = flags; + spin_lock_bh(&set->lock); + list_add_tail(&x->list, &h->ad); + spin_unlock_bh(&set->lock); + } + goto out; - return 0; set_full: if (net_ratelimit()) pr_warn("Set %s is full, maxelem %u reached\n", - set->name, h->maxelem); - return -IPSET_ERR_HASH_FULL; + set->name, maxelem); + ret = -IPSET_ERR_HASH_FULL; +unlock: + spin_unlock_bh(&t->hregion[r].lock); +out: + if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { + pr_debug("Table destroy after resize by add: %p\n", t); + mtype_ahash_destroy(set, t, false); + } + return ret; } /* Delete an element from the hash and free up space if possible. @@ -840,13 +1048,23 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, const struct mtype_elem *d = value; struct mtype_elem *data; struct hbucket *n; - int i, j, k, ret = -IPSET_ERR_EXIST; + struct mtype_resize_ad *x = NULL; + int i, j, k, r, ret = -IPSET_ERR_EXIST; u32 key, multi = 0; size_t dsize = set->dsize; - t = ipset_dereference_protected(h->table, set); + /* Userspace add and resize is excluded by the mutex. + * Kernespace add does not trigger resize. + */ + rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); key = HKEY(value, h->initval, t->htable_bits); - n = __ipset_dereference_protected(hbucket(t, key), 1); + r = ahash_region(key, t->htable_bits); + atomic_inc(&t->uref); + rcu_read_unlock_bh(); + + spin_lock_bh(&t->hregion[r].lock); + n = rcu_dereference_bh(hbucket(t, key)); if (!n) goto out; for (i = 0, k = 0; i < n->pos; i++) { @@ -857,8 +1075,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, data = ahash_data(n, i, dsize); if (!mtype_data_equal(data, d, &multi)) continue; - if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(data, set))) + if (SET_ELEM_EXPIRED(set, data)) goto out; ret = 0; @@ -866,20 +1083,33 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, smp_mb__after_atomic(); if (i + 1 == n->pos) n->pos--; - set->elements--; + t->hregion[r].elements--; #ifdef IP_SET_HASH_WITH_NETS for (j = 0; j < IPSET_NET_COUNT; j++) - mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)), - j); + mtype_del_cidr(set, h, + NCIDR_PUT(DCIDR_GET(d->cidr, j)), j); #endif ip_set_ext_destroy(set, data); + if (atomic_read(&t->ref) && ext->target) { + /* Resize is in process and kernel side del, + * save values + */ + x = kzalloc(sizeof(struct mtype_resize_ad), + GFP_ATOMIC); + if (x) { + x->ad = IPSET_DEL; + memcpy(&x->d, value, + sizeof(struct mtype_elem)); + x->flags = flags; + } + } for (; i < n->pos; i++) { if (!test_bit(i, n->used)) k++; } if (n->pos == 0 && k == 0) { - set->ext_size -= ext_size(n->size, dsize); + t->hregion[r].ext_size -= ext_size(n->size, dsize); rcu_assign_pointer(hbucket(t, key), NULL); kfree_rcu(n, rcu); } else if (k >= AHASH_INIT_SIZE) { @@ -898,7 +1128,8 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, k++; } tmp->pos = k; - set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize); + t->hregion[r].ext_size -= + ext_size(AHASH_INIT_SIZE, dsize); rcu_assign_pointer(hbucket(t, key), tmp); kfree_rcu(n, rcu); } @@ -906,6 +1137,16 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, } out: + spin_unlock_bh(&t->hregion[r].lock); + if (x) { + spin_lock_bh(&set->lock); + list_add(&x->list, &h->ad); + spin_unlock_bh(&set->lock); + } + if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { + pr_debug("Table destroy after resize by del: %p\n", t); + mtype_ahash_destroy(set, t, false); + } return ret; } @@ -991,6 +1232,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, int i, ret = 0; u32 key, multi = 0; + rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); #ifdef IP_SET_HASH_WITH_NETS /* If we test an IP address and not a network address, @@ -1022,6 +1264,7 @@ mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, goto out; } out: + rcu_read_unlock_bh(); return ret; } @@ -1033,23 +1276,14 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) const struct htable *t; struct nlattr *nested; size_t memsize; + u32 elements = 0; + size_t ext_size = 0; u8 htable_bits; - /* If any members have expired, set->elements will be wrong - * mytype_expire function will update it with the right count. - * we do not hold set->lock here, so grab it first. - * set->elements can still be incorrect in the case of a huge set, - * because elements might time out during the listing. - */ - if (SET_WITH_TIMEOUT(set)) { - spin_lock_bh(&set->lock); - mtype_expire(set, h); - spin_unlock_bh(&set->lock); - } - rcu_read_lock_bh(); - t = rcu_dereference_bh_nfnl(h->table); - memsize = mtype_ahash_memsize(h, t) + set->ext_size; + t = rcu_dereference_bh(h->table); + mtype_ext_size(set, &elements, &ext_size); + memsize = mtype_ahash_memsize(h, t) + ext_size + set->ext_size; htable_bits = t->htable_bits; rcu_read_unlock_bh(); @@ -1071,7 +1305,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) #endif if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) || - nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements))) + nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(elements))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; @@ -1091,15 +1325,15 @@ mtype_uref(struct ip_set *set, struct netlink_callback *cb, bool start) if (start) { rcu_read_lock_bh(); - t = rcu_dereference_bh_nfnl(h->table); + t = ipset_dereference_bh_nfnl(h->table); atomic_inc(&t->uref); cb->args[IPSET_CB_PRIVATE] = (unsigned long)t; rcu_read_unlock_bh(); } else if (cb->args[IPSET_CB_PRIVATE]) { t = (struct htable *)cb->args[IPSET_CB_PRIVATE]; if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { - /* Resizing didn't destroy the hash table */ - pr_debug("Table destroy by dump: %p\n", t); + pr_debug("Table destroy after resize " + " by dump: %p\n", t); mtype_ahash_destroy(set, t, false); } cb->args[IPSET_CB_PRIVATE] = 0; @@ -1141,8 +1375,7 @@ mtype_list(const struct ip_set *set, if (!test_bit(i, n->used)) continue; e = ahash_data(n, i, set->dsize); - if (SET_WITH_TIMEOUT(set) && - ip_set_timeout_expired(ext_timeout(e, set))) + if (SET_ELEM_EXPIRED(set, e)) continue; pr_debug("list hash %lu hbucket %p i %u, data %p\n", cb->args[IPSET_CB_ARG0], n, i, e); @@ -1208,6 +1441,7 @@ static const struct ip_set_type_variant mtype_variant = { .uref = mtype_uref, .resize = mtype_resize, .same_set = mtype_same_set, + .region_lock = true, }; #ifdef IP_SET_EMIT_CREATE @@ -1226,6 +1460,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, size_t hsize; struct htype *h; struct htable *t; + u32 i; pr_debug("Create set %s with family %s\n", set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6"); @@ -1294,6 +1529,15 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, kfree(h); return -ENOMEM; } + t->hregion = ip_set_alloc(ahash_sizeof_regions(hbits)); + if (!t->hregion) { + kfree(t); + kfree(h); + return -ENOMEM; + } + h->gc.set = set; + for (i = 0; i < ahash_numof_locks(hbits); i++) + spin_lock_init(&t->hregion[i].lock); h->maxelem = maxelem; #ifdef IP_SET_HASH_WITH_NETMASK h->netmask = netmask; @@ -1304,9 +1548,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, get_random_bytes(&h->initval, sizeof(h->initval)); t->htable_bits = hbits; + t->maxelem = h->maxelem / ahash_numof_locks(hbits); RCU_INIT_POINTER(h->table, t); - h->set = set; + INIT_LIST_HEAD(&h->ad); set->data = h; #ifndef IP_SET_PROTO_UNDEF if (set->family == NFPROTO_IPV4) { @@ -1329,12 +1574,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #ifndef IP_SET_PROTO_UNDEF if (set->family == NFPROTO_IPV4) #endif - IPSET_TOKEN(HTYPE, 4_gc_init)(set, - IPSET_TOKEN(HTYPE, 4_gc)); + IPSET_TOKEN(HTYPE, 4_gc_init)(&h->gc); #ifndef IP_SET_PROTO_UNDEF else - IPSET_TOKEN(HTYPE, 6_gc_init)(set, - IPSET_TOKEN(HTYPE, 6_gc)); + IPSET_TOKEN(HTYPE, 6_gc_init)(&h->gc); #endif } pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", -- cgit v1.2.3 From c780e86dd48ef6467a1146cf7d0fe1e05a635039 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 6 Feb 2020 15:28:12 +0100 Subject: blktrace: Protect q->blk_trace with RCU KASAN is reporting that __blk_add_trace() has a use-after-free issue when accessing q->blk_trace. Indeed the switching of block tracing (and thus eventual freeing of q->blk_trace) is completely unsynchronized with the currently running tracing and thus it can happen that the blk_trace structure is being freed just while __blk_add_trace() works on it. Protect accesses to q->blk_trace by RCU during tracing and make sure we wait for the end of RCU grace period when shutting down tracing. Luckily that is rare enough event that we can afford that. Note that postponing the freeing of blk_trace to an RCU callback should better be avoided as it could have unexpected user visible side-effects as debugfs files would be still existing for a short while block tracing has been shut down. Link: https://bugzilla.kernel.org/show_bug.cgi?id=205711 CC: stable@vger.kernel.org Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Tested-by: Ming Lei Reviewed-by: Bart Van Assche Reported-by: Tristan Madani Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- include/linux/blktrace_api.h | 18 +++++-- kernel/trace/blktrace.c | 114 +++++++++++++++++++++++++++++++------------ 3 files changed, 97 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 053ea4b51988..10455b2bbbb4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -524,7 +524,7 @@ struct request_queue { unsigned int sg_reserved_size; int node; #ifdef CONFIG_BLK_DEV_IO_TRACE - struct blk_trace *blk_trace; + struct blk_trace __rcu *blk_trace; struct mutex blk_trace_mutex; #endif /* diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 7bb2d8de9f30..3b6ff5902edc 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -51,9 +51,13 @@ void __trace_note_message(struct blk_trace *, struct blkcg *blkcg, const char *f **/ #define blk_add_cgroup_trace_msg(q, cg, fmt, ...) \ do { \ - struct blk_trace *bt = (q)->blk_trace; \ + struct blk_trace *bt; \ + \ + rcu_read_lock(); \ + bt = rcu_dereference((q)->blk_trace); \ if (unlikely(bt)) \ __trace_note_message(bt, cg, fmt, ##__VA_ARGS__);\ + rcu_read_unlock(); \ } while (0) #define blk_add_trace_msg(q, fmt, ...) \ blk_add_cgroup_trace_msg(q, NULL, fmt, ##__VA_ARGS__) @@ -61,10 +65,14 @@ void __trace_note_message(struct blk_trace *, struct blkcg *blkcg, const char *f static inline bool blk_trace_note_message_enabled(struct request_queue *q) { - struct blk_trace *bt = q->blk_trace; - if (likely(!bt)) - return false; - return bt->act_mask & BLK_TC_NOTIFY; + struct blk_trace *bt; + bool ret; + + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + ret = bt && (bt->act_mask & BLK_TC_NOTIFY); + rcu_read_unlock(); + return ret; } extern void blk_add_driver_data(struct request_queue *q, struct request *rq, diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 0735ae8545d8..4560878f0bac 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -335,6 +335,7 @@ static void put_probe_ref(void) static void blk_trace_cleanup(struct blk_trace *bt) { + synchronize_rcu(); blk_trace_free(bt); put_probe_ref(); } @@ -629,8 +630,10 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, static int __blk_trace_startstop(struct request_queue *q, int start) { int ret; - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (bt == NULL) return -EINVAL; @@ -740,8 +743,8 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) void blk_trace_shutdown(struct request_queue *q) { mutex_lock(&q->blk_trace_mutex); - - if (q->blk_trace) { + if (rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex))) { __blk_trace_startstop(q, 0); __blk_trace_remove(q); } @@ -752,8 +755,10 @@ void blk_trace_shutdown(struct request_queue *q) #ifdef CONFIG_BLK_CGROUP static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + /* We don't use the 'bt' value here except as an optimization... */ + bt = rcu_dereference_protected(q->blk_trace, 1); if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) return 0; @@ -796,10 +801,14 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) static void blk_add_trace_rq(struct request *rq, int error, unsigned int nr_bytes, u32 what, u64 cgid) { - struct blk_trace *bt = rq->q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(rq->q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } if (blk_rq_is_passthrough(rq)) what |= BLK_TC_ACT(BLK_TC_PC); @@ -808,6 +817,7 @@ static void blk_add_trace_rq(struct request *rq, int error, __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), rq->cmd_flags, what, error, 0, NULL, cgid); + rcu_read_unlock(); } static void blk_add_trace_rq_insert(void *ignore, @@ -853,14 +863,19 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, u32 what, int error) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, what, error, 0, NULL, blk_trace_bio_get_cgid(q, bio)); + rcu_read_unlock(); } static void blk_add_trace_bio_bounce(void *ignore, @@ -905,11 +920,14 @@ static void blk_add_trace_getrq(void *ignore, if (bio) blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); else { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, NULL, 0); + rcu_read_unlock(); } } @@ -921,27 +939,35 @@ static void blk_add_trace_sleeprq(void *ignore, if (bio) blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); else { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, 0, 0, NULL, 0); + rcu_read_unlock(); } } static void blk_add_trace_plug(void *ignore, struct request_queue *q) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0); + rcu_read_unlock(); } static void blk_add_trace_unplug(void *ignore, struct request_queue *q, unsigned int depth, bool explicit) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(depth); u32 what; @@ -953,14 +979,17 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0); } + rcu_read_unlock(); } static void blk_add_trace_split(void *ignore, struct request_queue *q, struct bio *bio, unsigned int pdu) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); if (bt) { __be64 rpdu = cpu_to_be64(pdu); @@ -969,6 +998,7 @@ static void blk_add_trace_split(void *ignore, BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), &rpdu, blk_trace_bio_get_cgid(q, bio)); } + rcu_read_unlock(); } /** @@ -988,11 +1018,15 @@ static void blk_add_trace_bio_remap(void *ignore, struct request_queue *q, struct bio *bio, dev_t dev, sector_t from) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; struct blk_io_trace_remap r; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(bio_dev(bio)); @@ -1001,6 +1035,7 @@ static void blk_add_trace_bio_remap(void *ignore, __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); + rcu_read_unlock(); } /** @@ -1021,11 +1056,15 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, sector_t from) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; struct blk_io_trace_remap r; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } r.device_from = cpu_to_be32(dev); r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); @@ -1034,6 +1073,7 @@ static void blk_add_trace_rq_remap(void *ignore, __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq_data_dir(rq), 0, BLK_TA_REMAP, 0, sizeof(r), &r, blk_trace_request_get_cgid(q, rq)); + rcu_read_unlock(); } /** @@ -1051,14 +1091,19 @@ void blk_add_driver_data(struct request_queue *q, struct request *rq, void *data, size_t len) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt; - if (likely(!bt)) + rcu_read_lock(); + bt = rcu_dereference(q->blk_trace); + if (likely(!bt)) { + rcu_read_unlock(); return; + } __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, BLK_TA_DRV_DATA, 0, len, data, blk_trace_request_get_cgid(q, rq)); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -1597,6 +1642,7 @@ static int blk_trace_remove_queue(struct request_queue *q) return -EINVAL; put_probe_ref(); + synchronize_rcu(); blk_trace_free(bt); return 0; } @@ -1758,6 +1804,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct hd_struct *p = dev_to_part(dev); struct request_queue *q; struct block_device *bdev; + struct blk_trace *bt; ssize_t ret = -ENXIO; bdev = bdget(part_devt(p)); @@ -1770,21 +1817,23 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, mutex_lock(&q->blk_trace_mutex); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (attr == &dev_attr_enable) { - ret = sprintf(buf, "%u\n", !!q->blk_trace); + ret = sprintf(buf, "%u\n", !!bt); goto out_unlock_bdev; } - if (q->blk_trace == NULL) + if (bt == NULL) ret = sprintf(buf, "disabled\n"); else if (attr == &dev_attr_act_mask) - ret = blk_trace_mask2str(buf, q->blk_trace->act_mask); + ret = blk_trace_mask2str(buf, bt->act_mask); else if (attr == &dev_attr_pid) - ret = sprintf(buf, "%u\n", q->blk_trace->pid); + ret = sprintf(buf, "%u\n", bt->pid); else if (attr == &dev_attr_start_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); + ret = sprintf(buf, "%llu\n", bt->start_lba); else if (attr == &dev_attr_end_lba) - ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); + ret = sprintf(buf, "%llu\n", bt->end_lba); out_unlock_bdev: mutex_unlock(&q->blk_trace_mutex); @@ -1801,6 +1850,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct block_device *bdev; struct request_queue *q; struct hd_struct *p; + struct blk_trace *bt; u64 value; ssize_t ret = -EINVAL; @@ -1831,8 +1881,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, mutex_lock(&q->blk_trace_mutex); + bt = rcu_dereference_protected(q->blk_trace, + lockdep_is_held(&q->blk_trace_mutex)); if (attr == &dev_attr_enable) { - if (!!value == !!q->blk_trace) { + if (!!value == !!bt) { ret = 0; goto out_unlock_bdev; } @@ -1844,18 +1896,18 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } ret = 0; - if (q->blk_trace == NULL) + if (bt == NULL) ret = blk_trace_setup_queue(q, bdev); if (ret == 0) { if (attr == &dev_attr_act_mask) - q->blk_trace->act_mask = value; + bt->act_mask = value; else if (attr == &dev_attr_pid) - q->blk_trace->pid = value; + bt->pid = value; else if (attr == &dev_attr_start_lba) - q->blk_trace->start_lba = value; + bt->start_lba = value; else if (attr == &dev_attr_end_lba) - q->blk_trace->end_lba = value; + bt->end_lba = value; } out_unlock_bdev: -- cgit v1.2.3 From a8e41f6033a0c5633d55d6e35993c9e2005d872f Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 25 Feb 2020 18:05:35 +0800 Subject: icmp: allow icmpv6_ndo_send to work with CONFIG_IPV6=n The icmpv6_send function has long had a static inline implementation with an empty body for CONFIG_IPV6=n, so that code calling it doesn't need to be ifdef'd. The new icmpv6_ndo_send function, which is intended for drivers as a drop-in replacement with an identical function signature, should follow the same pattern. Without this patch, drivers that used to work with CONFIG_IPV6=n now result in a linker error. Cc: Chen Zhou Reported-by: Hulk Robot Fixes: 0b41713b6066 ("icmp: introduce helper for nat'd source address in network device context") Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- include/linux/icmpv6.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/icmpv6.h b/include/linux/icmpv6.h index 93338fd54af8..33d379602314 100644 --- a/include/linux/icmpv6.h +++ b/include/linux/icmpv6.h @@ -22,19 +22,23 @@ extern int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn); int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, unsigned int data_len); +#if IS_ENABLED(CONFIG_NF_NAT) +void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info); +#else +#define icmpv6_ndo_send icmpv6_send +#endif + #else static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) { - } -#endif -#if IS_ENABLED(CONFIG_NF_NAT) -void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info); -#else -#define icmpv6_ndo_send icmpv6_send +static inline void icmpv6_ndo_send(struct sk_buff *skb, + u8 type, u8 code, __u32 info) +{ +} #endif extern int icmpv6_init(void); -- cgit v1.2.3 From fcd07f9adc7dacc2532695cf9dd2284d49e716ff Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 28 Feb 2020 09:49:41 +0100 Subject: KVM: let declaration of kvm_get_running_vcpus match implementation Sparse notices that declaration and implementation do not match: arch/s390/kvm/../../../virt/kvm/kvm_main.c:4435:17: warning: incorrect type in return expression (different address spaces) arch/s390/kvm/../../../virt/kvm/kvm_main.c:4435:17: expected struct kvm_vcpu [noderef] ** arch/s390/kvm/../../../virt/kvm/kvm_main.c:4435:17: got struct kvm_vcpu *[noderef] * Signed-off-by: Christian Borntraeger Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7944ad6ac10b..bcb9b2ac0791 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1344,7 +1344,7 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ struct kvm_vcpu *kvm_get_running_vcpu(void); -struct kvm_vcpu __percpu **kvm_get_running_vcpus(void); +struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void); #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS bool kvm_arch_has_irq_bypass(void); -- cgit v1.2.3 From 0b04758b002bde9434053be2fff8064ac3d9d8bb Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 4 Mar 2020 20:33:08 +0000 Subject: irqchip/gic-v3: Use SGIs without active state if offered To allow the direct injection of SGIs into a guest, the GICv4.1 architecture has to sacrifice the Active state so that SGIs look a lot like LPIs (they are injected by the same mechanism). In order not to break existing software, the architecture gives offers guests OSs the choice: SGIs with or without an active state. It is the hypervisors duty to honor the guest's choice. For this, the architecture offers a discovery bit indicating whether the GIC supports GICv4.1 SGIs (GICD_TYPER2.nASSGIcap), and another bit indicating whether the guest wants Active-less SGIs or not (controlled by GICD_CTLR.nASSGIreq). A hypervisor not supporting GICv4.1 SGIs would leave nASSGIcap clear, and a guest not knowing about GICv4.1 SGIs (or definitely wanting an Active state) would leave nASSGIreq clear (both being thankfully backward compatible with older revisions of the GIC). Since Linux is perfectly happy without an active state on SGIs, inform the hypervisor that we'll use that if offered. Signed-off-by: Marc Zyngier Reviewed-by: Zenghui Yu Link: https://lore.kernel.org/r/20200304203330.4967-2-maz@kernel.org --- drivers/irqchip/irq-gic-v3.c | 10 ++++++++-- include/linux/irqchip/arm-gic-v3.h | 2 ++ 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index c1f7af9d9ae7..b6b0f86584d6 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -723,6 +723,7 @@ static void __init gic_dist_init(void) unsigned int i; u64 affinity; void __iomem *base = gic_data.dist_base; + u32 val; /* Disable the distributor */ writel_relaxed(0, base + GICD_CTLR); @@ -755,9 +756,14 @@ static void __init gic_dist_init(void) /* Now do the common stuff, and wait for the distributor to drain */ gic_dist_config(base, GIC_LINE_NR, gic_dist_wait_for_rwp); + val = GICD_CTLR_ARE_NS | GICD_CTLR_ENABLE_G1A | GICD_CTLR_ENABLE_G1; + if (gic_data.rdists.gicd_typer2 & GICD_TYPER2_nASSGIcap) { + pr_info("Enabling SGIs without active state\n"); + val |= GICD_CTLR_nASSGIreq; + } + /* Enable distributor with ARE, Group1 */ - writel_relaxed(GICD_CTLR_ARE_NS | GICD_CTLR_ENABLE_G1A | GICD_CTLR_ENABLE_G1, - base + GICD_CTLR); + writel_relaxed(val, base + GICD_CTLR); /* * Set all global interrupts to the boot CPU only. ARE must be diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 83439bfb6c5b..c29a02678a6f 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -57,6 +57,7 @@ #define GICD_SPENDSGIR 0x0F20 #define GICD_CTLR_RWP (1U << 31) +#define GICD_CTLR_nASSGIreq (1U << 8) #define GICD_CTLR_DS (1U << 6) #define GICD_CTLR_ARE_NS (1U << 4) #define GICD_CTLR_ENABLE_G1A (1U << 1) @@ -90,6 +91,7 @@ #define GICD_TYPER_ESPIS(typer) \ (((typer) & GICD_TYPER_ESPI) ? GICD_TYPER_SPIS((typer) >> 27) : 0) +#define GICD_TYPER2_nASSGIcap (1U << 8) #define GICD_TYPER2_VIL (1U << 7) #define GICD_TYPER2_VID GENMASK(4, 0) -- cgit v1.2.3 From f3a059219bc718ccc3bf3ff894f089b7e9a93139 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 4 Mar 2020 20:33:10 +0000 Subject: irqchip/gic-v4.1: Ensure mutual exclusion between vPE affinity change and RD access Before GICv4.1, all operations would be serialized with the affinity changes by virtue of using the same ITS command queue. With v4.1, things change, as invalidations (and a number of other operations) are issued using the redistributor MMIO frame. We must thus make sure that these redistributor accesses cannot race against aginst the affinity change, or we may end-up talking to the wrong redistributor. To ensure this, we expand the irq_to_cpuid() helper to take a spinlock when the LPI is mapped to a vLPI (a new per-VPE lock) on each operation that requires mutual exclusion. Signed-off-by: Marc Zyngier Reviewed-by: Zenghui Yu Link: https://lore.kernel.org/r/20200304203330.4967-4-maz@kernel.org --- drivers/irqchip/irq-gic-v3-its.c | 56 ++++++++++++++++++++++++++++++++------ include/linux/irqchip/arm-gic-v4.h | 5 ++++ 2 files changed, 53 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index da883a691028..1af713990123 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -239,15 +239,41 @@ static struct its_vlpi_map *get_vlpi_map(struct irq_data *d) return NULL; } -static int irq_to_cpuid(struct irq_data *d) +static int vpe_to_cpuid_lock(struct its_vpe *vpe, unsigned long *flags) +{ + raw_spin_lock_irqsave(&vpe->vpe_lock, *flags); + return vpe->col_idx; +} + +static void vpe_to_cpuid_unlock(struct its_vpe *vpe, unsigned long flags) +{ + raw_spin_unlock_irqrestore(&vpe->vpe_lock, flags); +} + +static int irq_to_cpuid_lock(struct irq_data *d, unsigned long *flags) { - struct its_device *its_dev = irq_data_get_irq_chip_data(d); struct its_vlpi_map *map = get_vlpi_map(d); + int cpu; - if (map) - return map->vpe->col_idx; + if (map) { + cpu = vpe_to_cpuid_lock(map->vpe, flags); + } else { + /* Physical LPIs are already locked via the irq_desc lock */ + struct its_device *its_dev = irq_data_get_irq_chip_data(d); + cpu = its_dev->event_map.col_map[its_get_event_id(d)]; + /* Keep GCC quiet... */ + *flags = 0; + } - return its_dev->event_map.col_map[its_get_event_id(d)]; + return cpu; +} + +static void irq_to_cpuid_unlock(struct irq_data *d, unsigned long flags) +{ + struct its_vlpi_map *map = get_vlpi_map(d); + + if (map) + vpe_to_cpuid_unlock(map->vpe, flags); } static struct its_collection *valid_col(struct its_collection *col) @@ -1329,7 +1355,9 @@ static void direct_lpi_inv(struct irq_data *d) { struct its_vlpi_map *map = get_vlpi_map(d); void __iomem *rdbase; + unsigned long flags; u64 val; + int cpu; if (map) { struct its_device *its_dev = irq_data_get_irq_chip_data(d); @@ -1344,10 +1372,12 @@ static void direct_lpi_inv(struct irq_data *d) } /* Target the redistributor this LPI is currently routed to */ - rdbase = per_cpu_ptr(gic_rdists->rdist, irq_to_cpuid(d))->rd_base; + cpu = irq_to_cpuid_lock(d, &flags); + rdbase = per_cpu_ptr(gic_rdists->rdist, cpu)->rd_base; gic_write_lpir(val, rdbase + GICR_INVLPIR); wait_for_syncr(rdbase); + irq_to_cpuid_unlock(d, flags); } static void lpi_update_config(struct irq_data *d, u8 clr, u8 set) @@ -3486,17 +3516,25 @@ static int its_vpe_set_affinity(struct irq_data *d, { struct its_vpe *vpe = irq_data_get_irq_chip_data(d); int from, cpu = cpumask_first(mask_val); + unsigned long flags; /* * Changing affinity is mega expensive, so let's be as lazy as * we can and only do it if we really have to. Also, if mapped * into the proxy device, we need to move the doorbell * interrupt to its new location. + * + * Another thing is that changing the affinity of a vPE affects + * *other interrupts* such as all the vLPIs that are routed to + * this vPE. This means that the irq_desc lock is not enough to + * protect us, and that we must ensure nobody samples vpe->col_idx + * during the update, hence the lock below which must also be + * taken on any vLPI handling path that evaluates vpe->col_idx. */ - if (vpe->col_idx == cpu) + from = vpe_to_cpuid_lock(vpe, &flags); + if (from == cpu) goto out; - from = vpe->col_idx; vpe->col_idx = cpu; /* @@ -3512,6 +3550,7 @@ static int its_vpe_set_affinity(struct irq_data *d, out: irq_data_update_effective_affinity(d, cpumask_of(cpu)); + vpe_to_cpuid_unlock(vpe, flags); return IRQ_SET_MASK_OK_DONE; } @@ -3855,6 +3894,7 @@ static int its_vpe_init(struct its_vpe *vpe) return -ENOMEM; } + raw_spin_lock_init(&vpe->vpe_lock); vpe->vpe_id = vpe_id; vpe->vpt_page = vpt_page; if (gic_rdists->has_rvpeid) diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index d9c34968467a..439963f4c66a 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -53,6 +53,11 @@ struct its_vpe { }; }; + /* + * Ensures mutual exclusion between affinity setting of the + * vPE and vLPI operations using vpe->col_idx. + */ + raw_spinlock_t vpe_lock; /* * This collection ID is used to indirect the target * redistributor for this VPE. The ID itself isn't involved in -- cgit v1.2.3 From 9058a4e980648e7d068a7f7726a8ea4c67d0e88a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 4 Mar 2020 20:33:12 +0000 Subject: irqchip/gic-v4.1: Ensure mutual exclusion betwen invalidations on the same RD The GICv4.1 spec says that it is CONTRAINED UNPREDICTABLE to write to any of the GICR_INV{LPI,ALL}R registers if GICR_SYNCR.Busy == 1. To deal with it, we must ensure that only a single invalidation can happen at a time for a given redistributor. Add a per-RD lock to that effect and take it around the invalidation/syncr-read to deal with this. Signed-off-by: Marc Zyngier Reviewed-by: Zenghui Yu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200304203330.4967-6-maz@kernel.org --- drivers/irqchip/irq-gic-v3-its.c | 6 ++++++ drivers/irqchip/irq-gic-v3.c | 1 + include/linux/irqchip/arm-gic-v3.h | 1 + 3 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index c84370245bea..fc5788584df7 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1373,10 +1373,12 @@ static void direct_lpi_inv(struct irq_data *d) /* Target the redistributor this LPI is currently routed to */ cpu = irq_to_cpuid_lock(d, &flags); + raw_spin_lock(&gic_data_rdist_cpu(cpu)->rd_lock); rdbase = per_cpu_ptr(gic_rdists->rdist, cpu)->rd_base; gic_write_lpir(val, rdbase + GICR_INVLPIR); wait_for_syncr(rdbase); + raw_spin_unlock(&gic_data_rdist_cpu(cpu)->rd_lock); irq_to_cpuid_unlock(d, flags); } @@ -3662,9 +3664,11 @@ static void its_vpe_send_inv(struct irq_data *d) void __iomem *rdbase; /* Target the redistributor this VPE is currently known on */ + raw_spin_lock(&gic_data_rdist_cpu(vpe->col_idx)->rd_lock); rdbase = per_cpu_ptr(gic_rdists->rdist, vpe->col_idx)->rd_base; gic_write_lpir(d->parent_data->hwirq, rdbase + GICR_INVLPIR); wait_for_syncr(rdbase); + raw_spin_unlock(&gic_data_rdist_cpu(vpe->col_idx)->rd_lock); } else { its_vpe_send_cmd(vpe, its_send_inv); } @@ -3825,10 +3829,12 @@ static void its_vpe_4_1_invall(struct its_vpe *vpe) val |= FIELD_PREP(GICR_INVALLR_VPEID, vpe->vpe_id); /* Target the redistributor this vPE is currently known on */ + raw_spin_lock(&gic_data_rdist_cpu(vpe->col_idx)->rd_lock); rdbase = per_cpu_ptr(gic_rdists->rdist, vpe->col_idx)->rd_base; gic_write_lpir(val, rdbase + GICR_INVALLR); wait_for_syncr(rdbase); + raw_spin_unlock(&gic_data_rdist_cpu(vpe->col_idx)->rd_lock); } static int its_vpe_4_1_set_vcpu_affinity(struct irq_data *d, void *vcpu_info) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index b6b0f86584d6..0f716c2647fd 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -834,6 +834,7 @@ static int __gic_populate_rdist(struct redist_region *region, void __iomem *ptr) typer = gic_read_typer(ptr + GICR_TYPER); if ((typer >> 32) == aff) { u64 offset = ptr - region->redist_base; + raw_spin_lock_init(&gic_data_rdist()->rd_lock); gic_data_rdist_rd_base() = ptr; gic_data_rdist()->phys_base = region->phys_base + offset; diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index c29a02678a6f..b28acfa71f82 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -652,6 +652,7 @@ struct rdists { struct { + raw_spinlock_t rd_lock; void __iomem *rd_base; struct page *pend_page; phys_addr_t phys_base; -- cgit v1.2.3 From 3c40706d05fdea421e991da50e72a29d41131a66 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 4 Mar 2020 20:33:13 +0000 Subject: irqchip/gic-v4.1: Advertise support v4.1 to KVM Tell KVM that we support v4.1. Nothing uses this information so far. Signed-off-by: Marc Zyngier Reviewed-by: Zenghui Yu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200304203330.4967-7-maz@kernel.org --- drivers/irqchip/irq-gic-v3-its.c | 9 ++++++++- drivers/irqchip/irq-gic-v3.c | 2 ++ include/linux/irqchip/arm-gic-common.h | 2 ++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index fc5788584df7..bcc1a0957cda 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -4870,6 +4870,7 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists, struct device_node *of_node; struct its_node *its; bool has_v4 = false; + bool has_v4_1 = false; int err; gic_rdists = rdists; @@ -4890,8 +4891,14 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists, if (err) return err; - list_for_each_entry(its, &its_nodes, entry) + list_for_each_entry(its, &its_nodes, entry) { has_v4 |= is_v4(its); + has_v4_1 |= is_v4_1(its); + } + + /* Don't bother with inconsistent systems */ + if (WARN_ON(!has_v4_1 && rdists->has_rvpeid)) + rdists->has_rvpeid = false; if (has_v4 & rdists->has_vlpis) { if (its_init_vpe_domain() || diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index 0f716c2647fd..8c5de59c5213 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -1764,6 +1764,7 @@ static void __init gic_of_setup_kvm_info(struct device_node *node) gic_v3_kvm_info.vcpu = r; gic_v3_kvm_info.has_v4 = gic_data.rdists.has_vlpis; + gic_v3_kvm_info.has_v4_1 = gic_data.rdists.has_rvpeid; gic_set_kvm_info(&gic_v3_kvm_info); } @@ -2079,6 +2080,7 @@ static void __init gic_acpi_setup_kvm_info(void) } gic_v3_kvm_info.has_v4 = gic_data.rdists.has_vlpis; + gic_v3_kvm_info.has_v4_1 = gic_data.rdists.has_rvpeid; gic_set_kvm_info(&gic_v3_kvm_info); } diff --git a/include/linux/irqchip/arm-gic-common.h b/include/linux/irqchip/arm-gic-common.h index b9850f5f1906..fa8c0455c352 100644 --- a/include/linux/irqchip/arm-gic-common.h +++ b/include/linux/irqchip/arm-gic-common.h @@ -32,6 +32,8 @@ struct gic_kvm_info { struct resource vctrl; /* vlpi support */ bool has_v4; + /* rvpeid support */ + bool has_v4_1; }; const struct gic_kvm_info *gic_get_kvm_info(void); -- cgit v1.2.3 From 166cba71818cd49d7d815fdc6f97c63395e94fc5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 4 Mar 2020 20:33:15 +0000 Subject: irqchip/gic-v4.1: Plumb skeletal VSGI irqchip Since GICv4.1 has the capability to inject 16 SGIs into each VPE, and that I'm keen not to invent too many specific interfaces to manipulate these interrupts, let's pretend that each of these SGIs is an actual Linux interrupt. For that matter, let's introduce a minimal irqchip and irqdomain setup that will get fleshed up in the following patches. Signed-off-by: Marc Zyngier Reviewed-by: Zenghui Yu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200304203330.4967-9-maz@kernel.org --- drivers/irqchip/irq-gic-v3-its.c | 75 +++++++++++++++++++++++++++++++++++++- drivers/irqchip/irq-gic-v4.c | 8 +++- include/linux/irqchip/arm-gic-v4.h | 9 ++++- 3 files changed, 88 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 54d6fdf7a28e..c9c812191796 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -3870,6 +3870,72 @@ static struct irq_chip its_vpe_4_1_irq_chip = { .irq_set_vcpu_affinity = its_vpe_4_1_set_vcpu_affinity, }; +static int its_sgi_set_affinity(struct irq_data *d, + const struct cpumask *mask_val, + bool force) +{ + /* + * There is no notion of affinity for virtual SGIs, at least + * not on the host (since they can only be targetting a vPE). + * Tell the kernel we've done whatever it asked for. + */ + return IRQ_SET_MASK_OK; +} + +static struct irq_chip its_sgi_irq_chip = { + .name = "GICv4.1-sgi", + .irq_set_affinity = its_sgi_set_affinity, +}; + +static int its_sgi_irq_domain_alloc(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs, + void *args) +{ + struct its_vpe *vpe = args; + int i; + + /* Yes, we do want 16 SGIs */ + WARN_ON(nr_irqs != 16); + + for (i = 0; i < 16; i++) { + vpe->sgi_config[i].priority = 0; + vpe->sgi_config[i].enabled = false; + vpe->sgi_config[i].group = false; + + irq_domain_set_hwirq_and_chip(domain, virq + i, i, + &its_sgi_irq_chip, vpe); + irq_set_status_flags(virq + i, IRQ_DISABLE_UNLAZY); + } + + return 0; +} + +static void its_sgi_irq_domain_free(struct irq_domain *domain, + unsigned int virq, + unsigned int nr_irqs) +{ + /* Nothing to do */ +} + +static int its_sgi_irq_domain_activate(struct irq_domain *domain, + struct irq_data *d, bool reserve) +{ + return 0; +} + +static void its_sgi_irq_domain_deactivate(struct irq_domain *domain, + struct irq_data *d) +{ + /* Nothing to do */ +} + +static const struct irq_domain_ops its_sgi_domain_ops = { + .alloc = its_sgi_irq_domain_alloc, + .free = its_sgi_irq_domain_free, + .activate = its_sgi_irq_domain_activate, + .deactivate = its_sgi_irq_domain_deactivate, +}; + static int its_vpe_id_alloc(void) { return ida_simple_get(&its_vpeid_ida, 0, ITS_MAX_VPEID, GFP_KERNEL); @@ -4912,8 +4978,15 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists, rdists->has_rvpeid = false; if (has_v4 & rdists->has_vlpis) { + const struct irq_domain_ops *sgi_ops; + + if (has_v4_1) + sgi_ops = &its_sgi_domain_ops; + else + sgi_ops = NULL; + if (its_init_vpe_domain() || - its_init_v4(parent_domain, &its_vpe_domain_ops)) { + its_init_v4(parent_domain, &its_vpe_domain_ops, sgi_ops)) { rdists->has_vlpis = false; pr_err("ITS: Disabling GICv4 support\n"); } diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c index 45969927cc81..c01910d53f9e 100644 --- a/drivers/irqchip/irq-gic-v4.c +++ b/drivers/irqchip/irq-gic-v4.c @@ -85,6 +85,7 @@ static struct irq_domain *gic_domain; static const struct irq_domain_ops *vpe_domain_ops; +static const struct irq_domain_ops *sgi_domain_ops; int its_alloc_vcpu_irqs(struct its_vm *vm) { @@ -216,12 +217,15 @@ int its_prop_update_vlpi(int irq, u8 config, bool inv) return irq_set_vcpu_affinity(irq, &info); } -int its_init_v4(struct irq_domain *domain, const struct irq_domain_ops *ops) +int its_init_v4(struct irq_domain *domain, + const struct irq_domain_ops *vpe_ops, + const struct irq_domain_ops *sgi_ops) { if (domain) { pr_info("ITS: Enabling GICv4 support\n"); gic_domain = domain; - vpe_domain_ops = ops; + vpe_domain_ops = vpe_ops; + sgi_domain_ops = sgi_ops; return 0; } diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index 439963f4c66a..44e8c19e3d56 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -49,6 +49,11 @@ struct its_vpe { }; /* GICv4.1 implementations */ struct { + struct { + u8 priority; + bool enabled; + bool group; + } sgi_config[16]; atomic_t vmapp_count; }; }; @@ -123,6 +128,8 @@ int its_unmap_vlpi(int irq); int its_prop_update_vlpi(int irq, u8 config, bool inv); struct irq_domain_ops; -int its_init_v4(struct irq_domain *domain, const struct irq_domain_ops *ops); +int its_init_v4(struct irq_domain *domain, + const struct irq_domain_ops *vpe_ops, + const struct irq_domain_ops *sgi_ops); #endif -- cgit v1.2.3 From e252cf8a34d92adf41124cb59b19b49d25395548 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 4 Mar 2020 20:33:16 +0000 Subject: irqchip/gic-v4.1: Add initial SGI configuration The GICv4.1 ITS has yet another new command (VSGI) which allows a VPE-targeted SGI to be configured (or have its pending state cleared). Add support for this command and plumb it into the activate irqdomain callback so that it is ready to be used. Signed-off-by: Marc Zyngier Reviewed-by: Zenghui Yu Link: https://lore.kernel.org/r/20200304203330.4967-10-maz@kernel.org --- drivers/irqchip/irq-gic-v3-its.c | 92 +++++++++++++++++++++++++++++++++++++- include/linux/irqchip/arm-gic-v3.h | 3 +- 2 files changed, 93 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index c9c812191796..28c856a148f2 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -380,6 +380,15 @@ struct its_cmd_desc { struct { struct its_vpe *vpe; } its_invdb_cmd; + + struct { + struct its_vpe *vpe; + u8 sgi; + u8 priority; + bool enable; + bool group; + bool clear; + } its_vsgi_cmd; }; }; @@ -528,6 +537,31 @@ static void its_encode_db(struct its_cmd_block *cmd, bool db) its_mask_encode(&cmd->raw_cmd[2], db, 63, 63); } +static void its_encode_sgi_intid(struct its_cmd_block *cmd, u8 sgi) +{ + its_mask_encode(&cmd->raw_cmd[0], sgi, 35, 32); +} + +static void its_encode_sgi_priority(struct its_cmd_block *cmd, u8 prio) +{ + its_mask_encode(&cmd->raw_cmd[0], prio >> 4, 23, 20); +} + +static void its_encode_sgi_group(struct its_cmd_block *cmd, bool grp) +{ + its_mask_encode(&cmd->raw_cmd[0], grp, 10, 10); +} + +static void its_encode_sgi_clear(struct its_cmd_block *cmd, bool clr) +{ + its_mask_encode(&cmd->raw_cmd[0], clr, 9, 9); +} + +static void its_encode_sgi_enable(struct its_cmd_block *cmd, bool en) +{ + its_mask_encode(&cmd->raw_cmd[0], en, 8, 8); +} + static inline void its_fixup_cmd(struct its_cmd_block *cmd) { /* Let's fixup BE commands */ @@ -893,6 +927,26 @@ static struct its_vpe *its_build_invdb_cmd(struct its_node *its, return valid_vpe(its, desc->its_invdb_cmd.vpe); } +static struct its_vpe *its_build_vsgi_cmd(struct its_node *its, + struct its_cmd_block *cmd, + struct its_cmd_desc *desc) +{ + if (WARN_ON(!is_v4_1(its))) + return NULL; + + its_encode_cmd(cmd, GITS_CMD_VSGI); + its_encode_vpeid(cmd, desc->its_vsgi_cmd.vpe->vpe_id); + its_encode_sgi_intid(cmd, desc->its_vsgi_cmd.sgi); + its_encode_sgi_priority(cmd, desc->its_vsgi_cmd.priority); + its_encode_sgi_group(cmd, desc->its_vsgi_cmd.group); + its_encode_sgi_clear(cmd, desc->its_vsgi_cmd.clear); + its_encode_sgi_enable(cmd, desc->its_vsgi_cmd.enable); + + its_fixup_cmd(cmd); + + return valid_vpe(its, desc->its_vsgi_cmd.vpe); +} + static u64 its_cmd_ptr_to_offset(struct its_node *its, struct its_cmd_block *ptr) { @@ -3870,6 +3924,26 @@ static struct irq_chip its_vpe_4_1_irq_chip = { .irq_set_vcpu_affinity = its_vpe_4_1_set_vcpu_affinity, }; +static void its_configure_sgi(struct irq_data *d, bool clear) +{ + struct its_vpe *vpe = irq_data_get_irq_chip_data(d); + struct its_cmd_desc desc; + + desc.its_vsgi_cmd.vpe = vpe; + desc.its_vsgi_cmd.sgi = d->hwirq; + desc.its_vsgi_cmd.priority = vpe->sgi_config[d->hwirq].priority; + desc.its_vsgi_cmd.enable = vpe->sgi_config[d->hwirq].enabled; + desc.its_vsgi_cmd.group = vpe->sgi_config[d->hwirq].group; + desc.its_vsgi_cmd.clear = clear; + + /* + * GICv4.1 allows us to send VSGI commands to any ITS as long as the + * destination VPE is mapped there. Since we map them eagerly at + * activation time, we're pretty sure the first GICv4.1 ITS will do. + */ + its_send_single_vcommand(find_4_1_its(), its_build_vsgi_cmd, &desc); +} + static int its_sgi_set_affinity(struct irq_data *d, const struct cpumask *mask_val, bool force) @@ -3920,13 +3994,29 @@ static void its_sgi_irq_domain_free(struct irq_domain *domain, static int its_sgi_irq_domain_activate(struct irq_domain *domain, struct irq_data *d, bool reserve) { + /* Write out the initial SGI configuration */ + its_configure_sgi(d, false); return 0; } static void its_sgi_irq_domain_deactivate(struct irq_domain *domain, struct irq_data *d) { - /* Nothing to do */ + struct its_vpe *vpe = irq_data_get_irq_chip_data(d); + + /* + * The VSGI command is awkward: + * + * - To change the configuration, CLEAR must be set to false, + * leaving the pending bit unchanged. + * - To clear the pending bit, CLEAR must be set to true, leaving + * the configuration unchanged. + * + * You just can't do both at once, hence the two commands below. + */ + vpe->sgi_config[d->hwirq].enabled = false; + its_configure_sgi(d, false); + its_configure_sgi(d, true); } static const struct irq_domain_ops its_sgi_domain_ops = { diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index b28acfa71f82..fd3be49ac9a5 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -502,8 +502,9 @@ #define GITS_CMD_VMAPTI GITS_CMD_GICv4(GITS_CMD_MAPTI) #define GITS_CMD_VMOVI GITS_CMD_GICv4(GITS_CMD_MOVI) #define GITS_CMD_VSYNC GITS_CMD_GICv4(GITS_CMD_SYNC) -/* VMOVP and INVDB are the odd ones, as they dont have a physical counterpart */ +/* VMOVP, VSGI and INVDB are the odd ones, as they dont have a physical counterpart */ #define GITS_CMD_VMOVP GITS_CMD_GICv4(2) +#define GITS_CMD_VSGI GITS_CMD_GICv4(3) #define GITS_CMD_INVDB GITS_CMD_GICv4(0xe) /* -- cgit v1.2.3 From 7017ff0ee1de9d45fafee88a4e7890cce92f482e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 4 Mar 2020 20:33:18 +0000 Subject: irqchip/gic-v4.1: Plumb get/set_irqchip_state SGI callbacks To implement the get/set_irqchip_state callbacks (limited to the PENDING state), we have to use a particular set of hacks: - Reading the pending state is done by using a pair of new redistributor registers (GICR_VSGIR, GICR_VSGIPENDR), which allow the 16 interrupts state to be retrieved. - Setting the pending state is done by generating it as we'd otherwise do for a guest (writing to GITS_SGIR). - Clearing the pending state is done by emitting a VSGI command with the "clear" bit set. This requires some interesting locking though: - When talking to the redistributor, we must make sure that the VPE affinity doesn't change, hence taking the VPE lock. - At the same time, we must ensure that nobody accesses the same redistributor's GICR_VSGIR registers for a different VPE, which would corrupt the reading of the pending bits. We thus take the per-RD spinlock. Much fun. Signed-off-by: Marc Zyngier Reviewed-by: Zenghui Yu Link: https://lore.kernel.org/r/20200304203330.4967-12-maz@kernel.org --- drivers/irqchip/irq-gic-v3-its.c | 77 ++++++++++++++++++++++++++++++++++++++ include/linux/irqchip/arm-gic-v3.h | 14 +++++++ 2 files changed, 91 insertions(+) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index bc6666aed1cb..c614f0c19807 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -3972,11 +3972,88 @@ static int its_sgi_set_affinity(struct irq_data *d, return IRQ_SET_MASK_OK; } +static int its_sgi_set_irqchip_state(struct irq_data *d, + enum irqchip_irq_state which, + bool state) +{ + if (which != IRQCHIP_STATE_PENDING) + return -EINVAL; + + if (state) { + struct its_vpe *vpe = irq_data_get_irq_chip_data(d); + struct its_node *its = find_4_1_its(); + u64 val; + + val = FIELD_PREP(GITS_SGIR_VPEID, vpe->vpe_id); + val |= FIELD_PREP(GITS_SGIR_VINTID, d->hwirq); + writeq_relaxed(val, its->sgir_base + GITS_SGIR - SZ_128K); + } else { + its_configure_sgi(d, true); + } + + return 0; +} + +static int its_sgi_get_irqchip_state(struct irq_data *d, + enum irqchip_irq_state which, bool *val) +{ + struct its_vpe *vpe = irq_data_get_irq_chip_data(d); + void __iomem *base; + unsigned long flags; + u32 count = 1000000; /* 1s! */ + u32 status; + int cpu; + + if (which != IRQCHIP_STATE_PENDING) + return -EINVAL; + + /* + * Locking galore! We can race against two different events: + * + * - Concurent vPE affinity change: we must make sure it cannot + * happen, or we'll talk to the wrong redistributor. This is + * identical to what happens with vLPIs. + * + * - Concurrent VSGIPENDR access: As it involves accessing two + * MMIO registers, this must be made atomic one way or another. + */ + cpu = vpe_to_cpuid_lock(vpe, &flags); + raw_spin_lock(&gic_data_rdist_cpu(cpu)->rd_lock); + base = gic_data_rdist_cpu(cpu)->rd_base + SZ_128K; + writel_relaxed(vpe->vpe_id, base + GICR_VSGIR); + do { + status = readl_relaxed(base + GICR_VSGIPENDR); + if (!(status & GICR_VSGIPENDR_BUSY)) + goto out; + + count--; + if (!count) { + pr_err_ratelimited("Unable to get SGI status\n"); + goto out; + } + cpu_relax(); + udelay(1); + } while (count); + +out: + raw_spin_unlock(&gic_data_rdist_cpu(cpu)->rd_lock); + vpe_to_cpuid_unlock(vpe, flags); + + if (!count) + return -ENXIO; + + *val = !!(status & (1 << d->hwirq)); + + return 0; +} + static struct irq_chip its_sgi_irq_chip = { .name = "GICv4.1-sgi", .irq_mask = its_sgi_mask_irq, .irq_unmask = its_sgi_unmask_irq, .irq_set_affinity = its_sgi_set_affinity, + .irq_set_irqchip_state = its_sgi_set_irqchip_state, + .irq_get_irqchip_state = its_sgi_get_irqchip_state, }; static int its_sgi_irq_domain_alloc(struct irq_domain *domain, diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index fd3be49ac9a5..590cdbeba9d5 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -345,6 +345,15 @@ #define GICR_VPENDBASER_4_1_VGRP1EN (1ULL << 58) #define GICR_VPENDBASER_4_1_VPEID GENMASK_ULL(15, 0) +#define GICR_VSGIR 0x0080 + +#define GICR_VSGIR_VPEID GENMASK(15, 0) + +#define GICR_VSGIPENDR 0x0088 + +#define GICR_VSGIPENDR_BUSY (1U << 31) +#define GICR_VSGIPENDR_PENDING GENMASK(15, 0) + /* * ITS registers, offsets from ITS_base */ @@ -368,6 +377,11 @@ #define GITS_TRANSLATER 0x10040 +#define GITS_SGIR 0x20020 + +#define GITS_SGIR_VPEID GENMASK_ULL(47, 32) +#define GITS_SGIR_VINTID GENMASK_ULL(3, 0) + #define GITS_CTLR_ENABLE (1U << 0) #define GITS_CTLR_ImDe (1U << 1) #define GITS_CTLR_ITS_NUMBER_SHIFT 4 -- cgit v1.2.3 From 05d32df13c6b3c0850b68928048536e9a736d520 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 4 Mar 2020 20:33:19 +0000 Subject: irqchip/gic-v4.1: Plumb set_vcpu_affinity SGI callbacks Just like for vLPIs, there is some configuration information that cannot be directly communicated through the normal irqchip API, and we have to use our good old friend set_vcpu_affinity as a side-band communication mechanism. This is used to configure group and priority for a given vSGI. Signed-off-by: Marc Zyngier Reviewed-by: Zenghui Yu Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20200304203330.4967-13-maz@kernel.org --- drivers/irqchip/irq-gic-v3-its.c | 18 ++++++++++++++++++ include/linux/irqchip/arm-gic-v4.h | 5 +++++ 2 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index c614f0c19807..aae53326d26f 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -4047,6 +4047,23 @@ out: return 0; } +static int its_sgi_set_vcpu_affinity(struct irq_data *d, void *vcpu_info) +{ + struct its_vpe *vpe = irq_data_get_irq_chip_data(d); + struct its_cmd_info *info = vcpu_info; + + switch (info->cmd_type) { + case PROP_UPDATE_VSGI: + vpe->sgi_config[d->hwirq].priority = info->priority; + vpe->sgi_config[d->hwirq].group = info->group; + its_configure_sgi(d, false); + return 0; + + default: + return -EINVAL; + } +} + static struct irq_chip its_sgi_irq_chip = { .name = "GICv4.1-sgi", .irq_mask = its_sgi_mask_irq, @@ -4054,6 +4071,7 @@ static struct irq_chip its_sgi_irq_chip = { .irq_set_affinity = its_sgi_set_affinity, .irq_set_irqchip_state = its_sgi_set_irqchip_state, .irq_get_irqchip_state = its_sgi_get_irqchip_state, + .irq_set_vcpu_affinity = its_sgi_set_vcpu_affinity, }; static int its_sgi_irq_domain_alloc(struct irq_domain *domain, diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index 44e8c19e3d56..1b34100e3536 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -103,6 +103,7 @@ enum its_vcpu_info_cmd_type { SCHEDULE_VPE, DESCHEDULE_VPE, INVALL_VPE, + PROP_UPDATE_VSGI, }; struct its_cmd_info { @@ -115,6 +116,10 @@ struct its_cmd_info { bool g0en; bool g1en; }; + struct { + u8 priority; + bool group; + }; }; }; -- cgit v1.2.3 From ae699ad348cdcd416cbf28e8a02fc468780161f7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 4 Mar 2020 20:33:20 +0000 Subject: irqchip/gic-v4.1: Move doorbell management to the GICv4 abstraction layer In order to hide some of the differences between v4.0 and v4.1, move the doorbell management out of the KVM code, and into the GICv4-specific layer. This allows the calling code to ask for the doorbell when blocking, and otherwise to leave the doorbell permanently disabled. This matches the v4.1 code perfectly, and only results in a minor refactoring of the v4.0 code. Signed-off-by: Marc Zyngier Reviewed-by: Zenghui Yu Link: https://lore.kernel.org/r/20200304203330.4967-14-maz@kernel.org --- drivers/irqchip/irq-gic-v4.c | 45 ++++++++++++++++++++++++++++++++++---- include/kvm/arm_vgic.h | 1 + include/linux/irqchip/arm-gic-v4.h | 3 ++- virt/kvm/arm/vgic/vgic-v3.c | 4 +++- virt/kvm/arm/vgic/vgic-v4.c | 34 ++++++++++++---------------- 5 files changed, 61 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c index c01910d53f9e..117ba6db023d 100644 --- a/drivers/irqchip/irq-gic-v4.c +++ b/drivers/irqchip/irq-gic-v4.c @@ -87,6 +87,11 @@ static struct irq_domain *gic_domain; static const struct irq_domain_ops *vpe_domain_ops; static const struct irq_domain_ops *sgi_domain_ops; +static bool has_v4_1(void) +{ + return !!sgi_domain_ops; +} + int its_alloc_vcpu_irqs(struct its_vm *vm) { int vpe_base_irq, i; @@ -139,18 +144,50 @@ static int its_send_vpe_cmd(struct its_vpe *vpe, struct its_cmd_info *info) return irq_set_vcpu_affinity(vpe->irq, info); } -int its_schedule_vpe(struct its_vpe *vpe, bool on) +int its_make_vpe_non_resident(struct its_vpe *vpe, bool db) { - struct its_cmd_info info; + struct irq_desc *desc = irq_to_desc(vpe->irq); + struct its_cmd_info info = { }; int ret; WARN_ON(preemptible()); - info.cmd_type = on ? SCHEDULE_VPE : DESCHEDULE_VPE; + info.cmd_type = DESCHEDULE_VPE; + if (has_v4_1()) { + /* GICv4.1 can directly deal with doorbells */ + info.req_db = db; + } else { + /* Undo the nested disable_irq() calls... */ + while (db && irqd_irq_disabled(&desc->irq_data)) + enable_irq(vpe->irq); + } + + ret = its_send_vpe_cmd(vpe, &info); + if (!ret) + vpe->resident = false; + + return ret; +} + +int its_make_vpe_resident(struct its_vpe *vpe, bool g0en, bool g1en) +{ + struct its_cmd_info info = { }; + int ret; + + WARN_ON(preemptible()); + + info.cmd_type = SCHEDULE_VPE; + if (has_v4_1()) { + info.g0en = g0en; + info.g1en = g1en; + } else { + /* Disabled the doorbell, as we're about to enter the guest */ + disable_irq_nosync(vpe->irq); + } ret = its_send_vpe_cmd(vpe, &info); if (!ret) - vpe->resident = on; + vpe->resident = true; return ret; } diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 9d53f545a3d5..63457908c9c4 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -70,6 +70,7 @@ struct vgic_global { /* Hardware has GICv4? */ bool has_gicv4; + bool has_gicv4_1; /* GIC system register CPU interface */ struct static_key_false gicv3_cpuif; diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index 1b34100e3536..34ed4b5754dd 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -125,7 +125,8 @@ struct its_cmd_info { int its_alloc_vcpu_irqs(struct its_vm *vm); void its_free_vcpu_irqs(struct its_vm *vm); -int its_schedule_vpe(struct its_vpe *vpe, bool on); +int its_make_vpe_resident(struct its_vpe *vpe, bool g0en, bool g1en); +int its_make_vpe_non_resident(struct its_vpe *vpe, bool db); int its_invall_vpe(struct its_vpe *vpe); int its_map_vlpi(int irq, struct its_vlpi_map *map); int its_get_vlpi(int irq, struct its_vlpi_map *map); diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index f45635a6f0ec..1bc09b523486 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -595,7 +595,9 @@ int vgic_v3_probe(const struct gic_kvm_info *info) /* GICv4 support? */ if (info->has_v4) { kvm_vgic_global_state.has_gicv4 = gicv4_enable; - kvm_info("GICv4 support %sabled\n", + kvm_vgic_global_state.has_gicv4_1 = info->has_v4_1 && gicv4_enable; + kvm_info("GICv4%s support %sabled\n", + kvm_vgic_global_state.has_gicv4_1 ? ".1" : "", gicv4_enable ? "en" : "dis"); } diff --git a/virt/kvm/arm/vgic/vgic-v4.c b/virt/kvm/arm/vgic/vgic-v4.c index 46f875589c47..1eb0f8c76219 100644 --- a/virt/kvm/arm/vgic/vgic-v4.c +++ b/virt/kvm/arm/vgic/vgic-v4.c @@ -67,10 +67,10 @@ * it. And if we've migrated our vcpu from one CPU to another, we must * tell the ITS (so that the messages reach the right redistributor). * This is done in two steps: first issue a irq_set_affinity() on the - * irq corresponding to the vcpu, then call its_schedule_vpe(). You - * must be in a non-preemptible context. On exit, another call to - * its_schedule_vpe() tells the redistributor that we're done with the - * vcpu. + * irq corresponding to the vcpu, then call its_make_vpe_resident(). + * You must be in a non-preemptible context. On exit, a call to + * its_make_vpe_non_resident() tells the redistributor that we're done + * with the vcpu. * * Finally, the doorbell handling: Each vcpu is allocated an interrupt * which will fire each time a VLPI is made pending whilst the vcpu is @@ -86,7 +86,8 @@ static irqreturn_t vgic_v4_doorbell_handler(int irq, void *info) struct kvm_vcpu *vcpu = info; /* We got the message, no need to fire again */ - if (!irqd_irq_disabled(&irq_to_desc(irq)->irq_data)) + if (!kvm_vgic_global_state.has_gicv4_1 && + !irqd_irq_disabled(&irq_to_desc(irq)->irq_data)) disable_irq_nosync(irq); vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last = true; @@ -199,19 +200,11 @@ void vgic_v4_teardown(struct kvm *kvm) int vgic_v4_put(struct kvm_vcpu *vcpu, bool need_db) { struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; - struct irq_desc *desc = irq_to_desc(vpe->irq); if (!vgic_supports_direct_msis(vcpu->kvm) || !vpe->resident) return 0; - /* - * If blocking, a doorbell is required. Undo the nested - * disable_irq() calls... - */ - while (need_db && irqd_irq_disabled(&desc->irq_data)) - enable_irq(vpe->irq); - - return its_schedule_vpe(vpe, false); + return its_make_vpe_non_resident(vpe, need_db); } int vgic_v4_load(struct kvm_vcpu *vcpu) @@ -232,18 +225,19 @@ int vgic_v4_load(struct kvm_vcpu *vcpu) if (err) return err; - /* Disabled the doorbell, as we're about to enter the guest */ - disable_irq_nosync(vpe->irq); - - err = its_schedule_vpe(vpe, true); + err = its_make_vpe_resident(vpe, false, vcpu->kvm->arch.vgic.enabled); if (err) return err; /* * Now that the VPE is resident, let's get rid of a potential - * doorbell interrupt that would still be pending. + * doorbell interrupt that would still be pending. This is a + * GICv4.0 only "feature"... */ - return irq_set_irqchip_state(vpe->irq, IRQCHIP_STATE_PENDING, false); + if (!kvm_vgic_global_state.has_gicv4_1) + err = irq_set_irqchip_state(vpe->irq, IRQCHIP_STATE_PENDING, false); + + return err; } static struct vgic_its *vgic_get_its(struct kvm *kvm, -- cgit v1.2.3 From 6d31b6ff985dbd144b2c4d519cf573b8f81865d9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 4 Mar 2020 20:33:21 +0000 Subject: irqchip/gic-v4.1: Add VSGI allocation/teardown Allocate per-VPE SGIs when initializing the GIC-specific part of the VPE data structure. Signed-off-by: Marc Zyngier Reviewed-by: Zenghui Yu Link: https://lore.kernel.org/r/20200304203330.4967-15-maz@kernel.org --- drivers/irqchip/irq-gic-v4.c | 68 +++++++++++++++++++++++++++++++++++++- include/linux/irqchip/arm-gic-v4.h | 2 ++ 2 files changed, 69 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c index 117ba6db023d..99b33f60ac63 100644 --- a/drivers/irqchip/irq-gic-v4.c +++ b/drivers/irqchip/irq-gic-v4.c @@ -92,6 +92,47 @@ static bool has_v4_1(void) return !!sgi_domain_ops; } +static int its_alloc_vcpu_sgis(struct its_vpe *vpe, int idx) +{ + char *name; + int sgi_base; + + if (!has_v4_1()) + return 0; + + name = kasprintf(GFP_KERNEL, "GICv4-sgi-%d", task_pid_nr(current)); + if (!name) + goto err; + + vpe->fwnode = irq_domain_alloc_named_id_fwnode(name, idx); + if (!vpe->fwnode) + goto err; + + kfree(name); + name = NULL; + + vpe->sgi_domain = irq_domain_create_linear(vpe->fwnode, 16, + sgi_domain_ops, vpe); + if (!vpe->sgi_domain) + goto err; + + sgi_base = __irq_domain_alloc_irqs(vpe->sgi_domain, -1, 16, + NUMA_NO_NODE, vpe, + false, NULL); + if (sgi_base <= 0) + goto err; + + return 0; + +err: + if (vpe->sgi_domain) + irq_domain_remove(vpe->sgi_domain); + if (vpe->fwnode) + irq_domain_free_fwnode(vpe->fwnode); + kfree(name); + return -ENOMEM; +} + int its_alloc_vcpu_irqs(struct its_vm *vm) { int vpe_base_irq, i; @@ -118,8 +159,13 @@ int its_alloc_vcpu_irqs(struct its_vm *vm) if (vpe_base_irq <= 0) goto err; - for (i = 0; i < vm->nr_vpes; i++) + for (i = 0; i < vm->nr_vpes; i++) { + int ret; vm->vpes[i]->irq = vpe_base_irq + i; + ret = its_alloc_vcpu_sgis(vm->vpes[i], i); + if (ret) + goto err; + } return 0; @@ -132,8 +178,28 @@ err: return -ENOMEM; } +static void its_free_sgi_irqs(struct its_vm *vm) +{ + int i; + + if (!has_v4_1()) + return; + + for (i = 0; i < vm->nr_vpes; i++) { + unsigned int irq = irq_find_mapping(vm->vpes[i]->sgi_domain, 0); + + if (WARN_ON(!irq)) + continue; + + irq_domain_free_irqs(irq, 16); + irq_domain_remove(vm->vpes[i]->sgi_domain); + irq_domain_free_fwnode(vm->vpes[i]->fwnode); + } +} + void its_free_vcpu_irqs(struct its_vm *vm) { + its_free_sgi_irqs(vm); irq_domain_free_irqs(vm->vpes[0]->irq, vm->nr_vpes); irq_domain_remove(vm->domain); irq_domain_free_fwnode(vm->fwnode); diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index 34ed4b5754dd..b120a01952fe 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -49,6 +49,8 @@ struct its_vpe { }; /* GICv4.1 implementations */ struct { + struct fwnode_handle *fwnode; + struct irq_domain *sgi_domain; struct { u8 priority; bool enabled; -- cgit v1.2.3 From d50676f5ce8481b98f9bbc1514b5d3f8747dd3c2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 4 Mar 2020 20:33:22 +0000 Subject: irqchip/gic-v4.1: Add VSGI property setup Add the SGI configuration entry point for KVM to use. Signed-off-by: Marc Zyngier Reviewed-by: Zenghui Yu Link: https://lore.kernel.org/r/20200304203330.4967-16-maz@kernel.org --- drivers/irqchip/irq-gic-v4.c | 13 +++++++++++++ include/linux/irqchip/arm-gic-v4.h | 1 + 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v4.c b/drivers/irqchip/irq-gic-v4.c index 99b33f60ac63..0c18714ae13e 100644 --- a/drivers/irqchip/irq-gic-v4.c +++ b/drivers/irqchip/irq-gic-v4.c @@ -320,6 +320,19 @@ int its_prop_update_vlpi(int irq, u8 config, bool inv) return irq_set_vcpu_affinity(irq, &info); } +int its_prop_update_vsgi(int irq, u8 priority, bool group) +{ + struct its_cmd_info info = { + .cmd_type = PROP_UPDATE_VSGI, + { + .priority = priority, + .group = group, + }, + }; + + return irq_set_vcpu_affinity(irq, &info); +} + int its_init_v4(struct irq_domain *domain, const struct irq_domain_ops *vpe_ops, const struct irq_domain_ops *sgi_ops) diff --git a/include/linux/irqchip/arm-gic-v4.h b/include/linux/irqchip/arm-gic-v4.h index b120a01952fe..6976b8331b60 100644 --- a/include/linux/irqchip/arm-gic-v4.h +++ b/include/linux/irqchip/arm-gic-v4.h @@ -134,6 +134,7 @@ int its_map_vlpi(int irq, struct its_vlpi_map *map); int its_get_vlpi(int irq, struct its_vlpi_map *map); int its_unmap_vlpi(int irq); int its_prop_update_vlpi(int irq, u8 config, bool inv); +int its_prop_update_vsgi(int irq, u8 priority, bool group); struct irq_domain_ops; int its_init_v4(struct irq_domain *domain, -- cgit v1.2.3