From 9fad9d560af5c654bb38e0b07ee54a4e9acdc5cd Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Wed, 8 May 2024 17:22:51 +0000 Subject: scsi: sr: Fix unintentional arithmetic wraparound Running syzkaller with the newly reintroduced signed integer overflow sanitizer produces this report: [ 65.194362] ------------[ cut here ]------------ [ 65.197752] UBSAN: signed-integer-overflow in ../drivers/scsi/sr_ioctl.c:436:9 [ 65.203607] -2147483648 * 177 cannot be represented in type 'int' [ 65.207911] CPU: 2 PID: 10416 Comm: syz-executor.1 Not tainted 6.8.0-rc2-00035-gb3ef86b5a957 #1 [ 65.213585] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 65.219923] Call Trace: [ 65.221556] [ 65.223029] dump_stack_lvl+0x93/0xd0 [ 65.225573] handle_overflow+0x171/0x1b0 [ 65.228219] sr_select_speed+0xeb/0xf0 [ 65.230786] ? __pm_runtime_resume+0xe6/0x130 [ 65.233606] sr_block_ioctl+0x15d/0x1d0 ... Historically, the signed integer overflow sanitizer did not work in the kernel due to its interaction with `-fwrapv` but this has since been changed [1] in the newest version of Clang. It was re-enabled in the kernel with Commit 557f8c582a9b ("ubsan: Reintroduce signed overflow sanitizer"). Firstly, let's change the type of "speed" to unsigned long as sr_select_speed()'s only caller passes in an unsigned long anyways. $ git grep '\.select_speed' | drivers/scsi/sr.c: .select_speed = sr_select_speed, ... | static int cdrom_ioctl_select_speed(struct cdrom_device_info *cdi, | unsigned long arg) | { | ... | return cdi->ops->select_speed(cdi, arg); | } Next, let's add an extra check to make sure we don't exceed 0xffff/177 (350) since 0xffff is the max speed. This has two benefits: 1) we deal with integer overflow before it happens and 2) we properly respect the max speed of 0xffff. There are some "magic" numbers here but I did not want to change more than what was necessary. Link: https://github.com/llvm/llvm-project/pull/82432 [1] Closes: https://github.com/KSPP/linux/issues/357 Cc: linux-hardening@vger.kernel.org Signed-off-by: Justin Stitt Link: https://lore.kernel.org/r/20240508-b4-b4-sio-sr_select_speed-v2-1-00b68f724290@google.com Reviewed-by: Kees Cook Signed-off-by: Martin K. Petersen --- include/linux/cdrom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 98c6fd0b39b6..fdfb61ccf55a 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -77,7 +77,7 @@ struct cdrom_device_ops { unsigned int clearing, int slot); int (*tray_move) (struct cdrom_device_info *, int); int (*lock_door) (struct cdrom_device_info *, int); - int (*select_speed) (struct cdrom_device_info *, int); + int (*select_speed) (struct cdrom_device_info *, unsigned long); int (*get_last_session) (struct cdrom_device_info *, struct cdrom_multisession *); int (*get_mcn) (struct cdrom_device_info *, -- cgit v1.2.3 From e61bcf42d290e73025bab38e0e55a5586c2d8ad5 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 15 Apr 2024 22:50:27 +0200 Subject: i2c: Remove I2C_CLASS_SPD Remove this class after all users have been gone. Signed-off-by: Heiner Kallweit Signed-off-by: Andi Shyti --- include/linux/i2c.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 5e6cd43a6dbd..9709537370ee 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -852,7 +852,6 @@ static inline void i2c_mark_adapter_resumed(struct i2c_adapter *adap) /* i2c adapter classes (bitmask) */ #define I2C_CLASS_HWMON (1<<0) /* lm_sensors, ... */ -#define I2C_CLASS_SPD (1<<7) /* Memory modules */ /* Warn users that the adapter doesn't support classes anymore */ #define I2C_CLASS_DEPRECATED (1<<8) -- cgit v1.2.3 From 79c137454815ba5554caa8eeb4ad5c94e96e45ce Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Tue, 21 May 2024 19:49:38 +0800 Subject: filemap: add helper mapping_max_folio_size() Add mapping_max_folio_size() to get the maximum folio size for this pagecache mapping. Fixes: 5d8edfb900d5 ("iomap: Copy larger chunks from userspace") Cc: stable@vger.kernel.org Reviewed-by: Darrick J. Wong Signed-off-by: Xu Yang Link: https://lore.kernel.org/r/20240521114939.2541461-1-xu.yang_2@nxp.com Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 3d69589c00a4..ee633712bba0 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -346,6 +346,19 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) m->gfp_mask = mask; } +/* + * There are some parts of the kernel which assume that PMD entries + * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, + * limit the maximum allocation order to PMD size. I'm not aware of any + * assumptions about maximum order if THP are disabled, but 8 seems like + * a good order (that's 1MB if you're using 4kB pages) + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER +#else +#define MAX_PAGECACHE_ORDER 8 +#endif + /** * mapping_set_large_folios() - Indicate the file supports large folios. * @mapping: The file. @@ -372,6 +385,14 @@ static inline bool mapping_large_folio_support(struct address_space *mapping) test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); } +/* Return the maximum folio size for this pagecache mapping, in bytes. */ +static inline size_t mapping_max_folio_size(struct address_space *mapping) +{ + if (mapping_large_folio_support(mapping)) + return PAGE_SIZE << MAX_PAGECACHE_ORDER; + return PAGE_SIZE; +} + static inline int filemap_nr_thps(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS @@ -530,19 +551,6 @@ static inline void *detach_page_private(struct page *page) return folio_detach_private(page_folio(page)); } -/* - * There are some parts of the kernel which assume that PMD entries - * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, - * limit the maximum allocation order to PMD size. I'm not aware of any - * assumptions about maximum order if THP are disabled, but 8 seems like - * a good order (that's 1MB if you're using 4kB pages) - */ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER -#else -#define MAX_PAGECACHE_ORDER 8 -#endif - #ifdef CONFIG_NUMA struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order); #else -- cgit v1.2.3 From 1b9f86c6d53245dab087f1b2c05727b5982142ff Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 22 May 2024 22:26:54 +0300 Subject: net/mlx5: Fix MTMP register capability offset in MCAM register The MTMP register (0x900a) capability offset is off-by-one, move it to the right place. Fixes: 1f507e80c700 ("net/mlx5: Expose NIC temperature via hardware monitoring kernel API") Signed-off-by: Gal Pressman Reviewed-by: Cosmin Ratiu Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/mlx5/mlx5_ifc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f468763478ae..5df52e15f7d6 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10308,9 +10308,9 @@ struct mlx5_ifc_mcam_access_reg_bits { u8 mfrl[0x1]; u8 regs_39_to_32[0x8]; - u8 regs_31_to_10[0x16]; + u8 regs_31_to_11[0x15]; u8 mtmp[0x1]; - u8 regs_8_to_0[0x9]; + u8 regs_9_to_0[0xa]; }; struct mlx5_ifc_mcam_access_reg_bits1 { -- cgit v1.2.3 From 3998d184267dfcff858aaa84d3de17429253629d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 24 May 2024 18:36:17 +0200 Subject: netkit: Fix pkt_type override upon netkit pass verdict When running Cilium connectivity test suite with netkit in L2 mode, we found that compared to tcx a few tests were failing which pushed traffic into an L7 proxy sitting in host namespace. The problem in particular is around the invocation of eth_type_trans() in netkit. In case of tcx, this is run before the tcx ingress is triggered inside host namespace and thus if the BPF program uses the bpf_skb_change_type() helper the newly set type is retained. However, in case of netkit, the late eth_type_trans() invocation overrides the earlier decision from the BPF program which eventually leads to the test failure. Instead of eth_type_trans(), split out the relevant parts, meaning, reset of mac header and call to eth_skb_pkt_type() before the BPF program is run in order to have the same behavior as with tcx, and refactor a small helper called eth_skb_pull_mac() which is run in case it's passed up the stack where the mac header must be pulled. With this all connectivity tests pass. Fixes: 35dfaad7188c ("netkit, bpf: Add bpf programmable net device") Signed-off-by: Daniel Borkmann Acked-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20240524163619.26001-2-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- drivers/net/netkit.c | 4 +++- include/linux/etherdevice.h | 8 ++++++++ net/ethernet/eth.c | 4 +--- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 272894053e2c..16789cd446e9 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -55,6 +55,7 @@ static void netkit_prep_forward(struct sk_buff *skb, bool xnet) skb_scrub_packet(skb, xnet); skb->priority = 0; nf_skip_egress(skb, true); + skb_reset_mac_header(skb); } static struct netkit *netkit_priv(const struct net_device *dev) @@ -78,6 +79,7 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) skb_orphan_frags(skb, GFP_ATOMIC))) goto drop; netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer))); + eth_skb_pkt_type(skb, peer); skb->dev = peer; entry = rcu_dereference(nk->active); if (entry) @@ -85,7 +87,7 @@ static netdev_tx_t netkit_xmit(struct sk_buff *skb, struct net_device *dev) switch (ret) { case NETKIT_NEXT: case NETKIT_PASS: - skb->protocol = eth_type_trans(skb, skb->dev); + eth_skb_pull_mac(skb); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) { dev_sw_netstats_tx_add(dev, 1, len); diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 2ad1ffa4ccb9..0ed47d00549b 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -636,6 +636,14 @@ static inline void eth_skb_pkt_type(struct sk_buff *skb, } } +static inline struct ethhdr *eth_skb_pull_mac(struct sk_buff *skb) +{ + struct ethhdr *eth = (struct ethhdr *)skb->data; + + skb_pull_inline(skb, ETH_HLEN); + return eth; +} + /** * eth_skb_pad - Pad buffer to mininum number of octets for Ethernet frame * @skb: Buffer to pad diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 049c3adeb850..4e3651101b86 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -161,9 +161,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) skb->dev = dev; skb_reset_mac_header(skb); - eth = (struct ethhdr *)skb->data; - skb_pull_inline(skb, ETH_HLEN); - + eth = eth_skb_pull_mac(skb); eth_skb_pkt_type(skb, dev); /* -- cgit v1.2.3 From f89ea63f1c65d3e93b255f14f9d9e05df87955fa Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 May 2024 15:26:11 +0100 Subject: netfs, 9p: Fix race between umount and async request completion There's a problem in 9p's interaction with netfslib whereby a crash occurs because the 9p_fid structs get forcibly destroyed during client teardown (without paying attention to their refcounts) before netfslib has finished with them. However, it's not a simple case of deferring the clunking that p9_fid_put() does as that requires the p9_client record to still be present. The problem is that netfslib has to unlock pages and clear the IN_PROGRESS flag before destroying the objects involved - including the fid - and, in any case, nothing checks to see if writeback completed barring looking at the page flags. Fix this by keeping a count of outstanding I/O requests (of any type) and waiting for it to quiesce during inode eviction. Reported-by: syzbot+df038d463cca332e8414@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/0000000000005be0aa061846f8d6@google.com/ Reported-by: syzbot+d7c7a495a5e466c031b6@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/000000000000b86c5e06130da9c6@google.com/ Reported-by: syzbot+1527696d41a634cc1819@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/000000000000041f960618206d7e@google.com/ Signed-off-by: David Howells Link: https://lore.kernel.org/r/755891.1716560771@warthog.procyon.org.uk Tested-by: syzbot+d7c7a495a5e466c031b6@syzkaller.appspotmail.com Reviewed-by: Dominique Martinet cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Christian Schoenebeck cc: Jeff Layton cc: Steve French cc: Hillf Danton cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Reported-and-tested-by: syzbot+d7c7a495a5e466c031b6@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/9p/vfs_inode.c | 1 + fs/afs/inode.c | 1 + fs/netfs/objects.c | 5 +++++ fs/smb/client/cifsfs.c | 1 + include/linux/netfs.h | 18 ++++++++++++++++++ 5 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 7a3308d77606..fd72fc38c8f5 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -348,6 +348,7 @@ void v9fs_evict_inode(struct inode *inode) __le32 __maybe_unused version; if (!is_bad_inode(inode)) { + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); version = cpu_to_le32(v9inode->qid.version); diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 94fc049aff58..15bb7989c387 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -648,6 +648,7 @@ void afs_evict_inode(struct inode *inode) ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); afs_set_cache_aux(vnode, &aux); diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index c90d482b1650..f4a642727479 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -72,6 +72,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, } } + atomic_inc(&ctx->io_count); trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new); netfs_proc_add_rreq(rreq); netfs_stat(&netfs_n_rh_rreq); @@ -124,6 +125,7 @@ static void netfs_free_request(struct work_struct *work) { struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); + struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned int i; trace_netfs_rreq(rreq, netfs_rreq_trace_free); @@ -142,6 +144,9 @@ static void netfs_free_request(struct work_struct *work) } kvfree(rreq->direct_bv); } + + if (atomic_dec_and_test(&ictx->io_count)) + wake_up_var(&ictx->io_count); call_rcu(&rreq->rcu, netfs_free_request_rcu); } diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index ec5b639f421a..14810ffd15c8 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -431,6 +431,7 @@ cifs_free_inode(struct inode *inode) static void cifs_evict_inode(struct inode *inode) { + netfs_wait_for_outstanding_io(inode); truncate_inode_pages_final(&inode->i_data); if (inode->i_state & I_PINNING_NETFS_WB) cifs_fscache_unuse_inode_cookie(inode, true); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index ca56a4428043..3b22ce0d064c 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -68,6 +68,7 @@ struct netfs_inode { loff_t remote_i_size; /* Size of the remote file */ loff_t zero_point; /* Size after which we assume there's no data * on the server */ + atomic_t io_count; /* Number of outstanding reqs */ unsigned long flags; #define NETFS_ICTX_ODIRECT 0 /* The file has DIO in progress */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ @@ -472,6 +473,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx, ctx->remote_i_size = i_size_read(&ctx->inode); ctx->zero_point = LLONG_MAX; ctx->flags = 0; + atomic_set(&ctx->io_count, 0); #if IS_ENABLED(CONFIG_FSCACHE) ctx->cache = NULL; #endif @@ -515,4 +517,20 @@ static inline struct fscache_cookie *netfs_i_cookie(struct netfs_inode *ctx) #endif } +/** + * netfs_wait_for_outstanding_io - Wait for outstanding I/O to complete + * @ctx: The netfs inode to wait on + * + * Wait for outstanding I/O requests of any type to complete. This is intended + * to be called from inode eviction routines. This makes sure that any + * resources held by those requests are cleaned up before we let the inode get + * cleaned up. + */ +static inline void netfs_wait_for_outstanding_io(struct inode *inode) +{ + struct netfs_inode *ictx = netfs_inode(inode); + + wait_var_event(&ictx->io_count, atomic_read(&ictx->io_count) == 0); +} + #endif /* _LINUX_NETFS_H */ -- cgit v1.2.3 From 80e4e17ac9e05adba3f1f0e1793398086e6d4007 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 May 2024 15:16:06 -0700 Subject: block: remove blk_queue_max_integrity_segments This is unused now that all the atomic queue limit conversions are merged. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240521221606.393040-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blk-integrity.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index e253e7bd0d17..7428cb43952d 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -66,12 +66,6 @@ blk_integrity_queue_supports_integrity(struct request_queue *q) return q->integrity.profile; } -static inline void blk_queue_max_integrity_segments(struct request_queue *q, - unsigned int segs) -{ - q->limits.max_integrity_segments = segs; -} - static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { @@ -151,10 +145,6 @@ static inline void blk_integrity_register(struct gendisk *d, static inline void blk_integrity_unregister(struct gendisk *d) { } -static inline void blk_queue_max_integrity_segments(struct request_queue *q, - unsigned int segs) -{ -} static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { -- cgit v1.2.3 From f3d7ba9e1bc0c9080834f263d4887bd9c9ea491f Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Mon, 27 May 2024 13:56:27 +0300 Subject: tpm: Open code tpm_buf_parameters() With only single call site, this makes no sense (slipped out of the radar during the review). Open code and document the action directly to the site, to make it more readable. Fixes: 1b6d7f9eb150 ("tpm: add session encryption protection to tpm2_get_random()") Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm-buf.c | 26 -------------------------- drivers/char/tpm/tpm2-cmd.c | 10 +++++++++- include/linux/tpm.h | 2 -- 3 files changed, 9 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/tpm/tpm-buf.c b/drivers/char/tpm/tpm-buf.c index 647c6ca92ac3..cad0048bcc3c 100644 --- a/drivers/char/tpm/tpm-buf.c +++ b/drivers/char/tpm/tpm-buf.c @@ -223,30 +223,4 @@ u32 tpm_buf_read_u32(struct tpm_buf *buf, off_t *offset) } EXPORT_SYMBOL_GPL(tpm_buf_read_u32); -static u16 tpm_buf_tag(struct tpm_buf *buf) -{ - struct tpm_header *head = (struct tpm_header *)buf->data; - - return be16_to_cpu(head->tag); -} - -/** - * tpm_buf_parameters - return the TPM response parameters area of the tpm_buf - * @buf: tpm_buf to use - * - * Where the parameters are located depends on the tag of a TPM - * command (it's immediately after the header for TPM_ST_NO_SESSIONS - * or 4 bytes after for TPM_ST_SESSIONS). Evaluate this and return a - * pointer to the first byte of the parameters area. - * - * @return: pointer to parameters area - */ -u8 *tpm_buf_parameters(struct tpm_buf *buf) -{ - int offset = TPM_HEADER_SIZE; - - if (tpm_buf_tag(buf) == TPM2_ST_SESSIONS) - offset += 4; - return &buf->data[offset]; -} diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c index 0cdf892ec2a7..1e856259219e 100644 --- a/drivers/char/tpm/tpm2-cmd.c +++ b/drivers/char/tpm/tpm2-cmd.c @@ -281,6 +281,7 @@ struct tpm2_get_random_out { int tpm2_get_random(struct tpm_chip *chip, u8 *dest, size_t max) { struct tpm2_get_random_out *out; + struct tpm_header *head; struct tpm_buf buf; u32 recd; u32 num_bytes = max; @@ -288,6 +289,7 @@ int tpm2_get_random(struct tpm_chip *chip, u8 *dest, size_t max) int total = 0; int retries = 5; u8 *dest_ptr = dest; + off_t offset; if (!num_bytes || max > TPM_MAX_RNG_DATA) return -EINVAL; @@ -320,7 +322,13 @@ int tpm2_get_random(struct tpm_chip *chip, u8 *dest, size_t max) goto out; } - out = (struct tpm2_get_random_out *)tpm_buf_parameters(&buf); + head = (struct tpm_header *)buf.data; + offset = TPM_HEADER_SIZE; + /* Skip the parameter size field: */ + if (be16_to_cpu(head->tag) == TPM2_ST_SESSIONS) + offset += 4; + + out = (struct tpm2_get_random_out *)&buf.data[offset]; recd = min_t(u32, be16_to_cpu(out->size), num_bytes); if (tpm_buf_length(&buf) < TPM_HEADER_SIZE + diff --git a/include/linux/tpm.h b/include/linux/tpm.h index c17e4efbb2e5..b3217200df28 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -437,8 +437,6 @@ u8 tpm_buf_read_u8(struct tpm_buf *buf, off_t *offset); u16 tpm_buf_read_u16(struct tpm_buf *buf, off_t *offset); u32 tpm_buf_read_u32(struct tpm_buf *buf, off_t *offset); -u8 *tpm_buf_parameters(struct tpm_buf *buf); - /* * Check if TPM device is in the firmware upgrade mode. */ -- cgit v1.2.3 From f09fc6cee0dcfc38148ee6b6dd04f93e353d22f2 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 28 May 2024 12:52:21 +0300 Subject: tpm: Rename TPM2_OA_TMPL to TPM2_OA_NULL_KEY and make it local Rename and document TPM2_OA_TMPL, as originally requested in the patch set review, but left unaddressed without any appropriate reasoning. The new name is TPM2_OA_NULL_KEY, has a documentation and is local only to tpm2-sessions.c. Link: https://lore.kernel.org/linux-integrity/ddbeb8111f48a8ddb0b8fca248dff6cc9d7079b2.camel@HansenPartnership.com/ Link: https://lore.kernel.org/linux-integrity/CZCKTWU6ZCC9.2UTEQPEVICYHL@suppilovahvero/ Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm2-sessions.c | 21 +++++++++++++++++++-- include/linux/tpm.h | 15 --------------- 2 files changed, 19 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/tpm/tpm2-sessions.c b/drivers/char/tpm/tpm2-sessions.c index ea8860661876..907ac9956a78 100644 --- a/drivers/char/tpm/tpm2-sessions.c +++ b/drivers/char/tpm/tpm2-sessions.c @@ -80,6 +80,9 @@ /* maximum number of names the TPM must remember for authorization */ #define AUTH_MAX_NAMES 3 +#define AES_KEY_BYTES AES_KEYSIZE_128 +#define AES_KEY_BITS (AES_KEY_BYTES*8) + static int tpm2_create_primary(struct tpm_chip *chip, u32 hierarchy, u32 *handle, u8 *name); @@ -954,6 +957,20 @@ int tpm2_start_auth_session(struct tpm_chip *chip) } EXPORT_SYMBOL(tpm2_start_auth_session); +/* + * A mask containing the object attributes for the kernel held null primary key + * used in HMAC encryption. For more information on specific attributes look up + * to "8.3 TPMA_OBJECT (Object Attributes)". + */ +#define TPM2_OA_NULL_KEY ( \ + TPM2_OA_NO_DA | \ + TPM2_OA_FIXED_TPM | \ + TPM2_OA_FIXED_PARENT | \ + TPM2_OA_SENSITIVE_DATA_ORIGIN | \ + TPM2_OA_USER_WITH_AUTH | \ + TPM2_OA_DECRYPT | \ + TPM2_OA_RESTRICTED) + /** * tpm2_parse_create_primary() - parse the data returned from TPM_CC_CREATE_PRIMARY * @@ -1018,7 +1035,7 @@ static int tpm2_parse_create_primary(struct tpm_chip *chip, struct tpm_buf *buf, val = tpm_buf_read_u32(buf, &offset_t); /* object properties */ - if (val != TPM2_OA_TMPL) + if (val != TPM2_OA_NULL_KEY) return -EINVAL; /* auth policy (empty) */ @@ -1178,7 +1195,7 @@ static int tpm2_create_primary(struct tpm_chip *chip, u32 hierarchy, tpm_buf_append_u16(&template, TPM_ALG_SHA256); /* object properties */ - tpm_buf_append_u32(&template, TPM2_OA_TMPL); + tpm_buf_append_u32(&template, TPM2_OA_NULL_KEY); /* sauth policy (empty) */ tpm_buf_append_u16(&template, 0); diff --git a/include/linux/tpm.h b/include/linux/tpm.h index b3217200df28..21a67dc9efe8 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -394,21 +394,6 @@ enum tpm2_object_attributes { TPM2_OA_SIGN = BIT(18), }; -/* - * definitions for the canonical template. These are mandated - * by the TCG key template documents - */ - -#define AES_KEY_BYTES AES_KEYSIZE_128 -#define AES_KEY_BITS (AES_KEY_BYTES*8) -#define TPM2_OA_TMPL (TPM2_OA_NO_DA | \ - TPM2_OA_FIXED_TPM | \ - TPM2_OA_FIXED_PARENT | \ - TPM2_OA_SENSITIVE_DATA_ORIGIN | \ - TPM2_OA_USER_WITH_AUTH | \ - TPM2_OA_DECRYPT | \ - TPM2_OA_RESTRICTED) - enum tpm2_session_attributes { TPM2_SA_CONTINUE_SESSION = BIT(0), TPM2_SA_AUDIT_EXCLUSIVE = BIT(1), -- cgit v1.2.3 From db003a28e03f95f2bcb63f037a2078b8870b1ecd Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 28 May 2024 14:33:20 +0200 Subject: netfs: fix kernel doc for nets_wait_for_outstanding_io() The @inode parameter wasn't documented leading to new doc build warnings. Fixes: f89ea63f1c65 ("netfs, 9p: Fix race between umount and async request completion") Link: https://lore.kernel.org/r/20240528133050.7e09d78e@canb.auug.org.au Signed-off-by: Christian Brauner --- include/linux/netfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 3ca3906bb8da..5d0288938cc2 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -521,7 +521,7 @@ static inline struct fscache_cookie *netfs_i_cookie(struct netfs_inode *ctx) /** * netfs_wait_for_outstanding_io - Wait for outstanding I/O to complete - * @ctx: The netfs inode to wait on + * @inode: The netfs inode to wait on * * Wait for outstanding I/O requests of any type to complete. This is intended * to be called from inode eviction routines. This makes sure that any -- cgit v1.2.3 From c7a5096781732e0f9784551309484f3e103f6750 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 28 May 2024 13:25:02 +0300 Subject: PNP: Make dev_is_pnp() to be a function and export it for modules Since we have a dev_is_pnp() macro that utilises the address of the pnp_bus_type variable, the users, which can be compiled as modules, will fail to build. Convert the macro to be a function and export it to the modules to prevent build breakage. Reported-by: Woody Suwalski Closes: https://lore.kernel.org/r/cc8a93b2-2504-9754-e26c-5d5c3bd1265c@gmail.com Fixes: 2a49b45cd0e7 ("PNP: Add dev_is_pnp() macro") Signed-off-by: Andy Shevchenko Reviewed-by: Christoph Hellwig Signed-off-by: Rafael J. Wysocki --- drivers/pnp/driver.c | 6 ++++++ include/linux/pnp.h | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/pnp/driver.c b/drivers/pnp/driver.c index 0a5d0d8befa8..3483e52e3a81 100644 --- a/drivers/pnp/driver.c +++ b/drivers/pnp/driver.c @@ -266,6 +266,12 @@ const struct bus_type pnp_bus_type = { .dev_groups = pnp_dev_groups, }; +bool dev_is_pnp(const struct device *dev) +{ + return dev->bus == &pnp_bus_type; +} +EXPORT_SYMBOL_GPL(dev_is_pnp); + int pnp_register_driver(struct pnp_driver *drv) { drv->driver.name = drv->name; diff --git a/include/linux/pnp.h b/include/linux/pnp.h index 82561242cda4..a8def1cea32c 100644 --- a/include/linux/pnp.h +++ b/include/linux/pnp.h @@ -469,7 +469,7 @@ int compare_pnp_id(struct pnp_id *pos, const char *id); int pnp_register_driver(struct pnp_driver *drv); void pnp_unregister_driver(struct pnp_driver *drv); -#define dev_is_pnp(d) ((d)->bus == &pnp_bus_type) +bool dev_is_pnp(const struct device *dev); #else @@ -502,7 +502,7 @@ static inline int compare_pnp_id(struct pnp_id *pos, const char *id) { return -E static inline int pnp_register_driver(struct pnp_driver *drv) { return -ENODEV; } static inline void pnp_unregister_driver(struct pnp_driver *drv) { } -#define dev_is_pnp(d) false +static inline bool dev_is_pnp(const struct device *dev) { return false; } #endif /* CONFIG_PNP */ -- cgit v1.2.3 From edcde848c01eb071a91d479a6b3101d9cf48e905 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 28 May 2024 13:25:03 +0300 Subject: PNP: Hide pnp_bus_type from the non-PNP code The pnp_bus_type is defined only when CONFIG_PNP=y, while being not guarded by ifdeffery in the header. Moreover, it's not used outside of the PNP code. Move it to the internal header to make sure no-one will try to (ab)use it. Signed-off-by: Andy Shevchenko Reviewed-by: Christoph Hellwig Signed-off-by: Rafael J. Wysocki --- drivers/pnp/base.h | 1 + include/linux/pnp.h | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/pnp/base.h b/drivers/pnp/base.h index e74a0f6a3157..4e80273dfb1e 100644 --- a/drivers/pnp/base.h +++ b/drivers/pnp/base.h @@ -6,6 +6,7 @@ extern struct mutex pnp_lock; extern const struct attribute_group *pnp_dev_groups[]; +extern const struct bus_type pnp_bus_type; int pnp_register_protocol(struct pnp_protocol *protocol); void pnp_unregister_protocol(struct pnp_protocol *protocol); diff --git a/include/linux/pnp.h b/include/linux/pnp.h index a8def1cea32c..7f2ff95d2deb 100644 --- a/include/linux/pnp.h +++ b/include/linux/pnp.h @@ -435,8 +435,6 @@ struct pnp_protocol { #define protocol_for_each_dev(protocol, dev) \ list_for_each_entry(dev, &(protocol)->devices, protocol_list) -extern const struct bus_type pnp_bus_type; - #if defined(CONFIG_PNP) /* device management */ -- cgit v1.2.3 From 779b8a14afde110dd3502566be907289eba72447 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 28 May 2024 14:09:23 +0200 Subject: cpufreq: amd-pstate: remove global header file When extra warnings are enabled, gcc points out a global variable definition in a header: In file included from drivers/cpufreq/amd-pstate-ut.c:29: include/linux/amd-pstate.h:123:27: error: 'amd_pstate_mode_string' defined but not used [-Werror=unused-const-variable=] 123 | static const char * const amd_pstate_mode_string[] = { | ^~~~~~~~~~~~~~~~~~~~~~ This header is only included from two files in the same directory, and one of them uses only a single definition from it, so clean it up by moving most of the contents into the driver that uses them, and making shared bits a local header file. Fixes: 36c5014e5460 ("cpufreq: amd-pstate: optimize driver working mode selection in amd_pstate_param()") Signed-off-by: Arnd Bergmann Acked-by: Mario Limonciello Signed-off-by: Rafael J. Wysocki --- MAINTAINERS | 1 - drivers/cpufreq/amd-pstate-ut.c | 3 +- drivers/cpufreq/amd-pstate.c | 34 +++++++++- drivers/cpufreq/amd-pstate.h | 104 ++++++++++++++++++++++++++++++ include/linux/amd-pstate.h | 137 ---------------------------------------- 5 files changed, 139 insertions(+), 140 deletions(-) create mode 100644 drivers/cpufreq/amd-pstate.h delete mode 100644 include/linux/amd-pstate.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index d6c90161c7bf..fc31870379f8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1107,7 +1107,6 @@ L: linux-pm@vger.kernel.org S: Supported F: Documentation/admin-guide/pm/amd-pstate.rst F: drivers/cpufreq/amd-pstate* -F: include/linux/amd-pstate.h F: tools/power/x86/amd_pstate_tracer/amd_pstate_trace.py AMD PTDMA DRIVER diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c index f04ae67dda37..fc275d41d51e 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -26,10 +26,11 @@ #include #include #include -#include #include +#include "amd-pstate.h" + /* * Abbreviations: * amd_pstate_ut: used as a shortform for AMD P-State unit test. diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 1b7e82a0ad2e..91993647e09e 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include @@ -46,6 +45,8 @@ #include #include #include + +#include "amd-pstate.h" #include "amd-pstate-trace.h" #define AMD_PSTATE_TRANSITION_LATENCY 20000 @@ -53,6 +54,37 @@ #define CPPC_HIGHEST_PERF_PERFORMANCE 196 #define CPPC_HIGHEST_PERF_DEFAULT 166 +#define AMD_CPPC_EPP_PERFORMANCE 0x00 +#define AMD_CPPC_EPP_BALANCE_PERFORMANCE 0x80 +#define AMD_CPPC_EPP_BALANCE_POWERSAVE 0xBF +#define AMD_CPPC_EPP_POWERSAVE 0xFF + +/* + * enum amd_pstate_mode - driver working mode of amd pstate + */ +enum amd_pstate_mode { + AMD_PSTATE_UNDEFINED = 0, + AMD_PSTATE_DISABLE, + AMD_PSTATE_PASSIVE, + AMD_PSTATE_ACTIVE, + AMD_PSTATE_GUIDED, + AMD_PSTATE_MAX, +}; + +static const char * const amd_pstate_mode_string[] = { + [AMD_PSTATE_UNDEFINED] = "undefined", + [AMD_PSTATE_DISABLE] = "disable", + [AMD_PSTATE_PASSIVE] = "passive", + [AMD_PSTATE_ACTIVE] = "active", + [AMD_PSTATE_GUIDED] = "guided", + NULL, +}; + +struct quirk_entry { + u32 nominal_freq; + u32 lowest_freq; +}; + /* * TODO: We need more time to fine tune processors with shared memory solution * with community together. diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h new file mode 100644 index 000000000000..e6a28e7f4dbf --- /dev/null +++ b/drivers/cpufreq/amd-pstate.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 Advanced Micro Devices, Inc. + * + * Author: Meng Li + */ + +#ifndef _LINUX_AMD_PSTATE_H +#define _LINUX_AMD_PSTATE_H + +#include + +/********************************************************************* + * AMD P-state INTERFACE * + *********************************************************************/ +/** + * struct amd_aperf_mperf + * @aperf: actual performance frequency clock count + * @mperf: maximum performance frequency clock count + * @tsc: time stamp counter + */ +struct amd_aperf_mperf { + u64 aperf; + u64 mperf; + u64 tsc; +}; + +/** + * struct amd_cpudata - private CPU data for AMD P-State + * @cpu: CPU number + * @req: constraint request to apply + * @cppc_req_cached: cached performance request hints + * @highest_perf: the maximum performance an individual processor may reach, + * assuming ideal conditions + * For platforms that do not support the preferred core feature, the + * highest_pef may be configured with 166 or 255, to avoid max frequency + * calculated wrongly. we take the fixed value as the highest_perf. + * @nominal_perf: the maximum sustained performance level of the processor, + * assuming ideal operating conditions + * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power + * savings are achieved + * @lowest_perf: the absolute lowest performance level of the processor + * @prefcore_ranking: the preferred core ranking, the higher value indicates a higher + * priority. + * @min_limit_perf: Cached value of the performance corresponding to policy->min + * @max_limit_perf: Cached value of the performance corresponding to policy->max + * @min_limit_freq: Cached value of policy->min (in khz) + * @max_limit_freq: Cached value of policy->max (in khz) + * @max_freq: the frequency (in khz) that mapped to highest_perf + * @min_freq: the frequency (in khz) that mapped to lowest_perf + * @nominal_freq: the frequency (in khz) that mapped to nominal_perf + * @lowest_nonlinear_freq: the frequency (in khz) that mapped to lowest_nonlinear_perf + * @cur: Difference of Aperf/Mperf/tsc count between last and current sample + * @prev: Last Aperf/Mperf/tsc count value read from register + * @freq: current cpu frequency value (in khz) + * @boost_supported: check whether the Processor or SBIOS supports boost mode + * @hw_prefcore: check whether HW supports preferred core featue. + * Only when hw_prefcore and early prefcore param are true, + * AMD P-State driver supports preferred core featue. + * @epp_policy: Last saved policy used to set energy-performance preference + * @epp_cached: Cached CPPC energy-performance preference value + * @policy: Cpufreq policy value + * @cppc_cap1_cached Cached MSR_AMD_CPPC_CAP1 register value + * + * The amd_cpudata is key private data for each CPU thread in AMD P-State, and + * represents all the attributes and goals that AMD P-State requests at runtime. + */ +struct amd_cpudata { + int cpu; + + struct freq_qos_request req[2]; + u64 cppc_req_cached; + + u32 highest_perf; + u32 nominal_perf; + u32 lowest_nonlinear_perf; + u32 lowest_perf; + u32 prefcore_ranking; + u32 min_limit_perf; + u32 max_limit_perf; + u32 min_limit_freq; + u32 max_limit_freq; + + u32 max_freq; + u32 min_freq; + u32 nominal_freq; + u32 lowest_nonlinear_freq; + + struct amd_aperf_mperf cur; + struct amd_aperf_mperf prev; + + u64 freq; + bool boost_supported; + bool hw_prefcore; + + /* EPP feature related attributes*/ + s16 epp_policy; + s16 epp_cached; + u32 policy; + u64 cppc_cap1_cached; + bool suspended; +}; + +#endif /* _LINUX_AMD_PSTATE_H */ diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h deleted file mode 100644 index d58fc022ec46..000000000000 --- a/include/linux/amd-pstate.h +++ /dev/null @@ -1,137 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * linux/include/linux/amd-pstate.h - * - * Copyright (C) 2022 Advanced Micro Devices, Inc. - * - * Author: Meng Li - */ - -#ifndef _LINUX_AMD_PSTATE_H -#define _LINUX_AMD_PSTATE_H - -#include - -#define AMD_CPPC_EPP_PERFORMANCE 0x00 -#define AMD_CPPC_EPP_BALANCE_PERFORMANCE 0x80 -#define AMD_CPPC_EPP_BALANCE_POWERSAVE 0xBF -#define AMD_CPPC_EPP_POWERSAVE 0xFF - -/********************************************************************* - * AMD P-state INTERFACE * - *********************************************************************/ -/** - * struct amd_aperf_mperf - * @aperf: actual performance frequency clock count - * @mperf: maximum performance frequency clock count - * @tsc: time stamp counter - */ -struct amd_aperf_mperf { - u64 aperf; - u64 mperf; - u64 tsc; -}; - -/** - * struct amd_cpudata - private CPU data for AMD P-State - * @cpu: CPU number - * @req: constraint request to apply - * @cppc_req_cached: cached performance request hints - * @highest_perf: the maximum performance an individual processor may reach, - * assuming ideal conditions - * For platforms that do not support the preferred core feature, the - * highest_pef may be configured with 166 or 255, to avoid max frequency - * calculated wrongly. we take the fixed value as the highest_perf. - * @nominal_perf: the maximum sustained performance level of the processor, - * assuming ideal operating conditions - * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power - * savings are achieved - * @lowest_perf: the absolute lowest performance level of the processor - * @prefcore_ranking: the preferred core ranking, the higher value indicates a higher - * priority. - * @min_limit_perf: Cached value of the performance corresponding to policy->min - * @max_limit_perf: Cached value of the performance corresponding to policy->max - * @min_limit_freq: Cached value of policy->min (in khz) - * @max_limit_freq: Cached value of policy->max (in khz) - * @max_freq: the frequency (in khz) that mapped to highest_perf - * @min_freq: the frequency (in khz) that mapped to lowest_perf - * @nominal_freq: the frequency (in khz) that mapped to nominal_perf - * @lowest_nonlinear_freq: the frequency (in khz) that mapped to lowest_nonlinear_perf - * @cur: Difference of Aperf/Mperf/tsc count between last and current sample - * @prev: Last Aperf/Mperf/tsc count value read from register - * @freq: current cpu frequency value (in khz) - * @boost_supported: check whether the Processor or SBIOS supports boost mode - * @hw_prefcore: check whether HW supports preferred core featue. - * Only when hw_prefcore and early prefcore param are true, - * AMD P-State driver supports preferred core featue. - * @epp_policy: Last saved policy used to set energy-performance preference - * @epp_cached: Cached CPPC energy-performance preference value - * @policy: Cpufreq policy value - * @cppc_cap1_cached Cached MSR_AMD_CPPC_CAP1 register value - * - * The amd_cpudata is key private data for each CPU thread in AMD P-State, and - * represents all the attributes and goals that AMD P-State requests at runtime. - */ -struct amd_cpudata { - int cpu; - - struct freq_qos_request req[2]; - u64 cppc_req_cached; - - u32 highest_perf; - u32 nominal_perf; - u32 lowest_nonlinear_perf; - u32 lowest_perf; - u32 prefcore_ranking; - u32 min_limit_perf; - u32 max_limit_perf; - u32 min_limit_freq; - u32 max_limit_freq; - - u32 max_freq; - u32 min_freq; - u32 nominal_freq; - u32 lowest_nonlinear_freq; - - struct amd_aperf_mperf cur; - struct amd_aperf_mperf prev; - - u64 freq; - bool boost_supported; - bool hw_prefcore; - - /* EPP feature related attributes*/ - s16 epp_policy; - s16 epp_cached; - u32 policy; - u64 cppc_cap1_cached; - bool suspended; -}; - -/* - * enum amd_pstate_mode - driver working mode of amd pstate - */ -enum amd_pstate_mode { - AMD_PSTATE_UNDEFINED = 0, - AMD_PSTATE_DISABLE, - AMD_PSTATE_PASSIVE, - AMD_PSTATE_ACTIVE, - AMD_PSTATE_GUIDED, - AMD_PSTATE_MAX, -}; - -static const char * const amd_pstate_mode_string[] = { - [AMD_PSTATE_UNDEFINED] = "undefined", - [AMD_PSTATE_DISABLE] = "disable", - [AMD_PSTATE_PASSIVE] = "passive", - [AMD_PSTATE_ACTIVE] = "active", - [AMD_PSTATE_GUIDED] = "guided", - NULL, -}; - -struct quirk_entry { - u32 nominal_freq; - u32 lowest_freq; -}; - -#endif /* _LINUX_AMD_PSTATE_H */ -- cgit v1.2.3 From 29459c3eaa5c6261fbe0dea7bdeb9b48d35d862a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 30 May 2024 14:40:34 +0900 Subject: block: Fix zone write plugging handling of devices with a runt zone A zoned device may have a last sequential write required zone that is smaller than other zones. However, all tests to check if a zone write plug write offset exceeds the zone capacity use the same capacity value stored in the gendisk zone_capacity field. This is incorrect for a zoned device with a last runt (smaller) zone. Add the new field last_zone_capacity to struct gendisk to store the capacity of the last zone of the device. blk_revalidate_seq_zone() and blk_revalidate_conv_zone() are both modified to get this value when disk_zone_is_last() returns true. Similarly to zone_capacity, the value is first stored using the last_zone_capacity field of struct blk_revalidate_zone_args. Once zone revalidation of all zones is done, this is used to set the gendisk last_zone_capacity field. The checks to determine if a zone is full or if a sector offset in a zone exceeds the zone capacity in disk_should_remove_zone_wplug(), disk_zone_wplug_abort_unaligned(), blk_zone_write_plug_init_request(), and blk_zone_wplug_prepare_bio() are modified to use the new helper functions disk_zone_is_full() and disk_zone_wplug_is_full(). disk_zone_is_full() uses the zone index to determine if the zone being tested is the last one of the disk and uses the either the disk zone_capacity or last_zone_capacity accordingly. Fixes: dd291d77cc90 ("block: Introduce zone write plugging") Signed-off-by: Damien Le Moal Reviewed-by: Bart Van Assche Reviewed-by: Niklas Cassel Link: https://lore.kernel.org/r/20240530054035.491497-4-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 35 +++++++++++++++++++++++++++-------- include/linux/blkdev.h | 1 + 2 files changed, 28 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 402a50a1ac4d..52abebf56027 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -455,6 +455,20 @@ static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone) return zone->start + zone->len >= get_capacity(disk); } +static bool disk_zone_is_full(struct gendisk *disk, + unsigned int zno, unsigned int offset_in_zone) +{ + if (zno < disk->nr_zones - 1) + return offset_in_zone >= disk->zone_capacity; + return offset_in_zone >= disk->last_zone_capacity; +} + +static bool disk_zone_wplug_is_full(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset); +} + static bool disk_insert_zone_wplug(struct gendisk *disk, struct blk_zone_wplug *zwplug) { @@ -548,7 +562,7 @@ static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, return false; /* We can remove zone write plugs for zones that are empty or full. */ - return !zwplug->wp_offset || zwplug->wp_offset >= disk->zone_capacity; + return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug); } static void disk_remove_zone_wplug(struct gendisk *disk, @@ -669,13 +683,12 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) static void disk_zone_wplug_abort_unaligned(struct gendisk *disk, struct blk_zone_wplug *zwplug) { - unsigned int zone_capacity = disk->zone_capacity; unsigned int wp_offset = zwplug->wp_offset; struct bio_list bl = BIO_EMPTY_LIST; struct bio *bio; while ((bio = bio_list_pop(&zwplug->bio_list))) { - if (wp_offset >= zone_capacity || + if (disk_zone_is_full(disk, zwplug->zone_no, wp_offset) || (bio_op(bio) != REQ_OP_ZONE_APPEND && bio_offset_from_zone_start(bio) != wp_offset)) { blk_zone_wplug_bio_io_error(zwplug, bio); @@ -914,7 +927,6 @@ void blk_zone_write_plug_init_request(struct request *req) sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req); struct request_queue *q = req->q; struct gendisk *disk = q->disk; - unsigned int zone_capacity = disk->zone_capacity; struct blk_zone_wplug *zwplug = disk_get_zone_wplug(disk, blk_rq_pos(req)); unsigned long flags; @@ -938,7 +950,7 @@ void blk_zone_write_plug_init_request(struct request *req) * into the back of the request. */ spin_lock_irqsave(&zwplug->lock, flags); - while (zwplug->wp_offset < zone_capacity) { + while (!disk_zone_wplug_is_full(disk, zwplug)) { bio = bio_list_peek(&zwplug->bio_list); if (!bio) break; @@ -984,7 +996,7 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, * We know such BIO will fail, and that would potentially overflow our * write pointer offset beyond the end of the zone. */ - if (zwplug->wp_offset >= disk->zone_capacity) + if (disk_zone_wplug_is_full(disk, zwplug)) goto err; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { @@ -1561,6 +1573,7 @@ void disk_free_zone_resources(struct gendisk *disk) kfree(disk->conv_zones_bitmap); disk->conv_zones_bitmap = NULL; disk->zone_capacity = 0; + disk->last_zone_capacity = 0; disk->nr_zones = 0; } @@ -1605,6 +1618,7 @@ struct blk_revalidate_zone_args { unsigned long *conv_zones_bitmap; unsigned int nr_zones; unsigned int zone_capacity; + unsigned int last_zone_capacity; sector_t sector; }; @@ -1622,6 +1636,7 @@ static int disk_update_zone_resources(struct gendisk *disk, disk->nr_zones = args->nr_zones; disk->zone_capacity = args->zone_capacity; + disk->last_zone_capacity = args->last_zone_capacity; swap(disk->conv_zones_bitmap, args->conv_zones_bitmap); if (disk->conv_zones_bitmap) nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap, @@ -1673,6 +1688,9 @@ static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, return -ENODEV; } + if (disk_zone_is_last(disk, zone)) + args->last_zone_capacity = zone->capacity; + if (!disk_need_zone_resources(disk)) return 0; @@ -1703,8 +1721,9 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, */ if (!args->zone_capacity) args->zone_capacity = zone->capacity; - if (!disk_zone_is_last(disk, zone) && - zone->capacity != args->zone_capacity) { + if (disk_zone_is_last(disk, zone)) { + args->last_zone_capacity = zone->capacity; + } else if (zone->capacity != args->zone_capacity) { pr_warn("%s: Invalid variable zone capacity\n", disk->disk_name); return -ENODEV; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aefdda9f4ec7..24c36929920b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -186,6 +186,7 @@ struct gendisk { */ unsigned int nr_zones; unsigned int zone_capacity; + unsigned int last_zone_capacity; unsigned long *conv_zones_bitmap; unsigned int zone_wplugs_hash_bits; spinlock_t zone_wplugs_lock; -- cgit v1.2.3 From b7c5e64fecfa88764791679cca4786ac65de739e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 29 May 2024 22:52:30 -0600 Subject: vfio: Create vfio_fs_type with inode per device By linking all the device fds we provide to userspace to an address space through a new pseudo fs, we can use tools like unmap_mapping_range() to zap all vmas associated with a device. Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240530045236.1005864-2-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- drivers/vfio/device_cdev.c | 7 +++++++ drivers/vfio/group.c | 7 +++++++ drivers/vfio/vfio_main.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/vfio.h | 1 + 4 files changed, 59 insertions(+) (limited to 'include/linux') diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c index e75da0a70d1f..bb1817bd4ff3 100644 --- a/drivers/vfio/device_cdev.c +++ b/drivers/vfio/device_cdev.c @@ -39,6 +39,13 @@ int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep) filep->private_data = df; + /* + * Use the pseudo fs inode on the device to link all mmaps + * to the same address space, allowing us to unmap all vmas + * associated to this device using unmap_mapping_range(). + */ + filep->f_mapping = device->inode->i_mapping; + return 0; err_put_registration: diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c index 610a429c6191..ded364588d29 100644 --- a/drivers/vfio/group.c +++ b/drivers/vfio/group.c @@ -286,6 +286,13 @@ static struct file *vfio_device_open_file(struct vfio_device *device) */ filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE); + /* + * Use the pseudo fs inode on the device to link all mmaps + * to the same address space, allowing us to unmap all vmas + * associated to this device using unmap_mapping_range(). + */ + filep->f_mapping = device->inode->i_mapping; + if (device->group->type == VFIO_NO_IOMMU) dev_warn(device->dev, "vfio-noiommu device opened by user " "(%s:%d)\n", current->comm, task_pid_nr(current)); diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index e97d796a54fb..a5a62d9d963f 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -22,8 +22,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -43,9 +45,13 @@ #define DRIVER_AUTHOR "Alex Williamson " #define DRIVER_DESC "VFIO - User Level meta-driver" +#define VFIO_MAGIC 0x5646494f /* "VFIO" */ + static struct vfio { struct class *device_class; struct ida device_ida; + struct vfsmount *vfs_mount; + int fs_count; } vfio; #ifdef CONFIG_VFIO_NOIOMMU @@ -186,6 +192,8 @@ static void vfio_device_release(struct device *dev) if (device->ops->release) device->ops->release(device); + iput(device->inode); + simple_release_fs(&vfio.vfs_mount, &vfio.fs_count); kvfree(device); } @@ -228,6 +236,34 @@ out_free: } EXPORT_SYMBOL_GPL(_vfio_alloc_device); +static int vfio_fs_init_fs_context(struct fs_context *fc) +{ + return init_pseudo(fc, VFIO_MAGIC) ? 0 : -ENOMEM; +} + +static struct file_system_type vfio_fs_type = { + .name = "vfio", + .owner = THIS_MODULE, + .init_fs_context = vfio_fs_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static struct inode *vfio_fs_inode_new(void) +{ + struct inode *inode; + int ret; + + ret = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count); + if (ret) + return ERR_PTR(ret); + + inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb); + if (IS_ERR(inode)) + simple_release_fs(&vfio.vfs_mount, &vfio.fs_count); + + return inode; +} + /* * Initialize a vfio_device so it can be registered to vfio core. */ @@ -246,6 +282,11 @@ static int vfio_init_device(struct vfio_device *device, struct device *dev, init_completion(&device->comp); device->dev = dev; device->ops = ops; + device->inode = vfio_fs_inode_new(); + if (IS_ERR(device->inode)) { + ret = PTR_ERR(device->inode); + goto out_inode; + } if (ops->init) { ret = ops->init(device); @@ -260,6 +301,9 @@ static int vfio_init_device(struct vfio_device *device, struct device *dev, return 0; out_uninit: + iput(device->inode); + simple_release_fs(&vfio.vfs_mount, &vfio.fs_count); +out_inode: vfio_release_device_set(device); ida_free(&vfio.device_ida, device->index); return ret; diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 8b1a29820409..000a6cab2d31 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -64,6 +64,7 @@ struct vfio_device { struct completion comp; struct iommufd_access *iommufd_access; void (*put_kvm)(struct kvm *kvm); + struct inode *inode; #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_device *iommufd_device; u8 iommufd_attached:1; -- cgit v1.2.3 From aac6db75a9fc2c7a6f73e152df8f15101dda38e6 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 29 May 2024 22:52:31 -0600 Subject: vfio/pci: Use unmap_mapping_range() With the vfio device fd tied to the address space of the pseudo fs inode, we can use the mm to track all vmas that might be mmap'ing device BARs, which removes our vma_list and all the complicated lock ordering necessary to manually zap each related vma. Note that we can no longer store the pfn in vm_pgoff if we want to use unmap_mapping_range() to zap a selective portion of the device fd corresponding to BAR mappings. This also converts our mmap fault handler to use vmf_insert_pfn() because we no longer have a vma_list to avoid the concurrency problem with io_remap_pfn_range(). The goal is to eventually use the vm_ops huge_fault handler to avoid the additional faulting overhead, but vmf_insert_pfn_{pmd,pud}() need to learn about pfnmaps first. Also, Jason notes that a race exists between unmap_mapping_range() and the fops mmap callback if we were to call io_remap_pfn_range() to populate the vma on mmap. Specifically, mmap_region() does call_mmap() before it does vma_link_file() which gives a window where the vma is populated but invisible to unmap_mapping_range(). Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240530045236.1005864-3-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 264 ++++++++------------------------------- include/linux/vfio_pci_core.h | 2 - 2 files changed, 55 insertions(+), 211 deletions(-) (limited to 'include/linux') diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 80cae87fff36..db31c27bf78b 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1610,100 +1610,20 @@ ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *bu } EXPORT_SYMBOL_GPL(vfio_pci_core_write); -/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ -static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try) +static void vfio_pci_zap_bars(struct vfio_pci_core_device *vdev) { - struct vfio_pci_mmap_vma *mmap_vma, *tmp; + struct vfio_device *core_vdev = &vdev->vdev; + loff_t start = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX); + loff_t end = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX); + loff_t len = end - start; - /* - * Lock ordering: - * vma_lock is nested under mmap_lock for vm_ops callback paths. - * The memory_lock semaphore is used by both code paths calling - * into this function to zap vmas and the vm_ops.fault callback - * to protect the memory enable state of the device. - * - * When zapping vmas we need to maintain the mmap_lock => vma_lock - * ordering, which requires using vma_lock to walk vma_list to - * acquire an mm, then dropping vma_lock to get the mmap_lock and - * reacquiring vma_lock. This logic is derived from similar - * requirements in uverbs_user_mmap_disassociate(). - * - * mmap_lock must always be the top-level lock when it is taken. - * Therefore we can only hold the memory_lock write lock when - * vma_list is empty, as we'd need to take mmap_lock to clear - * entries. vma_list can only be guaranteed empty when holding - * vma_lock, thus memory_lock is nested under vma_lock. - * - * This enables the vm_ops.fault callback to acquire vma_lock, - * followed by memory_lock read lock, while already holding - * mmap_lock without risk of deadlock. - */ - while (1) { - struct mm_struct *mm = NULL; - - if (try) { - if (!mutex_trylock(&vdev->vma_lock)) - return 0; - } else { - mutex_lock(&vdev->vma_lock); - } - while (!list_empty(&vdev->vma_list)) { - mmap_vma = list_first_entry(&vdev->vma_list, - struct vfio_pci_mmap_vma, - vma_next); - mm = mmap_vma->vma->vm_mm; - if (mmget_not_zero(mm)) - break; - - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - mm = NULL; - } - if (!mm) - return 1; - mutex_unlock(&vdev->vma_lock); - - if (try) { - if (!mmap_read_trylock(mm)) { - mmput(mm); - return 0; - } - } else { - mmap_read_lock(mm); - } - if (try) { - if (!mutex_trylock(&vdev->vma_lock)) { - mmap_read_unlock(mm); - mmput(mm); - return 0; - } - } else { - mutex_lock(&vdev->vma_lock); - } - list_for_each_entry_safe(mmap_vma, tmp, - &vdev->vma_list, vma_next) { - struct vm_area_struct *vma = mmap_vma->vma; - - if (vma->vm_mm != mm) - continue; - - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - - zap_vma_ptes(vma, vma->vm_start, - vma->vm_end - vma->vm_start); - } - mutex_unlock(&vdev->vma_lock); - mmap_read_unlock(mm); - mmput(mm); - } + unmap_mapping_range(core_vdev->inode->i_mapping, start, len, true); } void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev) { - vfio_pci_zap_and_vma_lock(vdev, false); down_write(&vdev->memory_lock); - mutex_unlock(&vdev->vma_lock); + vfio_pci_zap_bars(vdev); } u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev) @@ -1725,99 +1645,41 @@ void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 c up_write(&vdev->memory_lock); } -/* Caller holds vma_lock */ -static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev, - struct vm_area_struct *vma) -{ - struct vfio_pci_mmap_vma *mmap_vma; - - mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL_ACCOUNT); - if (!mmap_vma) - return -ENOMEM; - - mmap_vma->vma = vma; - list_add(&mmap_vma->vma_next, &vdev->vma_list); - - return 0; -} - -/* - * Zap mmaps on open so that we can fault them in on access and therefore - * our vma_list only tracks mappings accessed since last zap. - */ -static void vfio_pci_mmap_open(struct vm_area_struct *vma) -{ - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); -} - -static void vfio_pci_mmap_close(struct vm_area_struct *vma) +static unsigned long vma_to_pfn(struct vm_area_struct *vma) { struct vfio_pci_core_device *vdev = vma->vm_private_data; - struct vfio_pci_mmap_vma *mmap_vma; + int index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + u64 pgoff; - mutex_lock(&vdev->vma_lock); - list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { - if (mmap_vma->vma == vma) { - list_del(&mmap_vma->vma_next); - kfree(mmap_vma); - break; - } - } - mutex_unlock(&vdev->vma_lock); + pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff; } static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct vfio_pci_core_device *vdev = vma->vm_private_data; - struct vfio_pci_mmap_vma *mmap_vma; - vm_fault_t ret = VM_FAULT_NOPAGE; + unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff; + vm_fault_t ret = VM_FAULT_SIGBUS; - mutex_lock(&vdev->vma_lock); - down_read(&vdev->memory_lock); + pfn = vma_to_pfn(vma); - /* - * Memory region cannot be accessed if the low power feature is engaged - * or memory access is disabled. - */ - if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) { - ret = VM_FAULT_SIGBUS; - goto up_out; - } + down_read(&vdev->memory_lock); - /* - * We populate the whole vma on fault, so we need to test whether - * the vma has already been mapped, such as for concurrent faults - * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if - * we ask it to fill the same range again. - */ - list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) { - if (mmap_vma->vma == vma) - goto up_out; - } + if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) + goto out_disabled; - if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, - vma->vm_end - vma->vm_start, - vma->vm_page_prot)) { - ret = VM_FAULT_SIGBUS; - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); - goto up_out; - } + ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff); - if (__vfio_pci_add_vma(vdev, vma)) { - ret = VM_FAULT_OOM; - zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); - } - -up_out: +out_disabled: up_read(&vdev->memory_lock); - mutex_unlock(&vdev->vma_lock); + return ret; } static const struct vm_operations_struct vfio_pci_mmap_ops = { - .open = vfio_pci_mmap_open, - .close = vfio_pci_mmap_close, .fault = vfio_pci_mmap_fault, }; @@ -1880,11 +1742,12 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma vma->vm_private_data = vdev; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); /* - * See remap_pfn_range(), called from vfio_pci_fault() but we can't - * change vm_flags within the fault handler. Set them now. + * Set vm_flags now, they should not be changed in the fault handler. + * We want the same flags and page protection (decrypted above) as + * io_remap_pfn_range() would set. * * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64, * allowing KVM stage 2 device mapping attributes to use Normal-NC @@ -2202,8 +2065,6 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev) mutex_init(&vdev->ioeventfds_lock); INIT_LIST_HEAD(&vdev->dummy_resources_list); INIT_LIST_HEAD(&vdev->ioeventfds_list); - mutex_init(&vdev->vma_lock); - INIT_LIST_HEAD(&vdev->vma_list); INIT_LIST_HEAD(&vdev->sriov_pfs_item); init_rwsem(&vdev->memory_lock); xa_init(&vdev->ctx); @@ -2219,7 +2080,6 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev) mutex_destroy(&vdev->igate); mutex_destroy(&vdev->ioeventfds_lock); - mutex_destroy(&vdev->vma_lock); kfree(vdev->region); kfree(vdev->pm_save); } @@ -2497,26 +2357,15 @@ unwind: return ret; } -/* - * We need to get memory_lock for each device, but devices can share mmap_lock, - * therefore we need to zap and hold the vma_lock for each device, and only then - * get each memory_lock. - */ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, struct vfio_pci_group_info *groups, struct iommufd_ctx *iommufd_ctx) { - struct vfio_pci_core_device *cur_mem; - struct vfio_pci_core_device *cur_vma; - struct vfio_pci_core_device *cur; + struct vfio_pci_core_device *vdev; struct pci_dev *pdev; - bool is_mem = true; int ret; mutex_lock(&dev_set->lock); - cur_mem = list_first_entry(&dev_set->device_list, - struct vfio_pci_core_device, - vdev.dev_set_list); pdev = vfio_pci_dev_set_resettable(dev_set); if (!pdev) { @@ -2533,7 +2382,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, if (ret) goto err_unlock; - list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) { + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) { bool owned; /* @@ -2557,38 +2406,38 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, * Otherwise, reset is not allowed. */ if (iommufd_ctx) { - int devid = vfio_iommufd_get_dev_id(&cur_vma->vdev, + int devid = vfio_iommufd_get_dev_id(&vdev->vdev, iommufd_ctx); owned = (devid > 0 || devid == -ENOENT); } else { - owned = vfio_dev_in_groups(&cur_vma->vdev, groups); + owned = vfio_dev_in_groups(&vdev->vdev, groups); } if (!owned) { ret = -EINVAL; - goto err_undo; + break; } /* - * Locking multiple devices is prone to deadlock, runaway and - * unwind if we hit contention. + * Take the memory write lock for each device and zap BAR + * mappings to prevent the user accessing the device while in + * reset. Locking multiple devices is prone to deadlock, + * runaway and unwind if we hit contention. */ - if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) { + if (!down_write_trylock(&vdev->memory_lock)) { ret = -EBUSY; - goto err_undo; + break; } + + vfio_pci_zap_bars(vdev); } - cur_vma = NULL; - list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) { - if (!down_write_trylock(&cur_mem->memory_lock)) { - ret = -EBUSY; - goto err_undo; - } - mutex_unlock(&cur_mem->vma_lock); + if (!list_entry_is_head(vdev, + &dev_set->device_list, vdev.dev_set_list)) { + vdev = list_prev_entry(vdev, vdev.dev_set_list); + goto err_undo; } - cur_mem = NULL; /* * The pci_reset_bus() will reset all the devices in the bus. @@ -2599,25 +2448,22 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, * cause the PCI config space reset without restoring the original * state (saved locally in 'vdev->pm_save'). */ - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) - vfio_pci_set_power_state(cur, PCI_D0); + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) + vfio_pci_set_power_state(vdev, PCI_D0); ret = pci_reset_bus(pdev); + vdev = list_last_entry(&dev_set->device_list, + struct vfio_pci_core_device, vdev.dev_set_list); + err_undo: - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { - if (cur == cur_mem) - is_mem = false; - if (cur == cur_vma) - break; - if (is_mem) - up_write(&cur->memory_lock); - else - mutex_unlock(&cur->vma_lock); - } + list_for_each_entry_from_reverse(vdev, &dev_set->device_list, + vdev.dev_set_list) + up_write(&vdev->memory_lock); + + list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) + pm_runtime_put(&vdev->pdev->dev); - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) - pm_runtime_put(&cur->pdev->dev); err_unlock: mutex_unlock(&dev_set->lock); return ret; diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index a2c8b8bba711..f87067438ed4 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -93,8 +93,6 @@ struct vfio_pci_core_device { struct list_head sriov_pfs_item; struct vfio_pci_core_device *sriov_pf_core_dev; struct notifier_block nb; - struct mutex vma_lock; - struct list_head vma_list; struct rw_semaphore memory_lock; }; -- cgit v1.2.3 From 89e8a2366e3bce584b6c01549d5019c5cda1205e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 28 May 2024 12:25:28 +0800 Subject: iommu: Return right value in iommu_sva_bind_device() iommu_sva_bind_device() should return either a sva bond handle or an ERR_PTR value in error cases. Existing drivers (idxd and uacce) only check the return value with IS_ERR(). This could potentially lead to a kernel NULL pointer dereference issue if the function returns NULL instead of an error pointer. In reality, this doesn't cause any problems because iommu_sva_bind_device() only returns NULL when the kernel is not configured with CONFIG_IOMMU_SVA. In this case, iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA) will return an error, and the device drivers won't call iommu_sva_bind_device() at all. Fixes: 26b25a2b98e4 ("iommu: Bind process address spaces to devices") Signed-off-by: Lu Baolu Reviewed-by: Jean-Philippe Brucker Reviewed-by: Kevin Tian Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/20240528042528.71396-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7bc8dff7cf6d..17b3f36ad843 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1533,7 +1533,7 @@ struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, static inline struct iommu_sva * iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) { - return NULL; + return ERR_PTR(-ENODEV); } static inline void iommu_sva_unbind_device(struct iommu_sva *handle) -- cgit v1.2.3 From f85d39dd7ed89ffdd622bc1de247ffba8d961504 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 27 May 2024 19:35:38 +0200 Subject: kcov, usb: disable interrupts in kcov_remote_start_usb_softirq After commit 8fea0c8fda30 ("usb: core: hcd: Convert from tasklet to BH workqueue"), usb_giveback_urb_bh() runs in the BH workqueue with interrupts enabled. Thus, the remote coverage collection section in usb_giveback_urb_bh()-> __usb_hcd_giveback_urb() might be interrupted, and the interrupt handler might invoke __usb_hcd_giveback_urb() again. This breaks KCOV, as it does not support nested remote coverage collection sections within the same context (neither in task nor in softirq). Update kcov_remote_start/stop_usb_softirq() to disable interrupts for the duration of the coverage collection section to avoid nested sections in the softirq context (in addition to such in the task context, which are already handled). Reported-by: Tetsuo Handa Closes: https://lore.kernel.org/linux-usb/0f4d1964-7397-485b-bc48-11c01e2fcbca@I-love.SAKURA.ne.jp/ Closes: https://syzkaller.appspot.com/bug?extid=0438378d6f157baae1a2 Suggested-by: Alan Stern Fixes: 8fea0c8fda30 ("usb: core: hcd: Convert from tasklet to BH workqueue") Cc: stable@vger.kernel.org Acked-by: Dmitry Vyukov Signed-off-by: Andrey Konovalov Link: https://lore.kernel.org/r/20240527173538.4989-1-andrey.konovalov@linux.dev Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hcd.c | 12 +++++++----- include/linux/kcov.h | 47 ++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 45 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index e3366f4d82b9..1ff7d901fede 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1623,6 +1623,7 @@ static void __usb_hcd_giveback_urb(struct urb *urb) struct usb_hcd *hcd = bus_to_hcd(urb->dev->bus); struct usb_anchor *anchor = urb->anchor; int status = urb->unlinked; + unsigned long flags; urb->hcpriv = NULL; if (unlikely((urb->transfer_flags & URB_SHORT_NOT_OK) && @@ -1640,13 +1641,14 @@ static void __usb_hcd_giveback_urb(struct urb *urb) /* pass ownership to the completion handler */ urb->status = status; /* - * This function can be called in task context inside another remote - * coverage collection section, but kcov doesn't support that kind of - * recursion yet. Only collect coverage in softirq context for now. + * Only collect coverage in the softirq context and disable interrupts + * to avoid scenarios with nested remote coverage collection sections + * that KCOV does not support. + * See the comment next to kcov_remote_start_usb_softirq() for details. */ - kcov_remote_start_usb_softirq((u64)urb->dev->bus->busnum); + flags = kcov_remote_start_usb_softirq((u64)urb->dev->bus->busnum); urb->complete(urb); - kcov_remote_stop_softirq(); + kcov_remote_stop_softirq(flags); usb_anchor_resume_wakeups(anchor); atomic_dec(&urb->use_count); diff --git a/include/linux/kcov.h b/include/linux/kcov.h index b851ba415e03..1068a7318d89 100644 --- a/include/linux/kcov.h +++ b/include/linux/kcov.h @@ -55,21 +55,47 @@ static inline void kcov_remote_start_usb(u64 id) /* * The softirq flavor of kcov_remote_*() functions is introduced as a temporary - * work around for kcov's lack of nested remote coverage sections support in - * task context. Adding support for nested sections is tracked in: - * https://bugzilla.kernel.org/show_bug.cgi?id=210337 + * workaround for KCOV's lack of nested remote coverage sections support. + * + * Adding support is tracked in https://bugzilla.kernel.org/show_bug.cgi?id=210337. + * + * kcov_remote_start_usb_softirq(): + * + * 1. Only collects coverage when called in the softirq context. This allows + * avoiding nested remote coverage collection sections in the task context. + * For example, USB/IP calls usb_hcd_giveback_urb() in the task context + * within an existing remote coverage collection section. Thus, KCOV should + * not attempt to start collecting coverage within the coverage collection + * section in __usb_hcd_giveback_urb() in this case. + * + * 2. Disables interrupts for the duration of the coverage collection section. + * This allows avoiding nested remote coverage collection sections in the + * softirq context (a softirq might occur during the execution of a work in + * the BH workqueue, which runs with in_serving_softirq() > 0). + * For example, usb_giveback_urb_bh() runs in the BH workqueue with + * interrupts enabled, so __usb_hcd_giveback_urb() might be interrupted in + * the middle of its remote coverage collection section, and the interrupt + * handler might invoke __usb_hcd_giveback_urb() again. */ -static inline void kcov_remote_start_usb_softirq(u64 id) +static inline unsigned long kcov_remote_start_usb_softirq(u64 id) { - if (in_serving_softirq()) + unsigned long flags = 0; + + if (in_serving_softirq()) { + local_irq_save(flags); kcov_remote_start_usb(id); + } + + return flags; } -static inline void kcov_remote_stop_softirq(void) +static inline void kcov_remote_stop_softirq(unsigned long flags) { - if (in_serving_softirq()) + if (in_serving_softirq()) { kcov_remote_stop(); + local_irq_restore(flags); + } } #ifdef CONFIG_64BIT @@ -103,8 +129,11 @@ static inline u64 kcov_common_handle(void) } static inline void kcov_remote_start_common(u64 id) {} static inline void kcov_remote_start_usb(u64 id) {} -static inline void kcov_remote_start_usb_softirq(u64 id) {} -static inline void kcov_remote_stop_softirq(void) {} +static inline unsigned long kcov_remote_start_usb_softirq(u64 id) +{ + return 0; +} +static inline void kcov_remote_stop_softirq(unsigned long flags) {} #endif /* CONFIG_KCOV */ #endif /* _LINUX_KCOV_H */ -- cgit v1.2.3 From 971187350602d03c4a27c0783ff412502b95720a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 4 Jul 2023 14:17:19 +0100 Subject: driver core: remove devm_device_add_groups() There is no more in-kernel users of this function, and no driver should ever be using it, so remove it from the kernel. Acked-by: Dmitry Torokhov Acked-by: "Rafael J. Wysocki" Link: https://lore.kernel.org/r/20230704131715.44454-8-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 45 --------------------------------------------- include/linux/device.h | 2 -- 2 files changed, 47 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 131d96c6090b..2e776fcc31b6 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2845,15 +2845,6 @@ static void devm_attr_group_remove(struct device *dev, void *res) sysfs_remove_group(&dev->kobj, group); } -static void devm_attr_groups_remove(struct device *dev, void *res) -{ - union device_attr_group_devres *devres = res; - const struct attribute_group **groups = devres->groups; - - dev_dbg(dev, "%s: removing groups %p\n", __func__, groups); - sysfs_remove_groups(&dev->kobj, groups); -} - /** * devm_device_add_group - given a device, create a managed attribute group * @dev: The device to create the group for @@ -2886,42 +2877,6 @@ int devm_device_add_group(struct device *dev, const struct attribute_group *grp) } EXPORT_SYMBOL_GPL(devm_device_add_group); -/** - * devm_device_add_groups - create a bunch of managed attribute groups - * @dev: The device to create the group for - * @groups: The attribute groups to create, NULL terminated - * - * This function creates a bunch of managed attribute groups. If an error - * occurs when creating a group, all previously created groups will be - * removed, unwinding everything back to the original state when this - * function was called. It will explicitly warn and error if any of the - * attribute files being created already exist. - * - * Returns 0 on success or error code from sysfs_create_group on failure. - */ -int devm_device_add_groups(struct device *dev, - const struct attribute_group **groups) -{ - union device_attr_group_devres *devres; - int error; - - devres = devres_alloc(devm_attr_groups_remove, - sizeof(*devres), GFP_KERNEL); - if (!devres) - return -ENOMEM; - - error = sysfs_create_groups(&dev->kobj, groups); - if (error) { - devres_free(devres); - return error; - } - - devres->groups = groups; - devres_add(dev, devres); - return 0; -} -EXPORT_SYMBOL_GPL(devm_device_add_groups); - static int device_add_attrs(struct device *dev) { const struct class *class = dev->class; diff --git a/include/linux/device.h b/include/linux/device.h index fc3bd7116ab9..ace039151cb8 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1220,8 +1220,6 @@ static inline void device_remove_group(struct device *dev, return device_remove_groups(dev, groups); } -int __must_check devm_device_add_groups(struct device *dev, - const struct attribute_group **groups); int __must_check devm_device_add_group(struct device *dev, const struct attribute_group *grp); -- cgit v1.2.3 From 44a45be57f85165761fdabf072f9a97aa026ff61 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Thu, 23 May 2024 13:00:00 +0200 Subject: sysfs: Unbreak the build around sysfs_bin_attr_simple_read() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Günter reports build breakage for m68k "m5208evb_defconfig" plus CONFIG_BLK_DEV_INITRD=y caused by commit 66bc1a173328 ("treewide: Use sysfs_bin_attr_simple_read() helper"). The defconfig disables CONFIG_SYSFS, so sysfs_bin_attr_simple_read() is not compiled into the kernel. But init/initramfs.c references that function in the initializer of a struct bin_attribute. Add an empty static inline to avoid the build breakage. Fixes: 66bc1a173328 ("treewide: Use sysfs_bin_attr_simple_read() helper") Reported-by: Guenter Roeck Closes: https://lore.kernel.org/r/e12b0027-b199-4de7-b83d-668171447ccc@roeck-us.net Signed-off-by: Lukas Wunner Tested-by: Guenter Roeck Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/05f4290439a58730738a15b0c99cd8576c4aa0d9.1716461752.git.lukas@wunner.de Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index a7d725fbf739..c4e64dc11206 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -750,6 +750,15 @@ static inline int sysfs_emit_at(char *buf, int at, const char *fmt, ...) { return 0; } + +static inline ssize_t sysfs_bin_attr_simple_read(struct file *file, + struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, + size_t count) +{ + return 0; +} #endif /* CONFIG_SYSFS */ static inline int __must_check sysfs_create_file(struct kobject *kobj, -- cgit v1.2.3 From c9d52fb313d3719d69a040f4ca78a3e2e95fba21 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 30 May 2024 18:04:24 -0700 Subject: PCI: Revert the cfg_access_lock lockdep mechanism While the experiment did reveal that there are additional places that are missing the lock during secondary bus reset, one of the places that needs to take cfg_access_lock (pci_bus_lock()) is not prepared for lockdep annotation. Specifically, pci_bus_lock() takes pci_dev_lock() recursively and is currently dependent on the fact that the device_lock() is marked lockdep_set_novalidate_class(&dev->mutex). Otherwise, without that annotation, pci_bus_lock() would need to use something like a new pci_dev_lock_nested() helper, a scheme to track a PCI device's depth in the topology, and a hope that the depth of a PCI tree never exceeds the max value for a lockdep subclass. The alternative to ripping out the lockdep coverage would be to deploy a dynamic lock key for every PCI device. Unfortunately, there is evidence that increasing the number of keys that lockdep needs to track to be per-PCI-device is prohibitively expensive for something like the cfg_access_lock. The main motivation for adding the annotation in the first place was to catch unlocked secondary bus resets, not necessarily catch lock ordering problems between cfg_access_lock and other locks. Solve that narrower problem with follow-on patches, and just due to targeted revert for now. Link: https://lore.kernel.org/r/171711746402.1628941.14575335981264103013.stgit@dwillia2-xfh.jf.intel.com Fixes: 7e89efc6e9e4 ("PCI: Lock upstream bridge for pci_reset_function()") Reported-by: Imre Deak Closes: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_134186v1/shard-dg2-1/igt@device_reset@unbind-reset-rebind.html Signed-off-by: Dan Williams Signed-off-by: Bjorn Helgaas Tested-by: Hans de Goede Tested-by: Kalle Valo Reviewed-by: Dave Jiang Cc: Jani Saarinen --- drivers/pci/access.c | 4 ---- drivers/pci/pci.c | 1 - drivers/pci/probe.c | 3 --- include/linux/lockdep.h | 5 ----- include/linux/pci.h | 2 -- 5 files changed, 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/access.c b/drivers/pci/access.c index 30f031de9cfe..b123da16b63b 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -289,8 +289,6 @@ void pci_cfg_access_lock(struct pci_dev *dev) { might_sleep(); - lock_map_acquire(&dev->cfg_access_lock); - raw_spin_lock_irq(&pci_lock); if (dev->block_cfg_access) pci_wait_cfg(dev); @@ -345,8 +343,6 @@ void pci_cfg_access_unlock(struct pci_dev *dev) raw_spin_unlock_irqrestore(&pci_lock, flags); wake_up_all(&pci_cfg_wait); - - lock_map_release(&dev->cfg_access_lock); } EXPORT_SYMBOL_GPL(pci_cfg_access_unlock); diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 59e0949fb079..35fb1f17a589 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -4883,7 +4883,6 @@ void __weak pcibios_reset_secondary_bus(struct pci_dev *dev) */ int pci_bridge_secondary_bus_reset(struct pci_dev *dev) { - lock_map_assert_held(&dev->cfg_access_lock); pcibios_reset_secondary_bus(dev); return pci_bridge_wait_for_secondary_bus(dev, "bus reset"); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 8e696e547565..5fbabb4e3425 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2546,9 +2546,6 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus) dev->dev.dma_mask = &dev->dma_mask; dev->dev.dma_parms = &dev->dma_parms; dev->dev.coherent_dma_mask = 0xffffffffull; - lockdep_register_key(&dev->cfg_access_key); - lockdep_init_map(&dev->cfg_access_lock, dev_name(&dev->dev), - &dev->cfg_access_key, 0); dma_set_max_seg_size(&dev->dev, 65536); dma_set_seg_boundary(&dev->dev, 0xffffffff); diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 5e51b0de4c4b..08b0d1d9d78b 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -297,9 +297,6 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); .wait_type_inner = _wait_type, \ .lock_type = LD_LOCK_WAIT_OVERRIDE, } -#define lock_map_assert_held(l) \ - lockdep_assert(lock_is_held(l) != LOCK_STATE_NOT_HELD) - #else /* !CONFIG_LOCKDEP */ static inline void lockdep_init_task(struct task_struct *task) @@ -391,8 +388,6 @@ extern int lockdep_is_held(const void *); #define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ struct lockdep_map __maybe_unused _name = {} -#define lock_map_assert_held(l) do { (void)(l); } while (0) - #endif /* !LOCKDEP */ #ifdef CONFIG_PROVE_LOCKING diff --git a/include/linux/pci.h b/include/linux/pci.h index fb004fd4e889..cafc5ab1cbcb 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -413,8 +413,6 @@ struct pci_dev { struct resource driver_exclusive_resource; /* driver exclusive resource ranges */ bool match_driver; /* Skip attaching driver */ - struct lock_class_key cfg_access_key; - struct lockdep_map cfg_access_lock; unsigned int transparent:1; /* Subtractive decode bridge */ unsigned int io_window:1; /* Bridge has I/O window */ -- cgit v1.2.3 From f92a59f6d12e31ead999fee9585471b95a8ae8a3 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Wed, 15 May 2024 13:37:10 +0000 Subject: locking/atomic: scripts: fix ${atomic}_sub_and_test() kerneldoc For ${atomic}_sub_and_test() the @i parameter is the value to subtract, not add. Fix the typo in the kerneldoc template and generate the headers with this update. Fixes: ad8110706f38 ("locking/atomic: scripts: generate kerneldoc comments") Suggested-by: Mark Rutland Signed-off-by: Carlos Llamas Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Reviewed-by: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20240515133844.3502360-1-cmllamas@google.com --- include/linux/atomic/atomic-arch-fallback.h | 6 +++--- include/linux/atomic/atomic-instrumented.h | 8 ++++---- include/linux/atomic/atomic-long.h | 4 ++-- scripts/atomic/kerneldoc/sub_and_test | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h index 956bcba5dbf2..2f9d36b72bd8 100644 --- a/include/linux/atomic/atomic-arch-fallback.h +++ b/include/linux/atomic/atomic-arch-fallback.h @@ -2242,7 +2242,7 @@ raw_atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) /** * raw_atomic_sub_and_test() - atomic subtract and test if zero with full ordering - * @i: int value to add + * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. @@ -4368,7 +4368,7 @@ raw_atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) /** * raw_atomic64_sub_and_test() - atomic subtract and test if zero with full ordering - * @i: s64 value to add + * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. @@ -4690,4 +4690,4 @@ raw_atomic64_dec_if_positive(atomic64_t *v) } #endif /* _LINUX_ATOMIC_FALLBACK_H */ -// 14850c0b0db20c62fdc78ccd1d42b98b88d76331 +// b565db590afeeff0d7c9485ccbca5bb6e155749f diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h index debd487fe971..9409a6ddf3e0 100644 --- a/include/linux/atomic/atomic-instrumented.h +++ b/include/linux/atomic/atomic-instrumented.h @@ -1349,7 +1349,7 @@ atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) /** * atomic_sub_and_test() - atomic subtract and test if zero with full ordering - * @i: int value to add + * @i: int value to subtract * @v: pointer to atomic_t * * Atomically updates @v to (@v - @i) with full ordering. @@ -2927,7 +2927,7 @@ atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) /** * atomic64_sub_and_test() - atomic subtract and test if zero with full ordering - * @i: s64 value to add + * @i: s64 value to subtract * @v: pointer to atomic64_t * * Atomically updates @v to (@v - @i) with full ordering. @@ -4505,7 +4505,7 @@ atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) /** * atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering - * @i: long value to add + * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. @@ -5050,4 +5050,4 @@ atomic_long_dec_if_positive(atomic_long_t *v) #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ -// ce5b65e0f1f8a276268b667194581d24bed219d4 +// 8829b337928e9508259079d32581775ececd415b diff --git a/include/linux/atomic/atomic-long.h b/include/linux/atomic/atomic-long.h index 3ef844b3ab8a..f86b29d90877 100644 --- a/include/linux/atomic/atomic-long.h +++ b/include/linux/atomic/atomic-long.h @@ -1535,7 +1535,7 @@ raw_atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) /** * raw_atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering - * @i: long value to add + * @i: long value to subtract * @v: pointer to atomic_long_t * * Atomically updates @v to (@v - @i) with full ordering. @@ -1809,4 +1809,4 @@ raw_atomic_long_dec_if_positive(atomic_long_t *v) } #endif /* _LINUX_ATOMIC_LONG_H */ -// 1c4a26fc77f345342953770ebe3c4d08e7ce2f9a +// eadf183c3600b8b92b91839dd3be6bcc560c752d diff --git a/scripts/atomic/kerneldoc/sub_and_test b/scripts/atomic/kerneldoc/sub_and_test index d3760f7749d4..96615e50836b 100644 --- a/scripts/atomic/kerneldoc/sub_and_test +++ b/scripts/atomic/kerneldoc/sub_and_test @@ -1,7 +1,7 @@ cat < Date: Thu, 23 May 2024 10:36:39 +0800 Subject: mm: drop the 'anon_' prefix for swap-out mTHP counters The mTHP swap related counters: 'anon_swpout' and 'anon_swpout_fallback' are confusing with an 'anon_' prefix, since the shmem can swap out non-anonymous pages. So drop the 'anon_' prefix to keep consistent with the old swap counter names. This is needed in 6.10-rcX to avoid having an inconsistent ABI out in the field. Link: https://lkml.kernel.org/r/7a8989c13299920d7589007a30065c3e2c19f0e0.1716431702.git.baolin.wang@linux.alibaba.com Fixes: d0f048ac39f6 ("mm: add per-order mTHP anon_swpout and anon_swpout_fallback counters") Fixes: 42248b9d34ea ("mm: add docs for per-order mTHP counters and transhuge_page ABI") Signed-off-by: Baolin Wang Suggested-by: "Huang, Ying" Acked-by: Barry Song Cc: David Hildenbrand Cc: Lance Yang Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 4 ++-- include/linux/huge_mm.h | 4 ++-- mm/huge_memory.c | 8 ++++---- mm/page_io.c | 2 +- mm/vmscan.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 076443cc10a6..d414d3f5592a 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -467,11 +467,11 @@ anon_fault_fallback_charge instead falls back to using huge pages with lower orders or small pages even though the allocation was successful. -anon_swpout +swpout is incremented every time a huge page is swapped out in one piece without splitting. -anon_swpout_fallback +swpout_fallback is incremented if a huge page has to be split before swapout. Usually because failed to allocate some continuous swap space for the huge page. diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index c8d3ec116e29..8c72d3786583 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -269,8 +269,8 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_ALLOC, MTHP_STAT_ANON_FAULT_FALLBACK, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, - MTHP_STAT_ANON_SWPOUT, - MTHP_STAT_ANON_SWPOUT_FALLBACK, + MTHP_STAT_SWPOUT, + MTHP_STAT_SWPOUT_FALLBACK, __MTHP_STAT_COUNT }; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 317de2afd371..89932fd0f62e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -558,15 +558,15 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); -DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT); -DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK); +DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); +DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); static struct attribute *stats_attrs[] = { &anon_fault_alloc_attr.attr, &anon_fault_fallback_attr.attr, &anon_fault_fallback_charge_attr.attr, - &anon_swpout_attr.attr, - &anon_swpout_fallback_attr.attr, + &swpout_attr.attr, + &swpout_fallback_attr.attr, NULL, }; diff --git a/mm/page_io.c b/mm/page_io.c index 46c603dddf04..0a150c240bf4 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -217,7 +217,7 @@ static inline void count_swpout_vm_event(struct folio *folio) count_memcg_folio_events(folio, THP_SWPOUT, 1); count_vm_event(THP_SWPOUT); } - count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_SWPOUT); + count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT); #endif count_vm_events(PSWPOUT, folio_nr_pages(folio)); } diff --git a/mm/vmscan.c b/mm/vmscan.c index d55e8d07ffc4..2e34de9cd0d4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1227,7 +1227,7 @@ retry: THP_SWPOUT_FALLBACK, 1); count_vm_event(THP_SWPOUT_FALLBACK); } - count_mthp_stat(order, MTHP_STAT_ANON_SWPOUT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); #endif if (!add_to_swap(folio)) goto activate_locked_split; -- cgit v1.2.3 From 94d46bf17916965e918bd2f3d2eec057f7c5578d Mon Sep 17 00:00:00 2001 From: Barry Song Date: Fri, 24 May 2024 08:50:48 +1200 Subject: mm: huge_mm: fix undefined reference to `mthp_stats' for CONFIG_SYSFS=n if CONFIG_SYSFS is not enabled in config, we get the below error, All errors (new ones prefixed by >>): s390-linux-ld: mm/memory.o: in function `count_mthp_stat': >> include/linux/huge_mm.h:285:(.text+0x191c): undefined reference to `mthp_stats' s390-linux-ld: mm/huge_memory.o:(.rodata+0x10): undefined reference to `mthp_stats' vim +285 include/linux/huge_mm.h 279 280 static inline void count_mthp_stat(int order, enum mthp_stat_item item) 281 { 282 if (order <= 0 || order > PMD_ORDER) 283 return; 284 > 285 this_cpu_inc(mthp_stats.stats[order][item]); 286 } 287 Link: https://lkml.kernel.org/r/20240523210045.40444-1-21cnbao@gmail.com Fixes: ec33687c6749 ("mm: add per-order mTHP anon_fault_alloc and anon_fault_fallback counters") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202405231728.tCAogiSI-lkp@intel.com/ Signed-off-by: Barry Song Tested-by: Yujie Liu Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 8c72d3786583..2aa986a5cd1b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -278,6 +278,7 @@ struct mthp_stat { unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT]; }; +#ifdef CONFIG_SYSFS DECLARE_PER_CPU(struct mthp_stat, mthp_stats); static inline void count_mthp_stat(int order, enum mthp_stat_item item) @@ -287,6 +288,11 @@ static inline void count_mthp_stat(int order, enum mthp_stat_item item) this_cpu_inc(mthp_stats.stats[order][item]); } +#else +static inline void count_mthp_stat(int order, enum mthp_stat_item item) +{ +} +#endif #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ -- cgit v1.2.3 From c2dc78b86e0821ecf9a9d0c35dba2618279a5bb6 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Tue, 28 May 2024 13:15:22 +0800 Subject: mm/ksm: fix ksm_zero_pages accounting We normally ksm_zero_pages++ in ksmd when page is merged with zero page, but ksm_zero_pages-- is done from page tables side, where there is no any accessing protection of ksm_zero_pages. So we can read very exceptional value of ksm_zero_pages in rare cases, such as -1, which is very confusing to users. Fix it by changing to use atomic_long_t, and the same case with the mm->ksm_zero_pages. Link: https://lkml.kernel.org/r/20240528-b4-ksm-counters-v3-2-34bb358fdc13@linux.dev Fixes: e2942062e01d ("ksm: count all zero pages placed by KSM") Fixes: 6080d19f0704 ("ksm: add ksm zero pages for each process") Signed-off-by: Chengming Zhou Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Ran Xiaokai Cc: Stefan Roesch Cc: xu xin Cc: Yang Yang Cc: Signed-off-by: Andrew Morton --- fs/proc/base.c | 2 +- include/linux/ksm.h | 17 ++++++++++++++--- include/linux/mm_types.h | 2 +- mm/ksm.c | 11 +++++------ 4 files changed, 21 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/base.c b/fs/proc/base.c index 18550c071d71..72a1acd03675 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3214,7 +3214,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); - seq_printf(m, "ksm_zero_pages %lu\n", mm->ksm_zero_pages); + seq_printf(m, "ksm_zero_pages %ld\n", mm_ksm_zero_pages(mm)); seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); mmput(mm); diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 52c63a9c5a9c..11690dacd986 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -33,16 +33,27 @@ void __ksm_exit(struct mm_struct *mm); */ #define is_ksm_zero_pte(pte) (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte)) -extern unsigned long ksm_zero_pages; +extern atomic_long_t ksm_zero_pages; + +static inline void ksm_map_zero_page(struct mm_struct *mm) +{ + atomic_long_inc(&ksm_zero_pages); + atomic_long_inc(&mm->ksm_zero_pages); +} static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { if (is_ksm_zero_pte(pte)) { - ksm_zero_pages--; - mm->ksm_zero_pages--; + atomic_long_dec(&ksm_zero_pages); + atomic_long_dec(&mm->ksm_zero_pages); } } +static inline long mm_ksm_zero_pages(struct mm_struct *mm) +{ + return atomic_long_read(&mm->ksm_zero_pages); +} + static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 24323c7d0bd4..af3a0256fa93 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -985,7 +985,7 @@ struct mm_struct { * Represent how many empty pages are merged with kernel zero * pages when enabling KSM use_zero_pages. */ - unsigned long ksm_zero_pages; + atomic_long_t ksm_zero_pages; #endif /* CONFIG_KSM */ #ifdef CONFIG_LRU_GEN_WALKS_MMU struct { diff --git a/mm/ksm.c b/mm/ksm.c index 9e99cb12d330..34c4820e0d3d 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -296,7 +296,7 @@ static bool ksm_use_zero_pages __read_mostly; static bool ksm_smart_scan = true; /* The number of zero pages which is placed by KSM */ -unsigned long ksm_zero_pages; +atomic_long_t ksm_zero_pages = ATOMIC_LONG_INIT(0); /* The number of pages that have been skipped due to "smart scanning" */ static unsigned long ksm_pages_skipped; @@ -1429,8 +1429,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * the dirty bit in zero page's PTE is set. */ newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot))); - ksm_zero_pages++; - mm->ksm_zero_pages++; + ksm_map_zero_page(mm); /* * We're replacing an anonymous page with a zero page, which is * not anonymous. We need to do proper accounting otherwise we @@ -3374,7 +3373,7 @@ static void wait_while_offlining(void) #ifdef CONFIG_PROC_FS long ksm_process_profit(struct mm_struct *mm) { - return (long)(mm->ksm_merging_pages + mm->ksm_zero_pages) * PAGE_SIZE - + return (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm)) * PAGE_SIZE - mm->ksm_rmap_items * sizeof(struct ksm_rmap_item); } #endif /* CONFIG_PROC_FS */ @@ -3663,7 +3662,7 @@ KSM_ATTR_RO(pages_skipped); static ssize_t ksm_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sysfs_emit(buf, "%ld\n", ksm_zero_pages); + return sysfs_emit(buf, "%ld\n", atomic_long_read(&ksm_zero_pages)); } KSM_ATTR_RO(ksm_zero_pages); @@ -3672,7 +3671,7 @@ static ssize_t general_profit_show(struct kobject *kobj, { long general_profit; - general_profit = (ksm_pages_sharing + ksm_zero_pages) * PAGE_SIZE - + general_profit = (ksm_pages_sharing + atomic_long_read(&ksm_zero_pages)) * PAGE_SIZE - ksm_rmap_items * sizeof(struct ksm_rmap_item); return sysfs_emit(buf, "%ld\n", general_profit); -- cgit v1.2.3 From 0ee14725471cea66e03e3cd4f4c582d759de502c Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Thu, 6 Jun 2024 15:46:09 +0100 Subject: mm/util: Swap kmemdup_array() arguments GCC 14.1 complains about the argument usage of kmemdup_array(): drivers/soc/tegra/fuse/fuse-tegra.c:130:65: error: 'kmemdup_array' sizes specified with 'sizeof' in the earlier argument and not in the later argument [-Werror=calloc-transposed-args] 130 | fuse->lookups = kmemdup_array(fuse->soc->lookups, sizeof(*fuse->lookups), | ^ drivers/soc/tegra/fuse/fuse-tegra.c:130:65: note: earlier argument should specify number of elements, later size of each element The annotation introduced by commit 7d78a7773355 ("string: Add additional __realloc_size() annotations for "dup" helpers") lets the compiler think that kmemdup_array() follows the same format as calloc(), with the number of elements preceding the size of one element. So we could simply swap the arguments to __realloc_size() to get rid of that warning, but it seems cleaner to instead have kmemdup_array() follow the same format as krealloc_array(), memdup_array_user(), calloc() etc. Fixes: 7d78a7773355 ("string: Add additional __realloc_size() annotations for "dup" helpers") Signed-off-by: Jean-Philippe Brucker Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20240606144608.97817-2-jean-philippe@linaro.org Signed-off-by: Kees Cook --- drivers/soc/tegra/fuse/fuse-tegra.c | 4 ++-- include/linux/string.h | 2 +- lib/fortify_kunit.c | 2 +- mm/util.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/soc/tegra/fuse/fuse-tegra.c b/drivers/soc/tegra/fuse/fuse-tegra.c index b6bfd6729df3..d27667283846 100644 --- a/drivers/soc/tegra/fuse/fuse-tegra.c +++ b/drivers/soc/tegra/fuse/fuse-tegra.c @@ -127,8 +127,8 @@ static void tegra_fuse_print_sku_info(struct tegra_sku_info *tegra_sku_info) static int tegra_fuse_add_lookups(struct tegra_fuse *fuse) { - fuse->lookups = kmemdup_array(fuse->soc->lookups, sizeof(*fuse->lookups), - fuse->soc->num_lookups, GFP_KERNEL); + fuse->lookups = kmemdup_array(fuse->soc->lookups, fuse->soc->num_lookups, + sizeof(*fuse->lookups), GFP_KERNEL); if (!fuse->lookups) return -ENOMEM; diff --git a/include/linux/string.h b/include/linux/string.h index 60168aa2af07..9edace076ddb 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -289,7 +289,7 @@ extern void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) __realloc_si extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp); -extern void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp) +extern void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp) __realloc_size(2, 3); /* lib/argv_split.c */ diff --git a/lib/fortify_kunit.c b/lib/fortify_kunit.c index f9cc467334ce..e17d520f532c 100644 --- a/lib/fortify_kunit.c +++ b/lib/fortify_kunit.c @@ -374,7 +374,7 @@ static const char * const test_strs[] = { for (i = 0; i < ARRAY_SIZE(test_strs); i++) { \ len = strlen(test_strs[i]); \ KUNIT_EXPECT_EQ(test, __builtin_constant_p(len), 0); \ - checker(len, kmemdup_array(test_strs[i], len, 1, gfp), \ + checker(len, kmemdup_array(test_strs[i], 1, len, gfp), \ kfree(p)); \ checker(len, kmemdup(test_strs[i], len, gfp), \ kfree(p)); \ diff --git a/mm/util.c b/mm/util.c index c9e519e6811f..6682097372ef 100644 --- a/mm/util.c +++ b/mm/util.c @@ -139,14 +139,14 @@ EXPORT_SYMBOL(kmemdup_noprof); * kmemdup_array - duplicate a given array. * * @src: array to duplicate. - * @element_size: size of each element of array. * @count: number of elements to duplicate from array. + * @element_size: size of each element of array. * @gfp: GFP mask to use. * * Return: duplicated array of @src or %NULL in case of error, * result is physically contiguous. Use kfree() to free. */ -void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp) +void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp) { return kmemdup(src, size_mul(element_size, count), gfp); } -- cgit v1.2.3 From 231035f18d6b80e5c28732a20872398116a54ecd Mon Sep 17 00:00:00 2001 From: Wenchao Hao Date: Thu, 6 Jun 2024 16:52:15 +0800 Subject: workqueue: Increase worker desc's length to 32 Commit 31c89007285d ("workqueue.c: Increase workqueue name length") increased WQ_NAME_LEN from 24 to 32, but forget to increase WORKER_DESC_LEN, which would cause truncation when setting kworker's desc from workqueue_struct's name, process_one_work() for example. Fixes: 31c89007285d ("workqueue.c: Increase workqueue name length") Signed-off-by: Wenchao Hao CC: Audra Mitchell Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index fb3993894536..d9968bfc8eac 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -95,7 +95,7 @@ enum wq_misc_consts { WORK_BUSY_RUNNING = 1 << 1, /* maximum string length for set_worker_desc() */ - WORKER_DESC_LEN = 24, + WORKER_DESC_LEN = 32, }; /* Convenience constants - of type 'unsigned long', not 'enum'! */ -- cgit v1.2.3 From 144ba8580bcb82b2686c3d1a043299d844b9a682 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Mon, 10 Jun 2024 10:34:26 +0200 Subject: net: pse-pd: Use EOPNOTSUPP error code instead of ENOTSUPP ENOTSUPP is not a SUSV4 error code, prefer EOPNOTSUPP as reported by checkpatch script. Fixes: 18ff0bcda6d1 ("ethtool: add interface to interact with Ethernet Power Equipment") Reviewed-by: Andrew Lunn Acked-by: Oleksij Rempel Signed-off-by: Kory Maincent Link: https://lore.kernel.org/r/20240610083426.740660-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- include/linux/pse-pd/pse.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index 6d07c95dabb9..6eec24ffa866 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -167,14 +167,14 @@ static inline int pse_ethtool_get_status(struct pse_control *psec, struct netlink_ext_ack *extack, struct pse_control_status *status) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline int pse_ethtool_set_config(struct pse_control *psec, struct netlink_ext_ack *extack, const struct pse_control_config *config) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline bool pse_has_podl(struct pse_control *psec) -- cgit v1.2.3 From 3b9c181bcde8555ca81b2394c2dc2201cefc2dd4 Mon Sep 17 00:00:00 2001 From: José Roberto de Souza Date: Tue, 11 Jun 2024 10:47:15 -0700 Subject: devcoredump: Add dev_coredumpm_timeout() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add function to set a custom coredump timeout. For Xe driver usage, current 5 minutes timeout may be too short for users to search and understand what needs to be done to capture coredump to report bugs. We have plans to automate(distribute a udev script) it but at the end will be up to distros and users to pack it so having a option to increase the timeout is a safer option. v2: - replace dev_coredump_timeout_set() by dev_coredumpm_timeout() (Mukesh) v3: - make dev_coredumpm() static inline (Johannes) v5: - rename DEVCOREDUMP_TIMEOUT -> DEVCD_TIMEOUT to avoid redefinition in include/net/bluetooth/coredump.h v6: - fix definition of dev_coredumpm_timeout() when CONFIG_DEV_COREDUMP is disabled Cc: Rodrigo Vivi Cc: Mukesh Ojha Cc: Johannes Berg Cc: Jonathan Cavitt Reviewed-by: Rodrigo Vivi Reviewed-by: Jonathan Cavitt Signed-off-by: José Roberto de Souza Acked-by: Greg Kroah-Hartman Acked-by: Johannes Berg Link: https://patchwork.freedesktop.org/patch/msgid/20240611174716.72660-1-jose.souza@intel.com Signed-off-by: Rodrigo Vivi --- drivers/base/devcoredump.c | 23 ++++++++++---------- include/linux/devcoredump.h | 53 +++++++++++++++++++++++++++++++++++---------- 2 files changed, 54 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c index 82aeb09b3d1b..c795edad1b96 100644 --- a/drivers/base/devcoredump.c +++ b/drivers/base/devcoredump.c @@ -18,9 +18,6 @@ static struct class devcd_class; /* global disable flag, for security purposes */ static bool devcd_disabled; -/* if data isn't read by userspace after 5 minutes then delete it */ -#define DEVCD_TIMEOUT (HZ * 60 * 5) - struct devcd_entry { struct device devcd_dev; void *data; @@ -328,7 +325,8 @@ void dev_coredump_put(struct device *dev) EXPORT_SYMBOL_GPL(dev_coredump_put); /** - * dev_coredumpm - create device coredump with read/free methods + * dev_coredumpm_timeout - create device coredump with read/free methods with a + * custom timeout. * @dev: the struct device for the crashed device * @owner: the module that contains the read/free functions, use %THIS_MODULE * @data: data cookie for the @read/@free functions @@ -336,17 +334,20 @@ EXPORT_SYMBOL_GPL(dev_coredump_put); * @gfp: allocation flags * @read: function to read from the given buffer * @free: function to free the given buffer + * @timeout: time in jiffies to remove coredump * * Creates a new device coredump for the given device. If a previous one hasn't * been read yet, the new coredump is discarded. The data lifetime is determined * by the device coredump framework and when it is no longer needed the @free * function will be called to free the data. */ -void dev_coredumpm(struct device *dev, struct module *owner, - void *data, size_t datalen, gfp_t gfp, - ssize_t (*read)(char *buffer, loff_t offset, size_t count, - void *data, size_t datalen), - void (*free)(void *data)) +void dev_coredumpm_timeout(struct device *dev, struct module *owner, + void *data, size_t datalen, gfp_t gfp, + ssize_t (*read)(char *buffer, loff_t offset, + size_t count, void *data, + size_t datalen), + void (*free)(void *data), + unsigned long timeout) { static atomic_t devcd_count = ATOMIC_INIT(0); struct devcd_entry *devcd; @@ -403,7 +404,7 @@ void dev_coredumpm(struct device *dev, struct module *owner, dev_set_uevent_suppress(&devcd->devcd_dev, false); kobject_uevent(&devcd->devcd_dev.kobj, KOBJ_ADD); INIT_DELAYED_WORK(&devcd->del_wk, devcd_del); - schedule_delayed_work(&devcd->del_wk, DEVCD_TIMEOUT); + schedule_delayed_work(&devcd->del_wk, timeout); mutex_unlock(&devcd->mutex); return; put_device: @@ -414,7 +415,7 @@ void dev_coredumpm(struct device *dev, struct module *owner, free: free(data); } -EXPORT_SYMBOL_GPL(dev_coredumpm); +EXPORT_SYMBOL_GPL(dev_coredumpm_timeout); /** * dev_coredumpsg - create device coredump that uses scatterlist as data diff --git a/include/linux/devcoredump.h b/include/linux/devcoredump.h index c8f7eb6cc191..377892604ff4 100644 --- a/include/linux/devcoredump.h +++ b/include/linux/devcoredump.h @@ -12,6 +12,9 @@ #include #include +/* if data isn't read by userspace after 5 minutes then delete it */ +#define DEVCD_TIMEOUT (HZ * 60 * 5) + /* * _devcd_free_sgtable - free all the memory of the given scatterlist table * (i.e. both pages and scatterlist instances) @@ -50,16 +53,17 @@ static inline void _devcd_free_sgtable(struct scatterlist *table) kfree(delete_iter); } - #ifdef CONFIG_DEV_COREDUMP void dev_coredumpv(struct device *dev, void *data, size_t datalen, gfp_t gfp); -void dev_coredumpm(struct device *dev, struct module *owner, - void *data, size_t datalen, gfp_t gfp, - ssize_t (*read)(char *buffer, loff_t offset, size_t count, - void *data, size_t datalen), - void (*free)(void *data)); +void dev_coredumpm_timeout(struct device *dev, struct module *owner, + void *data, size_t datalen, gfp_t gfp, + ssize_t (*read)(char *buffer, loff_t offset, + size_t count, void *data, + size_t datalen), + void (*free)(void *data), + unsigned long timeout); void dev_coredumpsg(struct device *dev, struct scatterlist *table, size_t datalen, gfp_t gfp); @@ -73,11 +77,13 @@ static inline void dev_coredumpv(struct device *dev, void *data, } static inline void -dev_coredumpm(struct device *dev, struct module *owner, - void *data, size_t datalen, gfp_t gfp, - ssize_t (*read)(char *buffer, loff_t offset, size_t count, - void *data, size_t datalen), - void (*free)(void *data)) +dev_coredumpm_timeout(struct device *dev, struct module *owner, + void *data, size_t datalen, gfp_t gfp, + ssize_t (*read)(char *buffer, loff_t offset, + size_t count, void *data, + size_t datalen), + void (*free)(void *data), + unsigned long timeout) { free(data); } @@ -92,4 +98,29 @@ static inline void dev_coredump_put(struct device *dev) } #endif /* CONFIG_DEV_COREDUMP */ +/** + * dev_coredumpm - create device coredump with read/free methods + * @dev: the struct device for the crashed device + * @owner: the module that contains the read/free functions, use %THIS_MODULE + * @data: data cookie for the @read/@free functions + * @datalen: length of the data + * @gfp: allocation flags + * @read: function to read from the given buffer + * @free: function to free the given buffer + * + * Creates a new device coredump for the given device. If a previous one hasn't + * been read yet, the new coredump is discarded. The data lifetime is determined + * by the device coredump framework and when it is no longer needed the @free + * function will be called to free the data. + */ +static inline void dev_coredumpm(struct device *dev, struct module *owner, + void *data, size_t datalen, gfp_t gfp, + ssize_t (*read)(char *buffer, loff_t offset, size_t count, + void *data, size_t datalen), + void (*free)(void *data)) +{ + dev_coredumpm_timeout(dev, owner, data, datalen, gfp, read, free, + DEVCD_TIMEOUT); +} + #endif /* __DEVCOREDUMP_H */ -- cgit v1.2.3 From e038ee6189842e9662d2fc59d09dbcf48350cf99 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Mon, 10 Jun 2024 16:41:44 +0530 Subject: block: unmap and free user mapped integrity via submitter The user mapped intergity is copied back and unpinned by bio_integrity_free which is a low-level routine. Do it via the submitter rather than doing it in the low-level block layer code, to split the submitter side from the consumer side of the bio. Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20240610111144.14647-1-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 26 ++++++++++++++++++++++++-- drivers/nvme/host/ioctl.c | 15 +++++++++++---- include/linux/bio.h | 4 ++++ 3 files changed, 39 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 2e3e8e04961e..8b528e12136f 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -144,16 +144,38 @@ void bio_integrity_free(struct bio *bio) struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_set *bs = bio->bi_pool; + if (bip->bip_flags & BIP_INTEGRITY_USER) + return; if (bip->bip_flags & BIP_BLOCK_INTEGRITY) kfree(bvec_virt(bip->bip_vec)); - else if (bip->bip_flags & BIP_INTEGRITY_USER) - bio_integrity_unmap_user(bip); __bio_integrity_free(bs, bip); bio->bi_integrity = NULL; bio->bi_opf &= ~REQ_INTEGRITY; } +/** + * bio_integrity_unmap_free_user - Unmap and free bio user integrity payload + * @bio: bio containing bip to be unmapped and freed + * + * Description: Used to unmap and free the user mapped integrity portion of a + * bio. Submitter attaching the user integrity buffer is responsible for + * unmapping and freeing it during completion. + */ +void bio_integrity_unmap_free_user(struct bio *bio) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct bio_set *bs = bio->bi_pool; + + if (WARN_ON_ONCE(!(bip->bip_flags & BIP_INTEGRITY_USER))) + return; + bio_integrity_unmap_user(bip); + __bio_integrity_free(bs, bip); + bio->bi_integrity = NULL; + bio->bi_opf &= ~REQ_INTEGRITY; +} +EXPORT_SYMBOL(bio_integrity_unmap_free_user); + /** * bio_integrity_add_page - Attach integrity metadata * @bio: bio to update diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 9d9d2a127c4e..8b69427a4476 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -111,6 +111,13 @@ static struct request *nvme_alloc_user_request(struct request_queue *q, return req; } +static void nvme_unmap_bio(struct bio *bio) +{ + if (bio_integrity(bio)) + bio_integrity_unmap_free_user(bio); + blk_rq_unmap_user(bio); +} + static int nvme_map_user_request(struct request *req, u64 ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, u32 meta_seed, struct io_uring_cmd *ioucmd, unsigned int flags) @@ -157,7 +164,7 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, out_unmap: if (bio) - blk_rq_unmap_user(bio); + nvme_unmap_bio(bio); out: blk_mq_free_request(req); return ret; @@ -195,7 +202,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, if (result) *result = le64_to_cpu(nvme_req(req)->result.u64); if (bio) - blk_rq_unmap_user(bio); + nvme_unmap_bio(bio); blk_mq_free_request(req); if (effects) @@ -406,7 +413,7 @@ static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd, struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); if (pdu->bio) - blk_rq_unmap_user(pdu->bio); + nvme_unmap_bio(pdu->bio); io_uring_cmd_done(ioucmd, pdu->status, pdu->result, issue_flags); } @@ -432,7 +439,7 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, */ if (blk_rq_is_poll(req)) { if (pdu->bio) - blk_rq_unmap_user(pdu->bio); + nvme_unmap_bio(pdu->bio); io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status); } else { io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); diff --git a/include/linux/bio.h b/include/linux/bio.h index d5379548d684..818e93612947 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -731,6 +731,7 @@ static inline bool bioset_initialized(struct bio_set *bs) bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed); +void bio_integrity_unmap_free_user(struct bio *bio); extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); extern bool bio_integrity_prep(struct bio *); @@ -807,6 +808,9 @@ static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, { return -EINVAL; } +static inline void bio_integrity_unmap_free_user(struct bio *bio) +{ +} #endif /* CONFIG_BLK_DEV_INTEGRITY */ -- cgit v1.2.3 From 92424801261d1564a0bb759da3cf3ccd69fdf5a2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 13 Jun 2024 13:53:08 +0200 Subject: bpf: Fix reg_set_min_max corruption of fake_reg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Juan reported that after doing some changes to buzzer [0] and implementing a new fuzzing strategy guided by coverage, they noticed the following in one of the probes: [...] 13: (79) r6 = *(u64 *)(r0 +0) ; R0=map_value(ks=4,vs=8) R6_w=scalar() 14: (b7) r0 = 0 ; R0_w=0 15: (b4) w0 = -1 ; R0_w=0xffffffff 16: (74) w0 >>= 1 ; R0_w=0x7fffffff 17: (5c) w6 &= w0 ; R0_w=0x7fffffff R6_w=scalar(smin=smin32=0,smax=umax=umax32=0x7fffffff,var_off=(0x0; 0x7fffffff)) 18: (44) w6 |= 2 ; R6_w=scalar(smin=umin=smin32=umin32=2,smax=umax=umax32=0x7fffffff,var_off=(0x2; 0x7ffffffd)) 19: (56) if w6 != 0x7ffffffd goto pc+1 REG INVARIANTS VIOLATION (true_reg2): range bounds violation u64=[0x7fffffff, 0x7ffffffd] s64=[0x7fffffff, 0x7ffffffd] u32=[0x7fffffff, 0x7ffffffd] s32=[0x7fffffff, 0x7ffffffd] var_off=(0x7fffffff, 0x0) REG INVARIANTS VIOLATION (false_reg1): range bounds violation u64=[0x7fffffff, 0x7ffffffd] s64=[0x7fffffff, 0x7ffffffd] u32=[0x7fffffff, 0x7ffffffd] s32=[0x7fffffff, 0x7ffffffd] var_off=(0x7fffffff, 0x0) REG INVARIANTS VIOLATION (false_reg2): const tnum out of sync with range bounds u64=[0x0, 0xffffffffffffffff] s64=[0x8000000000000000, 0x7fffffffffffffff] u32=[0x0, 0xffffffff] s32=[0x80000000, 0x7fffffff] var_off=(0x7fffffff, 0x0) 19: R6_w=0x7fffffff 20: (95) exit from 19 to 21: R0=0x7fffffff R6=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x7ffffffe,var_off=(0x2; 0x7ffffffd)) R7=map_ptr(ks=4,vs=8) R9=ctx() R10=fp0 fp-24=map_ptr(ks=4,vs=8) fp-40=mmmmmmmm 21: R0=0x7fffffff R6=scalar(smin=umin=smin32=umin32=2,smax=umax=smax32=umax32=0x7ffffffe,var_off=(0x2; 0x7ffffffd)) R7=map_ptr(ks=4,vs=8) R9=ctx() R10=fp0 fp-24=map_ptr(ks=4,vs=8) fp-40=mmmmmmmm 21: (14) w6 -= 2147483632 ; R6_w=scalar(smin=umin=umin32=2,smax=umax=0xffffffff,smin32=0x80000012,smax32=14,var_off=(0x2; 0xfffffffd)) 22: (76) if w6 s>= 0xe goto pc+1 ; R6_w=scalar(smin=umin=umin32=2,smax=umax=0xffffffff,smin32=0x80000012,smax32=13,var_off=(0x2; 0xfffffffd)) 23: (95) exit from 22 to 24: R0=0x7fffffff R6_w=14 R7=map_ptr(ks=4,vs=8) R9=ctx() R10=fp0 fp-24=map_ptr(ks=4,vs=8) fp-40=mmmmmmmm 24: R0=0x7fffffff R6_w=14 R7=map_ptr(ks=4,vs=8) R9=ctx() R10=fp0 fp-24=map_ptr(ks=4,vs=8) fp-40=mmmmmmmm 24: (14) w6 -= 14 ; R6_w=0 [...] What can be seen here is a register invariant violation on line 19. After the binary-or in line 18, the verifier knows that bit 2 is set but knows nothing about the rest of the content which was loaded from a map value, meaning, range is [2,0x7fffffff] with var_off=(0x2; 0x7ffffffd). When in line 19 the verifier analyzes the branch, it splits the register states in reg_set_min_max() into the registers of the true branch (true_reg1, true_reg2) and the registers of the false branch (false_reg1, false_reg2). Since the test is w6 != 0x7ffffffd, the src_reg is a known constant. Internally, the verifier creates a "fake" register initialized as scalar to the value of 0x7ffffffd, and then passes it onto reg_set_min_max(). Now, for line 19, it is mathematically impossible to take the false branch of this program, yet the verifier analyzes it. It is impossible because the second bit of r6 will be set due to the prior or operation and the constant in the condition has that bit unset (hex(fd) == binary(1111 1101). When the verifier first analyzes the false / fall-through branch, it will compute an intersection between the var_off of r6 and of the constant. This is because the verifier creates a "fake" register initialized to the value of the constant. The intersection result later refines both registers in regs_refine_cond_op(): [...] t = tnum_intersect(tnum_subreg(reg1->var_off), tnum_subreg(reg2->var_off)); reg1->var_off = tnum_with_subreg(reg1->var_off, t); reg2->var_off = tnum_with_subreg(reg2->var_off, t); [...] Since the verifier is analyzing the false branch of the conditional jump, reg1 is equal to false_reg1 and reg2 is equal to false_reg2, i.e. the reg2 is the "fake" register that was meant to hold a constant value. The resulting var_off of the intersection says that both registers now hold a known value of var_off=(0x7fffffff, 0x0) or in other words: this operation manages to make the verifier think that the "constant" value that was passed in the jump operation now holds a different value. Normally this would not be an issue since it should not influence the true branch, however, false_reg2 and true_reg2 are pointers to the same "fake" register. Meaning, the false branch can influence the results of the true branch. In line 24, the verifier assumes R6_w=0, but the actual runtime value in this case is 1. The fix is simply not passing in the same "fake" register location as inputs to reg_set_min_max(), but instead making a copy. Moving the fake_reg into the env also reduces stack consumption by 120 bytes. With this, the verifier successfully rejects invalid accesses from the test program. [0] https://github.com/google/buzzer Fixes: 67420501e868 ("bpf: generalize reg_set_min_max() to handle non-const register comparisons") Reported-by: Juan José López Jaimez Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/r/20240613115310.25383-1-daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 ++ kernel/bpf/verifier.c | 14 ++++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 50aa87f8d77f..e4070fb02b11 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -746,6 +746,8 @@ struct bpf_verifier_env { /* Same as scratched_regs but for stack slots */ u64 scratched_stack_slots; u64 prev_log_pos, prev_insn_print_pos; + /* buffer used to temporary hold constants as scalar registers */ + struct bpf_reg_state fake_reg[2]; /* buffer used to generate temporary string representations, * e.g., in reg_type_str() to generate reg_type string */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 36ef8e96787e..f455548ba46c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15113,7 +15113,6 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; struct bpf_reg_state *eq_branch_regs; - struct bpf_reg_state fake_reg = {}; u8 opcode = BPF_OP(insn->code); bool is_jmp32; int pred = -1; @@ -15179,7 +15178,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); return -EINVAL; } - src_reg = &fake_reg; + src_reg = &env->fake_reg[0]; + memset(src_reg, 0, sizeof(*src_reg)); src_reg->type = SCALAR_VALUE; __mark_reg_known(src_reg, insn->imm); } @@ -15239,10 +15239,16 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, &other_branch_regs[insn->src_reg], dst_reg, src_reg, opcode, is_jmp32); } else /* BPF_SRC(insn->code) == BPF_K */ { + /* reg_set_min_max() can mangle the fake_reg. Make a copy + * so that these are two different memory locations. The + * src_reg is not used beyond here in context of K. + */ + memcpy(&env->fake_reg[1], &env->fake_reg[0], + sizeof(env->fake_reg[0])); err = reg_set_min_max(env, &other_branch_regs[insn->dst_reg], - src_reg /* fake one */, - dst_reg, src_reg /* same fake one */, + &env->fake_reg[0], + dst_reg, &env->fake_reg[1], opcode, is_jmp32); } if (err) -- cgit v1.2.3 From 9a95c5bfbf02a0a7f5983280fe284a0ff0836c34 Mon Sep 17 00:00:00 2001 From: GUO Zihua Date: Tue, 7 May 2024 01:25:41 +0000 Subject: ima: Avoid blocking in RCU read-side critical section A panic happens in ima_match_policy: BUG: unable to handle kernel NULL pointer dereference at 0000000000000010 PGD 42f873067 P4D 0 Oops: 0000 [#1] SMP NOPTI CPU: 5 PID: 1286325 Comm: kubeletmonit.sh Kdump: loaded Tainted: P Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015 RIP: 0010:ima_match_policy+0x84/0x450 Code: 49 89 fc 41 89 cf 31 ed 89 44 24 14 eb 1c 44 39 7b 18 74 26 41 83 ff 05 74 20 48 8b 1b 48 3b 1d f2 b9 f4 00 0f 84 9c 01 00 00 <44> 85 73 10 74 ea 44 8b 6b 14 41 f6 c5 01 75 d4 41 f6 c5 02 74 0f RSP: 0018:ff71570009e07a80 EFLAGS: 00010207 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000200 RDX: ffffffffad8dc7c0 RSI: 0000000024924925 RDI: ff3e27850dea2000 RBP: 0000000000000000 R08: 0000000000000000 R09: ffffffffabfce739 R10: ff3e27810cc42400 R11: 0000000000000000 R12: ff3e2781825ef970 R13: 00000000ff3e2785 R14: 000000000000000c R15: 0000000000000001 FS: 00007f5195b51740(0000) GS:ff3e278b12d40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000010 CR3: 0000000626d24002 CR4: 0000000000361ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ima_get_action+0x22/0x30 process_measurement+0xb0/0x830 ? page_add_file_rmap+0x15/0x170 ? alloc_set_pte+0x269/0x4c0 ? prep_new_page+0x81/0x140 ? simple_xattr_get+0x75/0xa0 ? selinux_file_open+0x9d/0xf0 ima_file_check+0x64/0x90 path_openat+0x571/0x1720 do_filp_open+0x9b/0x110 ? page_counter_try_charge+0x57/0xc0 ? files_cgroup_alloc_fd+0x38/0x60 ? __alloc_fd+0xd4/0x250 ? do_sys_open+0x1bd/0x250 do_sys_open+0x1bd/0x250 do_syscall_64+0x5d/0x1d0 entry_SYSCALL_64_after_hwframe+0x65/0xca Commit c7423dbdbc9e ("ima: Handle -ESTALE returned by ima_filter_rule_match()") introduced call to ima_lsm_copy_rule within a RCU read-side critical section which contains kmalloc with GFP_KERNEL. This implies a possible sleep and violates limitations of RCU read-side critical sections on non-PREEMPT systems. Sleeping within RCU read-side critical section might cause synchronize_rcu() returning early and break RCU protection, allowing a UAF to happen. The root cause of this issue could be described as follows: | Thread A | Thread B | | |ima_match_policy | | | rcu_read_lock | |ima_lsm_update_rule | | | synchronize_rcu | | | | kmalloc(GFP_KERNEL)| | | sleep | ==> synchronize_rcu returns early | kfree(entry) | | | | entry = entry->next| ==> UAF happens and entry now becomes NULL (or could be anything). | | entry->action | ==> Accessing entry might cause panic. To fix this issue, we are converting all kmalloc that is called within RCU read-side critical section to use GFP_ATOMIC. Fixes: c7423dbdbc9e ("ima: Handle -ESTALE returned by ima_filter_rule_match()") Cc: stable@vger.kernel.org Signed-off-by: GUO Zihua Acked-by: John Johansen Reviewed-by: Mimi Zohar Reviewed-by: Casey Schaufler [PM: fixed missing comment, long lines, !CONFIG_IMA_LSM_RULES case] Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 2 +- include/linux/security.h | 5 +++-- kernel/auditfilter.c | 5 +++-- security/apparmor/audit.c | 6 +++--- security/apparmor/include/audit.h | 2 +- security/integrity/ima/ima.h | 2 +- security/integrity/ima/ima_policy.c | 15 +++++++++------ security/security.c | 6 ++++-- security/selinux/include/audit.h | 4 +++- security/selinux/ss/services.c | 5 +++-- security/smack/smack_lsm.c | 4 +++- 11 files changed, 34 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index f804b76cde44..44488b1ab9a9 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -413,7 +413,7 @@ LSM_HOOK(void, LSM_RET_VOID, key_post_create_or_update, struct key *keyring, #ifdef CONFIG_AUDIT LSM_HOOK(int, 0, audit_rule_init, u32 field, u32 op, char *rulestr, - void **lsmrule) + void **lsmrule, gfp_t gfp) LSM_HOOK(int, 0, audit_rule_known, struct audit_krule *krule) LSM_HOOK(int, 0, audit_rule_match, u32 secid, u32 field, u32 op, void *lsmrule) LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule) diff --git a/include/linux/security.h b/include/linux/security.h index 21cf70346b33..de3af33e6ff5 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2048,7 +2048,8 @@ static inline void security_key_post_create_or_update(struct key *keyring, #ifdef CONFIG_AUDIT #ifdef CONFIG_SECURITY -int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule); +int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule, + gfp_t gfp); int security_audit_rule_known(struct audit_krule *krule); int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule); void security_audit_rule_free(void *lsmrule); @@ -2056,7 +2057,7 @@ void security_audit_rule_free(void *lsmrule); #else static inline int security_audit_rule_init(u32 field, u32 op, char *rulestr, - void **lsmrule) + void **lsmrule, gfp_t gfp) { return 0; } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index be8c680121e4..d6ef4f4f9cba 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -529,7 +529,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, entry->rule.buflen += f_val; f->lsm_str = str; err = security_audit_rule_init(f->type, f->op, str, - (void **)&f->lsm_rule); + (void **)&f->lsm_rule, + GFP_KERNEL); /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (err == -EINVAL) { @@ -799,7 +800,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df, /* our own (refreshed) copy of lsm_rule */ ret = security_audit_rule_init(df->type, df->op, df->lsm_str, - (void **)&df->lsm_rule); + (void **)&df->lsm_rule, GFP_KERNEL); /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (ret == -EINVAL) { diff --git a/security/apparmor/audit.c b/security/apparmor/audit.c index 45beb1c5f747..6b5181c668b5 100644 --- a/security/apparmor/audit.c +++ b/security/apparmor/audit.c @@ -217,7 +217,7 @@ void aa_audit_rule_free(void *vrule) } } -int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) +int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule, gfp_t gfp) { struct aa_audit_rule *rule; @@ -230,14 +230,14 @@ int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) return -EINVAL; } - rule = kzalloc(sizeof(struct aa_audit_rule), GFP_KERNEL); + rule = kzalloc(sizeof(struct aa_audit_rule), gfp); if (!rule) return -ENOMEM; /* Currently rules are treated as coming from the root ns */ rule->label = aa_label_parse(&root_ns->unconfined->label, rulestr, - GFP_KERNEL, true, false); + gfp, true, false); if (IS_ERR(rule->label)) { int err = PTR_ERR(rule->label); aa_audit_rule_free(rule); diff --git a/security/apparmor/include/audit.h b/security/apparmor/include/audit.h index acbb03b9bd25..0c8cc86b417b 100644 --- a/security/apparmor/include/audit.h +++ b/security/apparmor/include/audit.h @@ -200,7 +200,7 @@ static inline int complain_error(int error) } void aa_audit_rule_free(void *vrule); -int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule); +int aa_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule, gfp_t gfp); int aa_audit_rule_known(struct audit_krule *rule); int aa_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule); diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 3e568126cd48..c51e24d24d1e 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -546,7 +546,7 @@ static inline void ima_free_modsig(struct modsig *modsig) #else static inline int ima_filter_rule_init(u32 field, u32 op, char *rulestr, - void **lsmrule) + void **lsmrule, gfp_t gfp) { return -EINVAL; } diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index c0556907c2e6..09da8e639239 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -401,7 +401,8 @@ static void ima_free_rule(struct ima_rule_entry *entry) kfree(entry); } -static struct ima_rule_entry *ima_lsm_copy_rule(struct ima_rule_entry *entry) +static struct ima_rule_entry *ima_lsm_copy_rule(struct ima_rule_entry *entry, + gfp_t gfp) { struct ima_rule_entry *nentry; int i; @@ -410,7 +411,7 @@ static struct ima_rule_entry *ima_lsm_copy_rule(struct ima_rule_entry *entry) * Immutable elements are copied over as pointers and data; only * lsm rules can change */ - nentry = kmemdup(entry, sizeof(*nentry), GFP_KERNEL); + nentry = kmemdup(entry, sizeof(*nentry), gfp); if (!nentry) return NULL; @@ -425,7 +426,8 @@ static struct ima_rule_entry *ima_lsm_copy_rule(struct ima_rule_entry *entry) ima_filter_rule_init(nentry->lsm[i].type, Audit_equal, nentry->lsm[i].args_p, - &nentry->lsm[i].rule); + &nentry->lsm[i].rule, + gfp); if (!nentry->lsm[i].rule) pr_warn("rule for LSM \'%s\' is undefined\n", nentry->lsm[i].args_p); @@ -438,7 +440,7 @@ static int ima_lsm_update_rule(struct ima_rule_entry *entry) int i; struct ima_rule_entry *nentry; - nentry = ima_lsm_copy_rule(entry); + nentry = ima_lsm_copy_rule(entry, GFP_KERNEL); if (!nentry) return -ENOMEM; @@ -664,7 +666,7 @@ retry: } if (rc == -ESTALE && !rule_reinitialized) { - lsm_rule = ima_lsm_copy_rule(rule); + lsm_rule = ima_lsm_copy_rule(rule, GFP_ATOMIC); if (lsm_rule) { rule_reinitialized = true; goto retry; @@ -1140,7 +1142,8 @@ static int ima_lsm_rule_init(struct ima_rule_entry *entry, entry->lsm[lsm_rule].type = audit_type; result = ima_filter_rule_init(entry->lsm[lsm_rule].type, Audit_equal, entry->lsm[lsm_rule].args_p, - &entry->lsm[lsm_rule].rule); + &entry->lsm[lsm_rule].rule, + GFP_KERNEL); if (!entry->lsm[lsm_rule].rule) { pr_warn("rule for LSM \'%s\' is undefined\n", entry->lsm[lsm_rule].args_p); diff --git a/security/security.c b/security/security.c index e5da848c50b9..e5ca08789f74 100644 --- a/security/security.c +++ b/security/security.c @@ -5332,15 +5332,17 @@ void security_key_post_create_or_update(struct key *keyring, struct key *key, * @op: rule operator * @rulestr: rule context * @lsmrule: receive buffer for audit rule struct + * @gfp: GFP flag used for kmalloc * * Allocate and initialize an LSM audit rule structure. * * Return: Return 0 if @lsmrule has been successfully set, -EINVAL in case of * an invalid rule. */ -int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule) +int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule, + gfp_t gfp) { - return call_int_hook(audit_rule_init, field, op, rulestr, lsmrule); + return call_int_hook(audit_rule_init, field, op, rulestr, lsmrule, gfp); } /** diff --git a/security/selinux/include/audit.h b/security/selinux/include/audit.h index 52aca71210b4..29c7d4c86f6d 100644 --- a/security/selinux/include/audit.h +++ b/security/selinux/include/audit.h @@ -21,12 +21,14 @@ * @op: the operator the rule uses * @rulestr: the text "target" of the rule * @rule: pointer to the new rule structure returned via this + * @gfp: GFP flag used for kmalloc * * Returns 0 if successful, -errno if not. On success, the rule structure * will be allocated internally. The caller must free this structure with * selinux_audit_rule_free() after use. */ -int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **rule); +int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **rule, + gfp_t gfp); /** * selinux_audit_rule_free - free an selinux audit rule structure. diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index f20e1968b7f7..e33e55384b75 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -3507,7 +3507,8 @@ void selinux_audit_rule_free(void *vrule) } } -int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) +int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule, + gfp_t gfp) { struct selinux_state *state = &selinux_state; struct selinux_policy *policy; @@ -3548,7 +3549,7 @@ int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) return -EINVAL; } - tmprule = kzalloc(sizeof(struct selinux_audit_rule), GFP_KERNEL); + tmprule = kzalloc(sizeof(struct selinux_audit_rule), gfp); if (!tmprule) return -ENOMEM; context_init(&tmprule->au_ctxt); diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 70ba2841e181..f5cbec1e6a92 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -4693,11 +4693,13 @@ static int smack_post_notification(const struct cred *w_cred, * @op: required testing operator (=, !=, >, <, ...) * @rulestr: smack label to be audited * @vrule: pointer to save our own audit rule representation + * @gfp: type of the memory for the allocation * * Prepare to audit cases where (@field @op @rulestr) is true. * The label to be audited is created if necessay. */ -static int smack_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) +static int smack_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule, + gfp_t gfp) { struct smack_known *skp; char **rule = (char **)vrule; -- cgit v1.2.3 From f4a1254f2a076afb0edd473589bf40f9b4d36b41 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 14 Jun 2024 01:04:29 +0100 Subject: io_uring: fix cancellation overwriting req->flags Only the current owner of a request is allowed to write into req->flags. Hence, the cancellation path should never touch it. Add a new field instead of the flag, move it into the 3rd cache line because it should always be initialised. poll_refs can move further as polling is an involved process anyway. It's a minimal patch, in the future we can and should find a better place for it and remove now unused REQ_F_CANCEL_SEQ. Fixes: 521223d7c229f ("io_uring/cancel: don't default to setting req->work.cancel_seq") Cc: stable@vger.kernel.org Reported-by: Li Shi Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6827b129f8f0ad76fa9d1f0a773de938b240ffab.1718323430.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 ++- io_uring/cancel.h | 4 ++-- io_uring/io_uring.c | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 7a6b190c7da7..b48570eaa449 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -648,7 +648,7 @@ struct io_kiocb { struct io_rsrc_node *rsrc_node; atomic_t refs; - atomic_t poll_refs; + bool cancel_seq_set; struct io_task_work io_task_work; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ struct hlist_node hash_node; @@ -657,6 +657,7 @@ struct io_kiocb { /* opcode allocated if it needs to store data for async defer */ void *async_data; /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ + atomic_t poll_refs; struct io_kiocb *link; /* custom credentials, valid IFF REQ_F_CREDS is set */ const struct cred *creds; diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 76b32e65c03c..b33995e00ba9 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -27,10 +27,10 @@ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence) { - if ((req->flags & REQ_F_CANCEL_SEQ) && sequence == req->work.cancel_seq) + if (req->cancel_seq_set && sequence == req->work.cancel_seq) return true; - req->flags |= REQ_F_CANCEL_SEQ; + req->cancel_seq_set = true; req->work.cancel_seq = sequence; return false; } diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 816e93e7f949..154b25b8a613 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2058,6 +2058,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->file = NULL; req->rsrc_node = NULL; req->task = current; + req->cancel_seq_set = false; if (unlikely(opcode >= IORING_OP_LAST)) { req->opcode = 0; -- cgit v1.2.3 From 0a5d3258d7c97295a89d22e54733b54aacb62562 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 3 Jun 2024 22:23:15 -0700 Subject: compiler_types.h: Define __retain for __attribute__((__retain__)) Some code includes the __used macro to prevent functions and data from being optimized out. This macro implements __attribute__((__used__)), which operates at the compiler and IR-level, and so still allows a linker to remove objects intended to be kept. Compilers supporting __attribute__((__retain__)) can address this gap by setting the flag SHF_GNU_RETAIN on the section of a function/variable, indicating to the linker the object should be retained. This attribute is available since gcc 11, clang 13, and binutils 2.36. Provide a __retain macro implementing __attribute__((__retain__)), whose first user will be the '__bpf_kfunc' tag. [ Additional remark from discussion: Why is CONFIG_LTO_CLANG added here? The __used macro permits garbage collection at section level, so CLANG_LTO_CLANG without CONFIG_LD_DEAD_CODE_DATA_ELIMINATION should not change final section dynamics? The conditional guard was included to ensure consistent behaviour between __retain and other features forcing split sections. In particular, the same guard is used in vmlinux.lds.h to merge split sections where needed. For example, using __retain in LLVM builds without CONFIG_LTO was failing CI tests on kernel-patches/bpf because the kernel didn't boot properly. And in further testing, the kernel had no issues loading BPF kfunc modules with such split sections, so the module (partial) linking scripts were left alone. ] Signed-off-by: Tony Ambardar Signed-off-by: Daniel Borkmann Cc: Yonghong Song Link: https://lore.kernel.org/bpf/ZlmGoT9KiYLZd91S@krava/T/ Link: https://lore.kernel.org/bpf/b31bca5a5e6765a0f32cc8c19b1d9cdbfaa822b5.1717477560.git.Tony.Ambardar@gmail.com --- include/linux/compiler_types.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 93600de3800b..f14c275950b5 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -143,6 +143,29 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __preserve_most #endif +/* + * Annotating a function/variable with __retain tells the compiler to place + * the object in its own section and set the flag SHF_GNU_RETAIN. This flag + * instructs the linker to retain the object during garbage-cleanup or LTO + * phases. + * + * Note that the __used macro is also used to prevent functions or data + * being optimized out, but operates at the compiler/IR-level and may still + * allow unintended removal of objects during linking. + * + * Optional: only supported since gcc >= 11, clang >= 13 + * + * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-retain-function-attribute + * clang: https://clang.llvm.org/docs/AttributeReference.html#retain + */ +#if __has_attribute(__retain__) && \ + (defined(CONFIG_LD_DEAD_CODE_DATA_ELIMINATION) || \ + defined(CONFIG_LTO_CLANG)) +# define __retain __attribute__((__retain__)) +#else +# define __retain +#endif + /* Compiler specific macros. */ #ifdef __clang__ #include -- cgit v1.2.3 From 7bdcedd5c8fb88e7176b93812b139eca5fe0aa46 Mon Sep 17 00:00:00 2001 From: Tony Ambardar Date: Mon, 3 Jun 2024 22:23:16 -0700 Subject: bpf: Harden __bpf_kfunc tag against linker kfunc removal BPF kfuncs are often not directly referenced and may be inadvertently removed by optimization steps during kernel builds, thus the __bpf_kfunc tag mitigates against this removal by including the __used macro. However, this macro alone does not prevent removal during linking, and may still yield build warnings (e.g. on mips64el): [...] LD vmlinux BTFIDS vmlinux WARN: resolve_btfids: unresolved symbol bpf_verify_pkcs7_signature WARN: resolve_btfids: unresolved symbol bpf_lookup_user_key WARN: resolve_btfids: unresolved symbol bpf_lookup_system_key WARN: resolve_btfids: unresolved symbol bpf_key_put WARN: resolve_btfids: unresolved symbol bpf_iter_task_next WARN: resolve_btfids: unresolved symbol bpf_iter_css_task_new WARN: resolve_btfids: unresolved symbol bpf_get_file_xattr WARN: resolve_btfids: unresolved symbol bpf_ct_insert_entry WARN: resolve_btfids: unresolved symbol bpf_cgroup_release WARN: resolve_btfids: unresolved symbol bpf_cgroup_from_id WARN: resolve_btfids: unresolved symbol bpf_cgroup_acquire WARN: resolve_btfids: unresolved symbol bpf_arena_free_pages NM System.map SORTTAB vmlinux OBJCOPY vmlinux.32 [...] Update the __bpf_kfunc tag to better guard against linker optimization by including the new __retain compiler macro, which fixes the warnings above. Verify the __retain macro with readelf by checking object flags for 'R': $ readelf -Wa kernel/trace/bpf_trace.o Section Headers: [Nr] Name Type Address Off Size ES Flg Lk Inf Al [...] [178] .text.bpf_key_put PROGBITS 00000000 6420 0050 00 AXR 0 0 8 [...] Key to Flags: [...] R (retain), D (mbind), p (processor specific) Fixes: 57e7c169cd6a ("bpf: Add __bpf_kfunc tag for marking kernel functions as kfuncs") Reported-by: kernel test robot Signed-off-by: Tony Ambardar Signed-off-by: Daniel Borkmann Tested-by: Jiri Olsa Reviewed-by: Jiri Olsa Cc: Yonghong Song Closes: https://lore.kernel.org/r/202401211357.OCX9yllM-lkp@intel.com/ Link: https://lore.kernel.org/bpf/ZlmGoT9KiYLZd91S@krava/T/ Link: https://lore.kernel.org/bpf/e9c64e9b5c073dabd457ff45128aabcab7630098.1717477560.git.Tony.Ambardar@gmail.com --- include/linux/btf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index f9e56fd12a9f..7c3e40c3295e 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -82,7 +82,7 @@ * as to avoid issues such as the compiler inlining or eliding either a static * kfunc, or a global kfunc in an LTO build. */ -#define __bpf_kfunc __used noinline +#define __bpf_kfunc __used __retain noinline #define __bpf_kfunc_start_defs() \ __diag_push(); \ -- cgit v1.2.3 From 384a746bb55960aa5ffb3a67de08f11fc2f51042 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 5 Jun 2024 11:17:10 +0200 Subject: Revert "mm: init_mlocked_on_free_v3" There was insufficient review and no agreement that this is the right approach. There are serious flaws with the implementation that make processes using mlock() not even work with simple fork() [1] and we get reliable crashes when rebooting. Further, simply because we might be unmapping a single PTE of a large mlocked folio, we shouldn't zero out the whole folio. ... especially because the code can also *corrupt* urelated memory because kernel_init_pages(page, folio_nr_pages(folio)); Could end up writing outside of the actual folio if we work with a tail page. Let's revert it. Once there is agreement that this is the right approach, the issues were fixed and there was reasonable review and proper testing, we can consider it again. [1] https://lkml.kernel.org/r/4da9da2f-73e4-45fd-b62f-a8a513314057@redhat.com Link: https://lkml.kernel.org/r/20240605091710.38961-1-david@redhat.com Fixes: ba42b524a040 ("mm: init_mlocked_on_free_v3") Signed-off-by: David Hildenbrand Reported-by: David Wang <00107082@163.com> Closes: https://lore.kernel.org/lkml/20240528151340.4282-1-00107082@163.com/ Reported-by: Lance Yang Closes: https://lkml.kernel.org/r/20240601140917.43562-1-ioworker0@gmail.com Acked-by: Lance Yang Cc: York Jasper Niebuhr Cc: Matthew Wilcox (Oracle) Cc: Kees Cook Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 6 ---- include/linux/mm.h | 9 +----- mm/internal.h | 1 - mm/memory.c | 6 ---- mm/mm_init.c | 43 ++++--------------------- mm/page_alloc.c | 2 +- security/Kconfig.hardening | 15 --------- 7 files changed, 9 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b600df82669d..11e57ba2985c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2192,12 +2192,6 @@ Format: 0 | 1 Default set by CONFIG_INIT_ON_FREE_DEFAULT_ON. - init_mlocked_on_free= [MM] Fill freed userspace memory with zeroes if - it was mlock'ed and not explicitly munlock'ed - afterwards. - Format: 0 | 1 - Default set by CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON - init_pkru= [X86] Specify the default memory protection keys rights register contents for all processes. 0x55555554 by default (disallow access to all but pkey 0). Can diff --git a/include/linux/mm.h b/include/linux/mm.h index 9849dfda44d4..9a5652c5fadd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3776,14 +3776,7 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); static inline bool want_init_on_free(void) { return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, - &init_on_free); -} - -DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, init_mlocked_on_free); -static inline bool want_init_mlocked_on_free(void) -{ - return static_branch_maybe(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, - &init_mlocked_on_free); + &init_on_free); } extern bool _debug_pagealloc_enabled_early; diff --git a/mm/internal.h b/mm/internal.h index b2c75b12014e..c72c306761a4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -588,7 +588,6 @@ extern void __putback_isolated_page(struct page *page, unsigned int order, extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order); -extern void kernel_init_pages(struct page *page, int numpages); /* * This will have no effect, other than possibly generating a warning, if the diff --git a/mm/memory.c b/mm/memory.c index 0f47a533014e..2bc8032a30a2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1507,12 +1507,6 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb, if (unlikely(folio_mapcount(folio) < 0)) print_bad_pte(vma, addr, ptent, page); } - - if (want_init_mlocked_on_free() && folio_test_mlocked(folio) && - !delay_rmap && folio_test_anon(folio)) { - kernel_init_pages(page, folio_nr_pages(folio)); - } - if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) { *force_flush = true; *force_break = true; diff --git a/mm/mm_init.c b/mm/mm_init.c index f72b852bd5b8..3ec04933f7fd 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2523,9 +2523,6 @@ EXPORT_SYMBOL(init_on_alloc); DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); EXPORT_SYMBOL(init_on_free); -DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, init_mlocked_on_free); -EXPORT_SYMBOL(init_mlocked_on_free); - static bool _init_on_alloc_enabled_early __read_mostly = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); static int __init early_init_on_alloc(char *buf) @@ -2543,14 +2540,6 @@ static int __init early_init_on_free(char *buf) } early_param("init_on_free", early_init_on_free); -static bool _init_mlocked_on_free_enabled_early __read_mostly - = IS_ENABLED(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON); -static int __init early_init_mlocked_on_free(char *buf) -{ - return kstrtobool(buf, &_init_mlocked_on_free_enabled_early); -} -early_param("init_mlocked_on_free", early_init_mlocked_on_free); - DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled); /* @@ -2578,21 +2567,12 @@ static void __init mem_debugging_and_hardening_init(void) } #endif - if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early || - _init_mlocked_on_free_enabled_early) && + if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) && page_poisoning_requested) { pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " - "will take precedence over init_on_alloc, init_on_free " - "and init_mlocked_on_free\n"); + "will take precedence over init_on_alloc and init_on_free\n"); _init_on_alloc_enabled_early = false; _init_on_free_enabled_early = false; - _init_mlocked_on_free_enabled_early = false; - } - - if (_init_mlocked_on_free_enabled_early && _init_on_free_enabled_early) { - pr_info("mem auto-init: init_on_free is on, " - "will take precedence over init_mlocked_on_free\n"); - _init_mlocked_on_free_enabled_early = false; } if (_init_on_alloc_enabled_early) { @@ -2609,17 +2589,9 @@ static void __init mem_debugging_and_hardening_init(void) static_branch_disable(&init_on_free); } - if (_init_mlocked_on_free_enabled_early) { - want_check_pages = true; - static_branch_enable(&init_mlocked_on_free); - } else { - static_branch_disable(&init_mlocked_on_free); - } - - if (IS_ENABLED(CONFIG_KMSAN) && (_init_on_alloc_enabled_early || - _init_on_free_enabled_early || _init_mlocked_on_free_enabled_early)) - pr_info("mem auto-init: please make sure init_on_alloc, init_on_free and " - "init_mlocked_on_free are disabled when running KMSAN\n"); + if (IS_ENABLED(CONFIG_KMSAN) && + (_init_on_alloc_enabled_early || _init_on_free_enabled_early)) + pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n"); #ifdef CONFIG_DEBUG_PAGEALLOC if (debug_pagealloc_enabled()) { @@ -2658,10 +2630,9 @@ static void __init report_meminit(void) else stack = "off"; - pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s, mlocked free:%s\n", + pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n", stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off", - want_init_on_free() ? "on" : "off", - want_init_mlocked_on_free() ? "on" : "off"); + want_init_on_free() ? "on" : "off"); if (want_init_on_free()) pr_info("mem auto-init: clearing system memory may take some time...\n"); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 222299b5c0e6..7300aa9f14b0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1016,7 +1016,7 @@ static inline bool should_skip_kasan_poison(struct page *page) return page_kasan_tag(page) == KASAN_TAG_KERNEL; } -void kernel_init_pages(struct page *page, int numpages) +static void kernel_init_pages(struct page *page, int numpages) { int i; diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening index effbf5982be1..2cff851ebfd7 100644 --- a/security/Kconfig.hardening +++ b/security/Kconfig.hardening @@ -255,21 +255,6 @@ config INIT_ON_FREE_DEFAULT_ON touching "cold" memory areas. Most cases see 3-5% impact. Some synthetic workloads have measured as high as 8%. -config INIT_MLOCKED_ON_FREE_DEFAULT_ON - bool "Enable mlocked memory zeroing on free" - depends on !KMSAN - help - This config has the effect of setting "init_mlocked_on_free=1" - on the kernel command line. If it is enabled, all mlocked process - memory is zeroed when freed. This restriction to mlocked memory - improves performance over "init_on_free" but can still be used to - protect confidential data like key material from content exposures - to other processes, as well as live forensics and cold boot attacks. - Any non-mlocked memory is not cleared before it is reassigned. This - configuration can be overwritten by setting "init_mlocked_on_free=0" - on the command line. The "init_on_free" boot option takes - precedence over "init_mlocked_on_free". - config CC_HAS_ZERO_CALL_USED_REGS def_bool $(cc-option,-fzero-call-used-regs=used-gpr) # https://github.com/ClangBuiltLinux/linux/issues/1766 -- cgit v1.2.3 From a273559e9eb68cb58c57803d76a1622b8324a878 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 1 Jun 2024 16:38:40 -0700 Subject: lib/alloc_tag: fix RCU imbalance in pgalloc_tag_get() put_page_tag_ref() should be called only when get_page_tag_ref() returns a valid reference because only in that case get_page_tag_ref() enters RCU read section while put_page_tag_ref() will call rcu_read_unlock() even if the provided reference is NULL. Fix pgalloc_tag_get() which does not follow this rule causing RCU imbalance. Add a warning in put_page_tag_ref() to catch any future mistakes. Link: https://lkml.kernel.org/r/20240601233840.617458-1-surenb@google.com Fixes: cc92eba1c88b ("mm: fix non-compound multi-order memory accounting in __free_pages") Signed-off-by: Suren Baghdasaryan Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202405271029.6d2f9c4c-lkp@intel.com Acked-by: Vlastimil Babka Cc: Kent Overstreet Cc: Kees Cook Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 86ba5d33e43b..9cacadbd61f8 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -37,6 +37,9 @@ static inline union codetag_ref *get_page_tag_ref(struct page *page) static inline void put_page_tag_ref(union codetag_ref *ref) { + if (WARN_ON(!ref)) + return; + page_ext_put(page_ext_from_codetag_ref(ref)); } @@ -102,9 +105,11 @@ static inline struct alloc_tag *pgalloc_tag_get(struct page *page) union codetag_ref *ref = get_page_tag_ref(page); alloc_tag_sub_check(ref); - if (ref && ref->ct) - tag = ct_to_alloc_tag(ref->ct); - put_page_tag_ref(ref); + if (ref) { + if (ref->ct) + tag = ct_to_alloc_tag(ref->ct); + put_page_tag_ref(ref); + } } return tag; -- cgit v1.2.3 From 6a50c9b512f7734bc356f4bd47885a6f7c98491a Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Fri, 7 Jun 2024 17:40:48 +0800 Subject: mm: huge_memory: fix misused mapping_large_folio_support() for anon folios When I did a large folios split test, a WARNING "[ 5059.122759][ T166] Cannot split file folio to non-0 order" was triggered. But the test cases are only for anonmous folios. while mapping_large_folio_support() is only reasonable for page cache folios. In split_huge_page_to_list_to_order(), the folio passed to mapping_large_folio_support() maybe anonmous folio. The folio_test_anon() check is missing. So the split of the anonmous THP is failed. This is also the same for shmem_mapping(). We'd better add a check for both. But the shmem_mapping() in __split_huge_page() is not involved, as for anonmous folios, the end parameter is set to -1, so (head[i].index >= end) is always false. shmem_mapping() is not called. Also add a VM_WARN_ON_ONCE() in mapping_large_folio_support() for anon mapping, So we can detect the wrong use more easily. THP folios maybe exist in the pagecache even the file system doesn't support large folio, it is because when CONFIG_TRANSPARENT_HUGEPAGE is enabled, khugepaged will try to collapse read-only file-backed pages to THP. But the mapping does not actually support multi order large folios properly. Using /sys/kernel/debug/split_huge_pages to verify this, with this patch, large anon THP is successfully split and the warning is ceased. Link: https://lkml.kernel.org/r/202406071740485174hcFl7jRxncsHDtI-Pz-o@zte.com.cn Fixes: c010d47f107f ("mm: thp: split huge page to any lower order pages") Reviewed-by: Barry Song Reviewed-by: Zi Yan Acked-by: David Hildenbrand Signed-off-by: Ran Xiaokai Cc: Michal Hocko Cc: xu xin Cc: Yang Yang Cc: Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 4 ++++ mm/huge_memory.c | 28 +++++++++++++++++----------- 2 files changed, 21 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ee633712bba0..59f1df0cde5a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -381,6 +381,10 @@ static inline void mapping_set_large_folios(struct address_space *mapping) */ static inline bool mapping_large_folio_support(struct address_space *mapping) { + /* AS_LARGE_FOLIO_SUPPORT is only reasonable for pagecache folios */ + VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON, + "Anonymous mapping always supports large folio"); + return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 89932fd0f62e..db7946a0a28c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3009,30 +3009,36 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, if (new_order >= folio_order(folio)) return -EINVAL; - /* Cannot split anonymous THP to order-1 */ - if (new_order == 1 && folio_test_anon(folio)) { - VM_WARN_ONCE(1, "Cannot split to order-1 folio"); - return -EINVAL; - } - - if (new_order) { - /* Only swapping a whole PMD-mapped folio is supported */ - if (folio_test_swapcache(folio)) + if (folio_test_anon(folio)) { + /* order-1 is not supported for anonymous THP. */ + if (new_order == 1) { + VM_WARN_ONCE(1, "Cannot split to order-1 folio"); return -EINVAL; + } + } else if (new_order) { /* Split shmem folio to non-zero order not supported */ if (shmem_mapping(folio->mapping)) { VM_WARN_ONCE(1, "Cannot split shmem folio to non-0 order"); return -EINVAL; } - /* No split if the file system does not support large folio */ - if (!mapping_large_folio_support(folio->mapping)) { + /* + * No split if the file system does not support large folio. + * Note that we might still have THPs in such mappings due to + * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping + * does not actually support large folios properly. + */ + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && + !mapping_large_folio_support(folio->mapping)) { VM_WARN_ONCE(1, "Cannot split file folio to non-0 order"); return -EINVAL; } } + /* Only swapping a whole PMD-mapped folio is supported */ + if (folio_test_swapcache(folio) && new_order) + return -EINVAL; is_hzp = is_huge_zero_folio(folio); if (is_hzp) { -- cgit v1.2.3 From 01c8f9806bde438ca1c8cbbc439f0a14a6694f6c Mon Sep 17 00:00:00 2001 From: Aleksandr Nogikh Date: Tue, 11 Jun 2024 15:32:29 +0200 Subject: kcov: don't lose track of remote references during softirqs In kcov_remote_start()/kcov_remote_stop(), we swap the previous KCOV metadata of the current task into a per-CPU variable. However, the kcov_mode_enabled(mode) check is not sufficient in the case of remote KCOV coverage: current->kcov_mode always remains KCOV_MODE_DISABLED for remote KCOV objects. If the original task that has invoked the KCOV_REMOTE_ENABLE ioctl happens to get interrupted and kcov_remote_start() is called, it ultimately leads to kcov_remote_stop() NOT restoring the original KCOV reference. So when the task exits, all registered remote KCOV handles remain active forever. The most uncomfortable effect (at least for syzkaller) is that the bug prevents the reuse of the same /sys/kernel/debug/kcov descriptor. If we obtain it in the parent process and then e.g. drop some capabilities and continuously fork to execute individual programs, at some point current->kcov of the forked process is lost, kcov_task_exit() takes no action, and all KCOV_REMOTE_ENABLE ioctls calls from subsequent forks fail. And, yes, the efficiency is also affected if we keep on losing remote kcov objects. a) kcov_remote_map keeps on growing forever. b) (If I'm not mistaken), we're also not freeing the memory referenced by kcov->area. Fix it by introducing a special kcov_mode that is assigned to the task that owns a KCOV remote object. It makes kcov_mode_enabled() return true and yet does not trigger coverage collection in __sanitizer_cov_trace_pc() and write_comp_data(). [nogikh@google.com: replace WRITE_ONCE() with an ordinary assignment] Link: https://lkml.kernel.org/r/20240614171221.2837584-1-nogikh@google.com Link: https://lkml.kernel.org/r/20240611133229.527822-1-nogikh@google.com Fixes: 5ff3b30ab57d ("kcov: collect coverage from interrupts") Signed-off-by: Aleksandr Nogikh Reviewed-by: Dmitry Vyukov Reviewed-by: Andrey Konovalov Tested-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Arnd Bergmann Cc: Marco Elver Cc: Signed-off-by: Andrew Morton --- include/linux/kcov.h | 2 ++ kernel/kcov.c | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kcov.h b/include/linux/kcov.h index b851ba415e03..3b479a3d235a 100644 --- a/include/linux/kcov.h +++ b/include/linux/kcov.h @@ -21,6 +21,8 @@ enum kcov_mode { KCOV_MODE_TRACE_PC = 2, /* Collecting comparison operands mode. */ KCOV_MODE_TRACE_CMP = 3, + /* The process owns a KCOV remote reference. */ + KCOV_MODE_REMOTE = 4, }; #define KCOV_IN_CTXSW (1 << 30) diff --git a/kernel/kcov.c b/kernel/kcov.c index c3124f6d5536..f0a69d402066 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -632,6 +632,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, return -EINVAL; kcov->mode = mode; t->kcov = kcov; + t->kcov_mode = KCOV_MODE_REMOTE; kcov->t = t; kcov->remote = true; kcov->remote_size = remote_arg->area_size; -- cgit v1.2.3 From 8043832e2a123fd9372007a29192f2f3ba328cd6 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Fri, 14 Jun 2024 11:05:43 +0300 Subject: memblock: use numa_valid_node() helper to check for invalid node ID Introduce numa_valid_node(nid) that verifies that nid is a valid node ID and use that instead of comparing nid parameter with either NUMA_NO_NODE or MAX_NUMNODES. This makes the checks for valid node IDs consistent and more robust and allows to get rid of multiple WARNings. Suggested-by: Linus Torvalds Signed-off-by: Mike Rapoport (IBM) --- include/linux/numa.h | 5 +++++ mm/memblock.c | 28 +++++++--------------------- 2 files changed, 12 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/numa.h b/include/linux/numa.h index 1d43371fafd2..eb19503604fe 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -15,6 +15,11 @@ #define NUMA_NO_NODE (-1) #define NUMA_NO_MEMBLK (-1) +static inline bool numa_valid_node(int nid) +{ + return nid >= 0 && nid < MAX_NUMNODES; +} + /* optionally keep NUMA memory info available post init */ #ifdef CONFIG_NUMA_KEEP_MEMINFO #define __initdata_or_meminfo diff --git a/mm/memblock.c b/mm/memblock.c index 08e9806b1cf9..e81fb68f7f88 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -754,7 +754,7 @@ bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_byt /* calculate lose page */ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { - if (nid == NUMA_NO_NODE) + if (!numa_valid_node(nid)) nr_pages += end_pfn - start_pfn; } @@ -1061,7 +1061,7 @@ static bool should_skip_region(struct memblock_type *type, return false; /* only memory regions are associated with nodes, check it */ - if (nid != NUMA_NO_NODE && nid != m_nid) + if (numa_valid_node(nid) && nid != m_nid) return true; /* skip hotpluggable memory regions if needed */ @@ -1118,10 +1118,6 @@ void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags, int idx_a = *idx & 0xffffffff; int idx_b = *idx >> 32; - if (WARN_ONCE(nid == MAX_NUMNODES, - "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - for (; idx_a < type_a->cnt; idx_a++) { struct memblock_region *m = &type_a->regions[idx_a]; @@ -1215,9 +1211,6 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, int idx_a = *idx & 0xffffffff; int idx_b = *idx >> 32; - if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - if (*idx == (u64)ULLONG_MAX) { idx_a = type_a->cnt - 1; if (type_b != NULL) @@ -1303,7 +1296,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size)) continue; - if (nid == MAX_NUMNODES || nid == r_nid) + if (!numa_valid_node(nid) || nid == r_nid) break; } if (*idx >= type->cnt) { @@ -1339,10 +1332,6 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, int start_rgn, end_rgn; int i, ret; - if (WARN_ONCE(nid == MAX_NUMNODES, - "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); if (ret) return ret; @@ -1452,9 +1441,6 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, enum memblock_flags flags = choose_memblock_flags(); phys_addr_t found; - if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) - nid = NUMA_NO_NODE; - if (!align) { /* Can't use WARNs this early in boot on powerpc */ dump_stack(); @@ -1467,7 +1453,7 @@ again: if (found && !memblock_reserve(found, size)) goto done; - if (nid != NUMA_NO_NODE && !exact_nid) { + if (numa_valid_node(nid) && !exact_nid) { found = memblock_find_in_range_node(size, align, start, end, NUMA_NO_NODE, flags); @@ -1987,7 +1973,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) end = base + size - 1; flags = rgn->flags; #ifdef CONFIG_NUMA - if (memblock_get_region_node(rgn) != MAX_NUMNODES) + if (numa_valid_node(memblock_get_region_node(rgn))) snprintf(nid_buf, sizeof(nid_buf), " on node %d", memblock_get_region_node(rgn)); #endif @@ -2181,7 +2167,7 @@ static void __init memmap_init_reserved_pages(void) start = region->base; end = start + region->size; - if (nid == NUMA_NO_NODE || nid >= MAX_NUMNODES) + if (!numa_valid_node(nid)) nid = early_pfn_to_nid(PFN_DOWN(start)); reserve_bootmem_region(start, end, nid); @@ -2272,7 +2258,7 @@ static int memblock_debug_show(struct seq_file *m, void *private) seq_printf(m, "%4d: ", i); seq_printf(m, "%pa..%pa ", ®->base, &end); - if (nid != MAX_NUMNODES) + if (numa_valid_node(nid)) seq_printf(m, "%4d ", nid); else seq_printf(m, "%4c ", 'x'); -- cgit v1.2.3 From d6a711a898672dd873aab3844f754a3ca40723a5 Mon Sep 17 00:00:00 2001 From: Patrice Chotard Date: Tue, 18 Jun 2024 15:29:51 +0200 Subject: spi: Fix OCTAL mode support Add OCTAL mode support. Issue detected using "--octal" spidev_test's option. Signed-off-by: Patrice Chotard Link: https://msgid.link/r/20240618132951.2743935-4-patrice.chotard@foss.st.com Signed-off-by: Mark Brown --- drivers/spi/spi.c | 6 ++++-- include/linux/spi/spi.h | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 9bc9fd10d538..9da736d51a2b 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -4156,7 +4156,8 @@ static int __spi_validate(struct spi_device *spi, struct spi_message *message) return -EINVAL; if (xfer->tx_nbits != SPI_NBITS_SINGLE && xfer->tx_nbits != SPI_NBITS_DUAL && - xfer->tx_nbits != SPI_NBITS_QUAD) + xfer->tx_nbits != SPI_NBITS_QUAD && + xfer->tx_nbits != SPI_NBITS_OCTAL) return -EINVAL; if ((xfer->tx_nbits == SPI_NBITS_DUAL) && !(spi->mode & (SPI_TX_DUAL | SPI_TX_QUAD))) @@ -4171,7 +4172,8 @@ static int __spi_validate(struct spi_device *spi, struct spi_message *message) return -EINVAL; if (xfer->rx_nbits != SPI_NBITS_SINGLE && xfer->rx_nbits != SPI_NBITS_DUAL && - xfer->rx_nbits != SPI_NBITS_QUAD) + xfer->rx_nbits != SPI_NBITS_QUAD && + xfer->rx_nbits != SPI_NBITS_OCTAL) return -EINVAL; if ((xfer->rx_nbits == SPI_NBITS_DUAL) && !(spi->mode & (SPI_RX_DUAL | SPI_RX_QUAD))) diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index e8e1e798924f..98fdef6e28f2 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -1085,12 +1085,13 @@ struct spi_transfer { unsigned dummy_data:1; unsigned cs_off:1; unsigned cs_change:1; - unsigned tx_nbits:3; - unsigned rx_nbits:3; + unsigned tx_nbits:4; + unsigned rx_nbits:4; unsigned timestamped:1; #define SPI_NBITS_SINGLE 0x01 /* 1-bit transfer */ #define SPI_NBITS_DUAL 0x02 /* 2-bit transfer */ #define SPI_NBITS_QUAD 0x04 /* 4-bit transfer */ +#define SPI_NBITS_OCTAL 0x08 /* 8-bit transfer */ u8 bits_per_word; struct spi_delay delay; struct spi_delay cs_change_delay; -- cgit v1.2.3 From 158ed777e330e9bf6bd592daaf1e860d965ec8b5 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Tue, 30 Apr 2024 11:43:16 +0100 Subject: firmware: qcom: scm: Add gpu_init_regs call This will used by drm/msm to initialize GPU registers that Qualcomm's firmware doesn't make writeable to the kernel. Reviewed-by: Dmitry Baryshkov Signed-off-by: Connor Abbott Reviewed-by: Konrad Dybcio Acked-by: Bjorn Andersson Link: https://lore.kernel.org/r/20240430-a750-raytracing-v3-2-7f57c5ac082d@gmail.com Signed-off-by: Bjorn Andersson --- drivers/firmware/qcom/qcom_scm.c | 14 ++++++++++++++ drivers/firmware/qcom/qcom_scm.h | 3 +++ include/linux/firmware/qcom/qcom_scm.h | 23 +++++++++++++++++++++++ 3 files changed, 40 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/qcom/qcom_scm.c b/drivers/firmware/qcom/qcom_scm.c index 68f4df7e6c3c..0f2e628f5cac 100644 --- a/drivers/firmware/qcom/qcom_scm.c +++ b/drivers/firmware/qcom/qcom_scm.c @@ -1394,6 +1394,20 @@ int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val, } EXPORT_SYMBOL_GPL(qcom_scm_lmh_dcvsh); +int qcom_scm_gpu_init_regs(u32 gpu_req) +{ + struct qcom_scm_desc desc = { + .svc = QCOM_SCM_SVC_GPU, + .cmd = QCOM_SCM_SVC_GPU_INIT_REGS, + .arginfo = QCOM_SCM_ARGS(1), + .args[0] = gpu_req, + .owner = ARM_SMCCC_OWNER_SIP, + }; + + return qcom_scm_call(__scm->dev, &desc, NULL); +} +EXPORT_SYMBOL_GPL(qcom_scm_gpu_init_regs); + static int qcom_scm_find_dload_address(struct device *dev, u64 *addr) { struct device_node *tcsr; diff --git a/drivers/firmware/qcom/qcom_scm.h b/drivers/firmware/qcom/qcom_scm.h index 4532907e8489..484e030bcac9 100644 --- a/drivers/firmware/qcom/qcom_scm.h +++ b/drivers/firmware/qcom/qcom_scm.h @@ -138,6 +138,9 @@ int scm_legacy_call(struct device *dev, const struct qcom_scm_desc *desc, #define QCOM_SCM_WAITQ_RESUME 0x02 #define QCOM_SCM_WAITQ_GET_WQ_CTX 0x03 +#define QCOM_SCM_SVC_GPU 0x28 +#define QCOM_SCM_SVC_GPU_INIT_REGS 0x01 + /* common error codes */ #define QCOM_SCM_V2_EBUSY -12 #define QCOM_SCM_ENOMEM -5 diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h index aaa19f93ac43..a221a643dc12 100644 --- a/include/linux/firmware/qcom/qcom_scm.h +++ b/include/linux/firmware/qcom/qcom_scm.h @@ -115,6 +115,29 @@ int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val, int qcom_scm_lmh_profile_change(u32 profile_id); bool qcom_scm_lmh_dcvsh_available(void); +/* + * Request TZ to program set of access controlled registers necessary + * irrespective of any features + */ +#define QCOM_SCM_GPU_ALWAYS_EN_REQ BIT(0) +/* + * Request TZ to program BCL id to access controlled register when BCL is + * enabled + */ +#define QCOM_SCM_GPU_BCL_EN_REQ BIT(1) +/* + * Request TZ to program set of access controlled register for CLX feature + * when enabled + */ +#define QCOM_SCM_GPU_CLX_EN_REQ BIT(2) +/* + * Request TZ to program tsense ids to access controlled registers for reading + * gpu temperature sensors + */ +#define QCOM_SCM_GPU_TSENSE_EN_REQ BIT(3) + +int qcom_scm_gpu_init_regs(u32 gpu_req); + #ifdef CONFIG_QCOM_QSEECOM int qcom_scm_qseecom_app_get_id(const char *app_name, u32 *app_id); -- cgit v1.2.3 From 9267997fa7aa0b597e8b32cb3fdfe91be1d35a83 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Wed, 5 Jun 2024 22:10:14 +0200 Subject: soc: qcom: Move some socinfo defines to the header In preparation for parsing the chip "feature code" (FC) and "product code" (PC) (essentially the parameters that let us conclusively characterize the sillicon we're running on, including various speed bins), move the socinfo version defines to the public header. Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20240605-topic-smem_speedbin-v2-1-8989d7e3d176@linaro.org Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/socinfo.c | 8 -------- include/linux/soc/qcom/socinfo.h | 8 ++++++++ 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/soc/qcom/socinfo.c b/drivers/soc/qcom/socinfo.c index 277c07a6603d..cf4616a468f2 100644 --- a/drivers/soc/qcom/socinfo.c +++ b/drivers/soc/qcom/socinfo.c @@ -21,14 +21,6 @@ #include -/* - * SoC version type with major number in the upper 16 bits and minor - * number in the lower 16 bits. - */ -#define SOCINFO_MAJOR(ver) (((ver) >> 16) & 0xffff) -#define SOCINFO_MINOR(ver) ((ver) & 0xffff) -#define SOCINFO_VERSION(maj, min) ((((maj) & 0xffff) << 16)|((min) & 0xffff)) - /* Helper macros to create soc_id table */ #define qcom_board_id(id) QCOM_ID_ ## id, __stringify(id) #define qcom_board_id_named(id, name) QCOM_ID_ ## id, (name) diff --git a/include/linux/soc/qcom/socinfo.h b/include/linux/soc/qcom/socinfo.h index e78777bb0f4a..10e0a4c287f4 100644 --- a/include/linux/soc/qcom/socinfo.h +++ b/include/linux/soc/qcom/socinfo.h @@ -12,6 +12,14 @@ #define SMEM_SOCINFO_BUILD_ID_LENGTH 32 #define SMEM_SOCINFO_CHIP_ID_LENGTH 32 +/* + * SoC version type with major number in the upper 16 bits and minor + * number in the lower 16 bits. + */ +#define SOCINFO_MAJOR(ver) (((ver) >> 16) & 0xffff) +#define SOCINFO_MINOR(ver) ((ver) & 0xffff) +#define SOCINFO_VERSION(maj, min) ((((maj) & 0xffff) << 16)|((min) & 0xffff)) + /* Socinfo SMEM item structure */ struct socinfo { __le32 fmt; -- cgit v1.2.3 From 81bbb2b891174da9301fc0d4fe9622bd4cb6a995 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Wed, 5 Jun 2024 22:10:15 +0200 Subject: soc: qcom: smem: Add a feature code getter Recent (SM8550+ ish) Qualcomm SoCs have a new mechanism for precisely identifying the specific SKU and the precise speed bin (in the general meaning of this word, anyway): a pair of values called Product Code and Feature Code. Based on this information, we can deduce the available frequencies for things such as Adreno. In the case of Adreno specifically, Pcode is useless for non-prototype SoCs. Introduce a getter for the feature code and export it. Reviewed-by: Dmitry Baryshkov Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20240605-topic-smem_speedbin-v2-2-8989d7e3d176@linaro.org Signed-off-by: Bjorn Andersson --- drivers/soc/qcom/smem.c | 33 +++++++++++++++++++++++++++++++++ include/linux/soc/qcom/smem.h | 1 + include/linux/soc/qcom/socinfo.h | 26 ++++++++++++++++++++++++++ 3 files changed, 60 insertions(+) (limited to 'include/linux') diff --git a/drivers/soc/qcom/smem.c b/drivers/soc/qcom/smem.c index 7191fa0c087f..e40aac281b06 100644 --- a/drivers/soc/qcom/smem.c +++ b/drivers/soc/qcom/smem.c @@ -795,6 +795,39 @@ int qcom_smem_get_soc_id(u32 *id) } EXPORT_SYMBOL_GPL(qcom_smem_get_soc_id); +/** + * qcom_smem_get_feature_code() - return the feature code + * @code: On success, return the feature code here. + * + * Look up the feature code identifier from SMEM and return it. + * + * Return: 0 on success, negative errno on failure. + */ +int qcom_smem_get_feature_code(u32 *code) +{ + struct socinfo *info; + u32 raw_code; + + info = qcom_smem_get(QCOM_SMEM_HOST_ANY, SMEM_HW_SW_BUILD_ID, NULL); + if (IS_ERR(info)) + return PTR_ERR(info); + + /* This only makes sense for socinfo >= 16 */ + if (__le32_to_cpu(info->fmt) < SOCINFO_VERSION(0, 16)) + return -EOPNOTSUPP; + + raw_code = __le32_to_cpu(info->feature_code); + + /* Ensure the value makes sense */ + if (raw_code > SOCINFO_FC_INT_MAX) + raw_code = SOCINFO_FC_UNKNOWN; + + *code = raw_code; + + return 0; +} +EXPORT_SYMBOL_GPL(qcom_smem_get_feature_code); + static int qcom_smem_get_sbl_version(struct qcom_smem *smem) { struct smem_header *header; diff --git a/include/linux/soc/qcom/smem.h b/include/linux/soc/qcom/smem.h index a36a3b9d4929..0943bf419e11 100644 --- a/include/linux/soc/qcom/smem.h +++ b/include/linux/soc/qcom/smem.h @@ -13,5 +13,6 @@ int qcom_smem_get_free_space(unsigned host); phys_addr_t qcom_smem_virt_to_phys(void *p); int qcom_smem_get_soc_id(u32 *id); +int qcom_smem_get_feature_code(u32 *code); #endif diff --git a/include/linux/soc/qcom/socinfo.h b/include/linux/soc/qcom/socinfo.h index 10e0a4c287f4..608950443eee 100644 --- a/include/linux/soc/qcom/socinfo.h +++ b/include/linux/soc/qcom/socinfo.h @@ -3,6 +3,8 @@ #ifndef __QCOM_SOCINFO_H__ #define __QCOM_SOCINFO_H__ +#include + /* * SMEM item id, used to acquire handles to respective * SMEM region. @@ -82,4 +84,28 @@ struct socinfo { __le32 boot_core; }; +/* Internal feature codes */ +enum qcom_socinfo_feature_code { + /* External feature codes */ + SOCINFO_FC_UNKNOWN = 0x0, + SOCINFO_FC_AA, + SOCINFO_FC_AB, + SOCINFO_FC_AC, + SOCINFO_FC_AD, + SOCINFO_FC_AE, + SOCINFO_FC_AF, + SOCINFO_FC_AG, + SOCINFO_FC_AH, +}; + +/* Internal feature codes */ +/* Valid values: 0 <= n <= 0xf */ +#define SOCINFO_FC_Yn(n) (0xf1 + (n)) +#define SOCINFO_FC_INT_MAX SOCINFO_FC_Yn(0xf) + +/* Product codes */ +#define SOCINFO_PC_UNKNOWN 0 +#define SOCINFO_PCn(n) ((n) + 1) +#define SOCINFO_PC_RESERVE (BIT(31) - 1) + #endif -- cgit v1.2.3 From 1cbf347288702af0fe8667c0ce760afbe982a2f1 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 14 Jun 2024 11:14:18 +0300 Subject: i2c: Add nop fwnode operations Add nop variants of i2c_find_device_by_fwnode(), i2c_find_adapter_by_fwnode() and i2c_get_adapter_by_fwnode() for use without CONFIG_I2C. Signed-off-by: Sakari Ailus Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 9709537370ee..424acb98c7c2 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -960,8 +960,6 @@ int i2c_handle_smbus_host_notify(struct i2c_adapter *adap, unsigned short addr); #define builtin_i2c_driver(__i2c_driver) \ builtin_driver(__i2c_driver, i2c_add_driver) -#endif /* I2C */ - /* must call put_device() when done with returned i2c_client device */ struct i2c_client *i2c_find_device_by_fwnode(struct fwnode_handle *fwnode); @@ -971,6 +969,28 @@ struct i2c_adapter *i2c_find_adapter_by_fwnode(struct fwnode_handle *fwnode); /* must call i2c_put_adapter() when done with returned i2c_adapter device */ struct i2c_adapter *i2c_get_adapter_by_fwnode(struct fwnode_handle *fwnode); +#else /* I2C */ + +static inline struct i2c_client * +i2c_find_device_by_fwnode(struct fwnode_handle *fwnode) +{ + return NULL; +} + +static inline struct i2c_adapter * +i2c_find_adapter_by_fwnode(struct fwnode_handle *fwnode) +{ + return NULL; +} + +static inline struct i2c_adapter * +i2c_get_adapter_by_fwnode(struct fwnode_handle *fwnode) +{ + return NULL; +} + +#endif /* !I2C */ + #if IS_ENABLED(CONFIG_OF) /* must call put_device() when done with returned i2c_client device */ static inline struct i2c_client *of_find_i2c_device_by_node(struct device_node *node) -- cgit v1.2.3 From f80a55fa90fa76d01e3fffaa5d0413e522ab9a00 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 17 Jun 2024 09:27:27 +0200 Subject: nvme: fixup comment for nvme RDMA Provider Type PRTYPE is the provider type, not the QP service type. Fixes: eb793e2c9286 ("nvme.h: add NVMe over Fabrics definitions") Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 425573202295..69ac2abf8acf 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -87,8 +87,8 @@ enum { NVMF_RDMA_QPTYPE_DATAGRAM = 2, /* Reliable Datagram */ }; -/* RDMA QP Service Type codes for Discovery Log Page entry TSAS - * RDMA_QPTYPE field +/* RDMA Provider Type codes for Discovery Log Page entry TSAS + * RDMA_PRTYPE field */ enum { NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 1, /* No Provider Specified */ -- cgit v1.2.3 From 0f1f5803920d2a6b88bee950914fd37421e17170 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 17 Jun 2024 09:27:28 +0200 Subject: nvmet: make 'tsas' attribute idempotent for RDMA The RDMA transport defines values for TSAS, but it cannot be changed as we only support the 'connected' mode. So to avoid errors during reconfiguration we should allow to write the current value. Fixes: 3f123494db72 ("nvmet: make TCP sectype settable via configfs") Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/configfs.c | 39 ++++++++++++++++++++++++++++++--------- include/linux/nvme.h | 2 ++ 2 files changed, 32 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index e60224356048..685e89b35d33 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -413,25 +413,46 @@ static ssize_t nvmet_addr_tsas_show(struct config_item *item, return sprintf(page, "\n"); } +static u8 nvmet_addr_tsas_rdma_store(const char *page) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(nvmet_addr_tsas_rdma); i++) { + if (sysfs_streq(page, nvmet_addr_tsas_rdma[i].name)) + return nvmet_addr_tsas_rdma[i].type; + } + return NVMF_RDMA_QPTYPE_INVALID; +} + +static u8 nvmet_addr_tsas_tcp_store(const char *page) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(nvmet_addr_tsas_tcp); i++) { + if (sysfs_streq(page, nvmet_addr_tsas_tcp[i].name)) + return nvmet_addr_tsas_tcp[i].type; + } + return NVMF_TCP_SECTYPE_INVALID; +} + static ssize_t nvmet_addr_tsas_store(struct config_item *item, const char *page, size_t count) { struct nvmet_port *port = to_nvmet_port(item); u8 treq = nvmet_port_disc_addr_treq_mask(port); - u8 sectype; - int i; + u8 sectype, qptype; if (nvmet_is_port_enabled(port, __func__)) return -EACCES; - if (port->disc_addr.trtype != NVMF_TRTYPE_TCP) - return -EINVAL; - - for (i = 0; i < ARRAY_SIZE(nvmet_addr_tsas_tcp); i++) { - if (sysfs_streq(page, nvmet_addr_tsas_tcp[i].name)) { - sectype = nvmet_addr_tsas_tcp[i].type; + if (port->disc_addr.trtype == NVMF_TRTYPE_RDMA) { + qptype = nvmet_addr_tsas_rdma_store(page); + if (qptype == port->disc_addr.tsas.rdma.qptype) + return count; + } else if (port->disc_addr.trtype == NVMF_TRTYPE_TCP) { + sectype = nvmet_addr_tsas_tcp_store(page); + if (sectype != NVMF_TCP_SECTYPE_INVALID) goto found; - } } pr_err("Invalid value '%s' for tsas\n", page); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 69ac2abf8acf..c693ac344ec0 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -85,6 +85,7 @@ enum { enum { NVMF_RDMA_QPTYPE_CONNECTED = 1, /* Reliable Connected */ NVMF_RDMA_QPTYPE_DATAGRAM = 2, /* Reliable Datagram */ + NVMF_RDMA_QPTYPE_INVALID = 0xff, }; /* RDMA Provider Type codes for Discovery Log Page entry TSAS @@ -110,6 +111,7 @@ enum { NVMF_TCP_SECTYPE_NONE = 0, /* No Security */ NVMF_TCP_SECTYPE_TLS12 = 1, /* TLSv1.2, NVMe-oF 1.1 and NVMe-TCP 3.6.1.1 */ NVMF_TCP_SECTYPE_TLS13 = 2, /* TLSv1.3, NVMe-oF 1.1 and NVMe-TCP 3.6.1.1 */ + NVMF_TCP_SECTYPE_INVALID = 0xff, }; #define NVME_AQ_DEPTH 32 -- cgit v1.2.3 From 90c3e2bc9ecbf3210d661fadee4c947d34a88ceb Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Tue, 30 Apr 2024 11:43:16 +0100 Subject: firmware: qcom_scm: Add gpu_init_regs call This will used by drm/msm to initialize GPU registers that Qualcomm's firmware doesn't make writeable to the kernel. Reviewed-by: Dmitry Baryshkov Signed-off-by: Connor Abbott Reviewed-by: Konrad Dybcio Acked-by: Bjorn Andersson Patchwork: https://patchwork.freedesktop.org/patch/592039/ Signed-off-by: Rob Clark --- drivers/firmware/qcom/qcom_scm.c | 14 ++++++++++++++ drivers/firmware/qcom/qcom_scm.h | 3 +++ include/linux/firmware/qcom/qcom_scm.h | 23 +++++++++++++++++++++++ 3 files changed, 40 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/qcom/qcom_scm.c b/drivers/firmware/qcom/qcom_scm.c index 68f4df7e6c3c..0f2e628f5cac 100644 --- a/drivers/firmware/qcom/qcom_scm.c +++ b/drivers/firmware/qcom/qcom_scm.c @@ -1394,6 +1394,20 @@ int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val, } EXPORT_SYMBOL_GPL(qcom_scm_lmh_dcvsh); +int qcom_scm_gpu_init_regs(u32 gpu_req) +{ + struct qcom_scm_desc desc = { + .svc = QCOM_SCM_SVC_GPU, + .cmd = QCOM_SCM_SVC_GPU_INIT_REGS, + .arginfo = QCOM_SCM_ARGS(1), + .args[0] = gpu_req, + .owner = ARM_SMCCC_OWNER_SIP, + }; + + return qcom_scm_call(__scm->dev, &desc, NULL); +} +EXPORT_SYMBOL_GPL(qcom_scm_gpu_init_regs); + static int qcom_scm_find_dload_address(struct device *dev, u64 *addr) { struct device_node *tcsr; diff --git a/drivers/firmware/qcom/qcom_scm.h b/drivers/firmware/qcom/qcom_scm.h index 4532907e8489..484e030bcac9 100644 --- a/drivers/firmware/qcom/qcom_scm.h +++ b/drivers/firmware/qcom/qcom_scm.h @@ -138,6 +138,9 @@ int scm_legacy_call(struct device *dev, const struct qcom_scm_desc *desc, #define QCOM_SCM_WAITQ_RESUME 0x02 #define QCOM_SCM_WAITQ_GET_WQ_CTX 0x03 +#define QCOM_SCM_SVC_GPU 0x28 +#define QCOM_SCM_SVC_GPU_INIT_REGS 0x01 + /* common error codes */ #define QCOM_SCM_V2_EBUSY -12 #define QCOM_SCM_ENOMEM -5 diff --git a/include/linux/firmware/qcom/qcom_scm.h b/include/linux/firmware/qcom/qcom_scm.h index aaa19f93ac43..a221a643dc12 100644 --- a/include/linux/firmware/qcom/qcom_scm.h +++ b/include/linux/firmware/qcom/qcom_scm.h @@ -115,6 +115,29 @@ int qcom_scm_lmh_dcvsh(u32 payload_fn, u32 payload_reg, u32 payload_val, int qcom_scm_lmh_profile_change(u32 profile_id); bool qcom_scm_lmh_dcvsh_available(void); +/* + * Request TZ to program set of access controlled registers necessary + * irrespective of any features + */ +#define QCOM_SCM_GPU_ALWAYS_EN_REQ BIT(0) +/* + * Request TZ to program BCL id to access controlled register when BCL is + * enabled + */ +#define QCOM_SCM_GPU_BCL_EN_REQ BIT(1) +/* + * Request TZ to program set of access controlled register for CLX feature + * when enabled + */ +#define QCOM_SCM_GPU_CLX_EN_REQ BIT(2) +/* + * Request TZ to program tsense ids to access controlled registers for reading + * gpu temperature sensors + */ +#define QCOM_SCM_GPU_TSENSE_EN_REQ BIT(3) + +int qcom_scm_gpu_init_regs(u32 gpu_req); + #ifdef CONFIG_QCOM_QSEECOM int qcom_scm_qseecom_app_get_id(const char *app_name, u32 *app_id); -- cgit v1.2.3 From 06efa5f30c28eaf237247ca8c4cb46eb62cb6bd9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 22 Jun 2024 21:38:58 -0400 Subject: closures: closure_get_not_zero(), closure_return_sync() Provide new primitives for solving a lifetime issue with bcachefs btree_trans objects. closure_sync_return(): like closure_sync(), wait synchronously for any outstanding gets. like closure_return, the closure is considered "finished" and the ref left at 0. closure_get_not_zero(): get a ref on a closure if it's alive, i.e. the ref is not zero. Signed-off-by: Kent Overstreet --- include/linux/closure.h | 23 ++++++++++++++++++++++ lib/closure.c | 52 +++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 69 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/closure.h b/include/linux/closure.h index 99155df162d0..59b8c06b11ff 100644 --- a/include/linux/closure.h +++ b/include/linux/closure.h @@ -284,6 +284,21 @@ static inline void closure_get(struct closure *cl) #endif } +/** + * closure_get_not_zero + */ +static inline bool closure_get_not_zero(struct closure *cl) +{ + unsigned old = atomic_read(&cl->remaining); + do { + if (!(old & CLOSURE_REMAINING_MASK)) + return false; + + } while (!atomic_try_cmpxchg_acquire(&cl->remaining, &old, old + 1)); + + return true; +} + /** * closure_init - Initialize a closure, setting the refcount to 1 * @cl: closure to initialize @@ -310,6 +325,12 @@ static inline void closure_init_stack(struct closure *cl) atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); } +static inline void closure_init_stack_release(struct closure *cl) +{ + memset(cl, 0, sizeof(struct closure)); + atomic_set_release(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +} + /** * closure_wake_up - wake up all closures on a wait list, * with memory barrier @@ -355,6 +376,8 @@ do { \ */ #define closure_return(_cl) continue_at((_cl), NULL, NULL) +void closure_return_sync(struct closure *cl); + /** * continue_at_nobarrier - jump to another function without barrier * diff --git a/lib/closure.c b/lib/closure.c index 2e1ee9fdec08..c971216d9d77 100644 --- a/lib/closure.c +++ b/lib/closure.c @@ -13,7 +13,7 @@ #include #include -static inline void closure_put_after_sub(struct closure *cl, int flags) +static inline void closure_put_after_sub_checks(int flags) { int r = flags & CLOSURE_REMAINING_MASK; @@ -22,12 +22,17 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) flags & CLOSURE_GUARD_MASK, (unsigned) __fls(r))) r &= ~CLOSURE_GUARD_MASK; - if (!r) { - smp_acquire__after_ctrl_dep(); + WARN(!r && (flags & ~CLOSURE_DESTRUCTOR), + "closure ref hit 0 with incorrect flags set: %x (%u)", + flags & ~CLOSURE_DESTRUCTOR, (unsigned) __fls(flags)); +} + +static inline void closure_put_after_sub(struct closure *cl, int flags) +{ + closure_put_after_sub_checks(flags); - WARN(flags & ~CLOSURE_DESTRUCTOR, - "closure ref hit 0 with incorrect flags set: %x (%u)", - flags & ~CLOSURE_DESTRUCTOR, (unsigned) __fls(flags)); + if (!(flags & CLOSURE_REMAINING_MASK)) { + smp_acquire__after_ctrl_dep(); cl->closure_get_happened = false; @@ -145,6 +150,41 @@ void __sched __closure_sync(struct closure *cl) } EXPORT_SYMBOL(__closure_sync); +/* + * closure_return_sync - finish running a closure, synchronously (i.e. waiting + * for outstanding get()s to finish) and returning once closure refcount is 0. + * + * Unlike closure_sync() this doesn't reinit the ref to 1; subsequent + * closure_get_not_zero() calls waill fail. + */ +void __sched closure_return_sync(struct closure *cl) +{ + struct closure_syncer s = { .task = current }; + + cl->s = &s; + set_closure_fn(cl, closure_sync_fn, NULL); + + unsigned flags = atomic_sub_return_release(1 + CLOSURE_RUNNING - CLOSURE_DESTRUCTOR, + &cl->remaining); + + closure_put_after_sub_checks(flags); + + if (unlikely(flags & CLOSURE_REMAINING_MASK)) { + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (s.done) + break; + schedule(); + } + + __set_current_state(TASK_RUNNING); + } + + if (cl->parent) + closure_put(cl->parent); +} +EXPORT_SYMBOL(closure_return_sync); + int __sched __closure_sync_timeout(struct closure *cl, unsigned long timeout) { struct closure_syncer s = { .task = current }; -- cgit v1.2.3 From 467cfe945656df044c8cf9121e5cdbe5b977b497 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Mon, 19 Feb 2024 13:43:55 +0200 Subject: accel/habanalabs/gaudi2: align embedded specs headers Align embedded headers to latest release. Reviewed-by: Tomer Tayar Signed-off-by: Ofir Bitton --- .../accel/habanalabs/include/gaudi2/gaudi2_fw_if.h | 27 ++++++-------------- .../habanalabs/include/gaudi2/gaudi2_reg_map.h | 8 ++++++ include/linux/habanalabs/cpucp_if.h | 10 ++++++-- include/linux/habanalabs/hl_boot_if.h | 29 ++++++++++++++++------ 4 files changed, 45 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h b/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h index 18ca147b1c86..6ea936c9594e 100644 --- a/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h +++ b/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h @@ -45,6 +45,13 @@ #define GAUDI2_ARM_RX_MB_OFFSET (GAUDI2_ARM_RX_MB_ADDR - \ GAUDI2_SP_SRAM_BASE_ADDR) +#define POWER_MODE_LEVELS { \ + 150000, /* 00 */ \ + 250000, /* 01 */ \ + 400000, /* 10 */ \ + /* 11: Normal mode */ \ +} + enum gaudi2_fw_status { GAUDI2_PID_STATUS_UP = 0x1, /* PID on ARC0 is up */ GAUDI2_ARM_STATUS_UP = 0x2, /* ARM Linux Boot complete */ @@ -52,26 +59,6 @@ enum gaudi2_fw_status { GAUDI2_STATUS_LAST = 0xFF }; -struct gaudi2_cold_rst_data { - union { - struct { - u32 recovery_flag: 1; - u32 validation_flag: 1; - u32 efuse_read_flag: 1; - u32 spsram_init_done : 1; - u32 fake_security_enable : 1; - u32 fake_sig_validation_en : 1; - u32 bist_skip_enable : 1; - u32 reserved1 : 1; - u32 fake_bis_compliant : 1; - u32 wd_rst_cause_arm : 1; - u32 wd_rst_cause_arcpid : 1; - u32 reserved : 21; - }; - __le32 data; - }; -}; - enum gaudi2_rst_src { HL_COLD_RST = 1, HL_MANUAL_RST = 2, diff --git a/drivers/accel/habanalabs/include/gaudi2/gaudi2_reg_map.h b/drivers/accel/habanalabs/include/gaudi2/gaudi2_reg_map.h index f3eaeb6d9b7e..1e9c056e437d 100644 --- a/drivers/accel/habanalabs/include/gaudi2/gaudi2_reg_map.h +++ b/drivers/accel/habanalabs/include/gaudi2/gaudi2_reg_map.h @@ -58,4 +58,12 @@ #define mmWD_GPIO_DATAOUT_REG mmPSOC_GPIO3_DATAOUT #define mmSTM_PROFILER_SPE_REG mmPSOC_STM_STMSPER +/* Registers below are used to pass the boot_if data between ARM and ARC1 */ +#define mmARM_MSG_BOOT_ERR_SET mmCPU_IF_SPECIAL_GLBL_SPARE_0 +#define mmARM_MSG_BOOT_ERR_CLR mmCPU_IF_SPECIAL_GLBL_SPARE_1 +#define mmARM_MSG_BOOT_DEV_STS_SET mmCPU_IF_SPECIAL_GLBL_SPARE_2 +#define mmARM_MSG_BOOT_DEV_STS_CLR mmCPU_IF_SPECIAL_GLBL_SPARE_3 +#define mmMGMT_MSG_BOOT_ERR mmCPU_MSTR_IF_SPECIAL_GLBL_SPARE_0 +#define mmMGMT_MSG_BOOT_DEV_STS mmCPU_MSTR_IF_SPECIAL_GLBL_SPARE_1 + #endif /* GAUDI2_REG_MAP_H_ */ diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h index f316c8d0f3fc..1ac1d68193e3 100644 --- a/include/linux/habanalabs/cpucp_if.h +++ b/include/linux/habanalabs/cpucp_if.h @@ -42,6 +42,12 @@ enum eq_event_id { EQ_EVENT_PWR_BRK_ENTRY, EQ_EVENT_PWR_BRK_EXIT, EQ_EVENT_HEARTBEAT, + EQ_EVENT_CPLD_RESET_REASON, + EQ_EVENT_CPLD_SHUTDOWN, + EQ_EVENT_POWER_EVT_START, + EQ_EVENT_POWER_EVT_END, + EQ_EVENT_THERMAL_EVT_START, + EQ_EVENT_THERMAL_EVT_END, }; /* @@ -1165,7 +1171,7 @@ struct cpucp_security_info { struct cpucp_info { struct cpucp_sensor sensors[CPUCP_MAX_SENSORS]; __u8 kernel_version[VERSION_MAX_LEN]; - __le32 reserved; + __le32 reserved1; __le32 card_type; __le32 card_location; __le32 cpld_version; @@ -1187,7 +1193,7 @@ struct cpucp_info { __u8 substrate_version; __u8 eq_health_check_supported; struct cpucp_security_info sec_info; - __le32 fw_hbm_region_size; + __le32 reserved2; __u8 pll_map[PLL_MAP_LEN]; __le64 mme_binning_mask; __u8 fw_os_version[VERSION_MAX_LEN]; diff --git a/include/linux/habanalabs/hl_boot_if.h b/include/linux/habanalabs/hl_boot_if.h index 93366d5621fd..d2a9fc96424b 100644 --- a/include/linux/habanalabs/hl_boot_if.h +++ b/include/linux/habanalabs/hl_boot_if.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 * - * Copyright 2018-2020 HabanaLabs, Ltd. + * Copyright 2018-2023 HabanaLabs, Ltd. * All Rights Reserved. * */ @@ -49,7 +49,6 @@ enum cpu_boot_err { #define CPU_BOOT_ERR_FATAL_MASK \ ((1 << CPU_BOOT_ERR_DRAM_INIT_FAIL) | \ (1 << CPU_BOOT_ERR_PLL_FAIL) | \ - (1 << CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL) | \ (1 << CPU_BOOT_ERR_BINNING_FAIL) | \ (1 << CPU_BOOT_ERR_DRAM_SKIPPED) | \ (1 << CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL) | \ @@ -194,6 +193,8 @@ enum cpu_boot_dev_sts { CPU_BOOT_DEV_STS_FW_NIC_STAT_EXT_EN = 24, CPU_BOOT_DEV_STS_IS_IDLE_CHECK_EN = 25, CPU_BOOT_DEV_STS_MAP_HWMON_EN = 26, + CPU_BOOT_DEV_STS_NIC_MEM_CLEAR_EN = 27, + CPU_BOOT_DEV_STS_MMU_PGTBL_DRAM_EN = 28, CPU_BOOT_DEV_STS_ENABLED = 31, CPU_BOOT_DEV_STS_SCND_EN = 63, CPU_BOOT_DEV_STS_LAST = 64 /* we have 2 registers of 32 bits */ @@ -331,6 +332,17 @@ enum cpu_boot_dev_sts { * HWMON enum mapping to cpucp enums. * Initialized in: linux * + * CPU_BOOT_DEV_STS0_NIC_MEM_CLEAR_EN + * If set, means f/w supports nic hbm memory clear and + * tmr,txs hbm memory init. + * Initialized in: zephyr-mgmt + * + * CPU_BOOT_DEV_STS_MMU_PGTBL_DRAM_EN + * MMU page tables are located in DRAM. + * F/W initializes security settings for MMU + * page tables to reside in DRAM. + * Initialized in: zephyr-mgmt + * * CPU_BOOT_DEV_STS0_ENABLED Device status register enabled. * This is a main indication that the * running FW populates the device status @@ -367,6 +379,8 @@ enum cpu_boot_dev_sts { #define CPU_BOOT_DEV_STS0_FW_NIC_STAT_EXT_EN (1 << CPU_BOOT_DEV_STS_FW_NIC_STAT_EXT_EN) #define CPU_BOOT_DEV_STS0_IS_IDLE_CHECK_EN (1 << CPU_BOOT_DEV_STS_IS_IDLE_CHECK_EN) #define CPU_BOOT_DEV_STS0_MAP_HWMON_EN (1 << CPU_BOOT_DEV_STS_MAP_HWMON_EN) +#define CPU_BOOT_DEV_STS0_NIC_MEM_CLEAR_EN (1 << CPU_BOOT_DEV_STS_NIC_MEM_CLEAR_EN) +#define CPU_BOOT_DEV_STS0_MMU_PGTBL_DRAM_EN (1 << CPU_BOOT_DEV_STS_MMU_PGTBL_DRAM_EN) #define CPU_BOOT_DEV_STS0_ENABLED (1 << CPU_BOOT_DEV_STS_ENABLED) #define CPU_BOOT_DEV_STS1_ENABLED (1 << CPU_BOOT_DEV_STS_ENABLED) @@ -450,11 +464,11 @@ struct cpu_dyn_regs { __le32 gic_dma_core_irq_ctrl; __le32 gic_host_halt_irq; __le32 gic_host_ints_irq; - __le32 gic_host_soft_rst_irq; + __le32 reserved0; __le32 gic_rot_qm_irq_ctrl; - __le32 cpu_rst_status; + __le32 reserved1; __le32 eng_arc_irq_ctrl; - __le32 reserved1[20]; /* reserve for future use */ + __le32 reserved2[20]; /* reserve for future use */ }; /* TODO: remove the desc magic after the code is updated to use message */ @@ -551,8 +565,9 @@ enum lkd_fw_ascii_msg_lvls { LKD_FW_ASCII_MSG_DBG = 3, }; -#define LKD_FW_ASCII_MSG_MAX_LEN 128 -#define LKD_FW_ASCII_MSG_MAX 4 /* consider ABI when changing */ +#define LKD_FW_ASCII_MSG_MAX_LEN 128 +#define LKD_FW_ASCII_MSG_MAX 4 /* consider ABI when changing */ +#define LKD_FW_ASCII_MSG_MIN_DESC_VERSION 3 struct lkd_fw_ascii_msg { __u8 valid; -- cgit v1.2.3 From c2a27584ff3b7f17da3ed2abfa3956f9d376e3da Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Mon, 11 Mar 2024 18:31:10 +0200 Subject: accel/habanalabs: separate nonce from max_size in cpucp_packet struct In struct cpucp_packet both nonce and data_max_size members are in an union overlapping each other. This is a problem as they both are used in attestation and info_signed packets. The solution here is to move the nonce member to a different union under the same structure. Signed-off-by: Dani Liberman Reviewed-by: Ofir Bitton Signed-off-by: Ofir Bitton --- include/linux/habanalabs/cpucp_if.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h index 1ac1d68193e3..0913415243e8 100644 --- a/include/linux/habanalabs/cpucp_if.h +++ b/include/linux/habanalabs/cpucp_if.h @@ -859,9 +859,6 @@ struct cpucp_packet { * result cannot be used to hold general purpose data. */ __le32 status_mask; - - /* random, used once number, for security packets */ - __le32 nonce; }; union { @@ -870,6 +867,9 @@ struct cpucp_packet { /* For Generic packet sub index */ __le32 pkt_subidx; + + /* random, used once number, for security packets */ + __le32 nonce; }; }; -- cgit v1.2.3 From cebb64f9335b073cc2877fed16c613c56bed82bc Mon Sep 17 00:00:00 2001 From: Vitaly Margolin Date: Mon, 27 May 2024 17:25:37 +0300 Subject: accel/habanalabs: add cpld ts cpld_timestamp cpucp Add cpld_timestamp field to cpucp_info structure and return cpld timestamp as part of cpld version Signed-off-by: Vitaly Margolin Reviewed-by: Ofir Bitton Signed-off-by: Ofir Bitton --- drivers/accel/habanalabs/common/sysfs.c | 5 +++-- include/linux/habanalabs/cpucp_if.h | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/accel/habanalabs/common/sysfs.c b/drivers/accel/habanalabs/common/sysfs.c index b6c63f8a0c1b..e9f8ccc0bbf9 100644 --- a/drivers/accel/habanalabs/common/sysfs.c +++ b/drivers/accel/habanalabs/common/sysfs.c @@ -142,8 +142,9 @@ static ssize_t cpld_ver_show(struct device *dev, struct device_attribute *attr, { struct hl_device *hdev = dev_get_drvdata(dev); - return sprintf(buf, "0x%08x\n", - le32_to_cpu(hdev->asic_prop.cpucp_info.cpld_version)); + return sprintf(buf, "0x%08x%08x\n", + le32_to_cpu(hdev->asic_prop.cpucp_info.cpld_timestamp), + le32_to_cpu(hdev->asic_prop.cpucp_info.cpld_version)); } static ssize_t cpucp_kernel_ver_show(struct device *dev, diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h index 0913415243e8..1ed17887f1a8 100644 --- a/include/linux/habanalabs/cpucp_if.h +++ b/include/linux/habanalabs/cpucp_if.h @@ -1146,6 +1146,7 @@ struct cpucp_security_info { * (0 = fully functional, 1 = lower-half is not functional, * 2 = upper-half is not functional) * @sec_info: security information + * @cpld_timestamp: CPLD programmed F/W timestamp. * @pll_map: Bit map of supported PLLs for current ASIC version. * @mme_binning_mask: MME binning mask, * bits [0:6] <==> dcore0 mme fma @@ -1193,7 +1194,7 @@ struct cpucp_info { __u8 substrate_version; __u8 eq_health_check_supported; struct cpucp_security_info sec_info; - __le32 reserved2; + __le32 cpld_timestamp; __u8 pll_map[PLL_MAP_LEN]; __le64 mme_binning_mask; __u8 fw_os_version[VERSION_MAX_LEN]; -- cgit v1.2.3 From c4548eee537ed7671170eb370c6c8db12dcfd3e8 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Tue, 16 Apr 2024 19:01:42 +0300 Subject: accel/habanalabs: dump the EQ entries headers on EQ heartbeat failure Add a dump of the EQ entries headers upon a EQ heartbeat failure. Signed-off-by: Tomer Tayar Reviewed-by: Ofir Bitton Signed-off-by: Ofir Bitton --- drivers/accel/habanalabs/common/device.c | 2 ++ drivers/accel/habanalabs/common/habanalabs.h | 1 + drivers/accel/habanalabs/common/irq.c | 25 +++++++++++++++++++++++++ include/linux/habanalabs/cpucp_if.h | 3 +++ 4 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index 3efc26dd9497..7bd7c2eb5dd2 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -1082,6 +1082,8 @@ static bool hl_device_eq_heartbeat_received(struct hl_device *hdev) atomic_read(&hdev->kernel_queues[cpu_q_id].ci), atomic_read(&hdev->kernel_queues[cpu_q_id].ci) & pq_pi_mask); + hl_eq_dump(hdev, &hdev->event_queue); + return false; } diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index f4ac5e9b1974..ce78b331e244 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -3754,6 +3754,7 @@ int hl_eq_init(struct hl_device *hdev, struct hl_eq *q); void hl_eq_fini(struct hl_device *hdev, struct hl_eq *q); void hl_cq_reset(struct hl_device *hdev, struct hl_cq *q); void hl_eq_reset(struct hl_device *hdev, struct hl_eq *q); +void hl_eq_dump(struct hl_device *hdev, struct hl_eq *q); irqreturn_t hl_irq_handler_cq(int irq, void *arg); irqreturn_t hl_irq_handler_eq(int irq, void *arg); irqreturn_t hl_irq_handler_dec_abnrm(int irq, void *arg); diff --git a/drivers/accel/habanalabs/common/irq.c b/drivers/accel/habanalabs/common/irq.c index 2caf2df4de08..7c9f2f6a2870 100644 --- a/drivers/accel/habanalabs/common/irq.c +++ b/drivers/accel/habanalabs/common/irq.c @@ -697,3 +697,28 @@ void hl_eq_reset(struct hl_device *hdev, struct hl_eq *q) memset(q->kernel_address, 0, q->size); } + +void hl_eq_dump(struct hl_device *hdev, struct hl_eq *q) +{ + u32 eq_length, eqe_size, ctl, ready, mode, type, index; + struct hl_eq_header *hdr; + u8 *ptr; + int i; + + eq_length = HL_EQ_LENGTH; + eqe_size = q->size / HL_EQ_LENGTH; + + dev_info(hdev->dev, "Contents of EQ entries headers:\n"); + + for (i = 0, ptr = q->kernel_address ; i < eq_length ; ++i, ptr += eqe_size) { + hdr = (struct hl_eq_header *) ptr; + ctl = le32_to_cpu(hdr->ctl); + ready = FIELD_GET(EQ_CTL_READY_MASK, ctl); + mode = FIELD_GET(EQ_CTL_EVENT_MODE_MASK, ctl); + type = FIELD_GET(EQ_CTL_EVENT_TYPE_MASK, ctl); + index = FIELD_GET(EQ_CTL_INDEX_MASK, ctl); + + dev_info(hdev->dev, "%02u: %#010x [ready: %u, mode %u, type %04u, index %05u]\n", + i, ctl, ready, mode, type, index); + } +} diff --git a/include/linux/habanalabs/cpucp_if.h b/include/linux/habanalabs/cpucp_if.h index 1ed17887f1a8..7ed3fdd55dda 100644 --- a/include/linux/habanalabs/cpucp_if.h +++ b/include/linux/habanalabs/cpucp_if.h @@ -397,6 +397,9 @@ struct hl_eq_entry { #define EQ_CTL_READY_SHIFT 31 #define EQ_CTL_READY_MASK 0x80000000 +#define EQ_CTL_EVENT_MODE_SHIFT 28 +#define EQ_CTL_EVENT_MODE_MASK 0x70000000 + #define EQ_CTL_EVENT_TYPE_SHIFT 16 #define EQ_CTL_EVENT_TYPE_MASK 0x0FFF0000 -- cgit v1.2.3 From c5603e2a621dac10c5e21cc430848ebcfa6c7e01 Mon Sep 17 00:00:00 2001 From: Doug Brown Date: Thu, 6 Jun 2024 12:56:31 -0700 Subject: Revert "serial: core: only stop transmit when HW fifo is empty" This reverts commit 7bfb915a597a301abb892f620fe5c283a9fdbd77. This commit broke pxa and omap-serial, because it inhibited them from calling stop_tx() if their TX FIFOs weren't completely empty. This resulted in these two drivers hanging during transmits because the TX interrupt would stay enabled, and a new TX interrupt would never fire. Cc: stable@vger.kernel.org Fixes: 7bfb915a597a ("serial: core: only stop transmit when HW fifo is empty") Signed-off-by: Doug Brown Link: https://lore.kernel.org/r/20240606195632.173255-2-doug@schmorgal.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 8cb65f50e830..3fb9a29e025f 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -811,8 +811,7 @@ enum UART_TX_FLAGS { if (pending < WAKEUP_CHARS) { \ uart_write_wakeup(__port); \ \ - if (!((flags) & UART_TX_NOSTOP) && pending == 0 && \ - __port->ops->tx_empty(__port)) \ + if (!((flags) & UART_TX_NOSTOP) && pending == 0) \ __port->ops->stop_tx(__port); \ } \ \ -- cgit v1.2.3 From 9bb43b9e8d9a288a214e9b17acc9e46fda3977cf Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Thu, 6 Jun 2024 12:56:32 -0700 Subject: serial: core: introduce uart_port_tx_limited_flags() Analogue to uart_port_tx_flags() introduced in commit 3ee07964d407 ("serial: core: introduce uart_port_tx_flags()"), add a _flags variant for uart_port_tx_limited(). Fixes: d11cc8c3c4b6 ("tty: serial: use uart_port_tx_limited()") Cc: stable@vger.kernel.org Signed-off-by: Jonas Gorski Signed-off-by: Doug Brown Link: https://lore.kernel.org/r/20240606195632.173255-3-doug@schmorgal.com Signed-off-by: Greg Kroah-Hartman --- include/linux/serial_core.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 3fb9a29e025f..aea25eef9a1a 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -850,6 +850,24 @@ enum UART_TX_FLAGS { __count--); \ }) +/** + * uart_port_tx_limited_flags -- transmit helper for uart_port with count limiting with flags + * @port: uart port + * @ch: variable to store a character to be written to the HW + * @flags: %UART_TX_NOSTOP or similar + * @count: a limit of characters to send + * @tx_ready: can HW accept more data function + * @put_char: function to write a character + * @tx_done: function to call after the loop is done + * + * See uart_port_tx_limited() for more details. + */ +#define uart_port_tx_limited_flags(port, ch, flags, count, tx_ready, put_char, tx_done) ({ \ + unsigned int __count = (count); \ + __uart_port_tx(port, ch, flags, tx_ready, put_char, tx_done, __count, \ + __count--); \ +}) + /** * uart_port_tx -- transmit helper for uart_port * @port: uart port -- cgit v1.2.3 From 4b8e88e563b5f666446d002ad0dc1e6e8e7102b0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Jun 2024 11:34:09 +0200 Subject: ftruncate: pass a signed offset The old ftruncate() syscall, using the 32-bit off_t misses a sign extension when called in compat mode on 64-bit architectures. As a result, passing a negative length accidentally succeeds in truncating to file size between 2GiB and 4GiB. Changing the type of the compat syscall to the signed compat_off_t changes the behavior so it instead returns -EINVAL. The native entry point, the truncate() syscall and the corresponding loff_t based variants are all correct already and do not suffer from this mistake. Fixes: 3f6d078d4acc ("fix compat truncate/ftruncate") Reviewed-by: Christian Brauner Cc: stable@vger.kernel.org Signed-off-by: Arnd Bergmann --- fs/open.c | 4 ++-- include/linux/compat.h | 2 +- include/linux/syscalls.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/open.c b/fs/open.c index 89cafb572061..50e45bc7c4d8 100644 --- a/fs/open.c +++ b/fs/open.c @@ -202,13 +202,13 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small) return error; } -SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) +SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length) { return do_sys_ftruncate(fd, length, 1); } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length) +COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length) { return do_sys_ftruncate(fd, length, 1); } diff --git a/include/linux/compat.h b/include/linux/compat.h index 233f61ec8afc..56cebaff0c91 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -608,7 +608,7 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf); asmlinkage long compat_sys_truncate(const char __user *, compat_off_t); -asmlinkage long compat_sys_ftruncate(unsigned int, compat_ulong_t); +asmlinkage long compat_sys_ftruncate(unsigned int, compat_off_t); /* No generic prototype for truncate64, ftruncate64, fallocate */ asmlinkage long compat_sys_openat(int dfd, const char __user *filename, int flags, umode_t mode); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 9104952d323d..ba9337709878 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -418,7 +418,7 @@ asmlinkage long sys_listmount(const struct mnt_id_req __user *req, u64 __user *mnt_ids, size_t nr_mnt_ids, unsigned int flags); asmlinkage long sys_truncate(const char __user *path, long length); -asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length); +asmlinkage long sys_ftruncate(unsigned int fd, off_t length); #if BITS_PER_LONG == 32 asmlinkage long sys_truncate64(const char __user *path, loff_t length); asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length); -- cgit v1.2.3 From 26b97668e5339434e5df8ddc7b1898a37a350112 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 24 Jun 2024 13:47:22 -0600 Subject: io_uring: remove dead struct io_submit_state member When the intermediate CQE aux cache got removed, any usage of the this member went away. As it isn't used anymore, kill it. Fixes: 902ce82c2aa1 ("io_uring: get rid of intermediate aux cqe caches") Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index b48570eaa449..7abdc0927124 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -207,7 +207,6 @@ struct io_submit_state { bool need_plug; bool cq_flush; unsigned short submit_nr; - unsigned int cqes_count; struct blk_plug plug; }; -- cgit v1.2.3 From 399ab86ea55039f9d0a5f621a68cb4631f796f37 Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Fri, 14 Jun 2024 23:20:14 +0000 Subject: /proc/pid/smaps: add mseal info for vma MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add sl in /proc/pid/smaps to indicate vma is sealed Link: https://lkml.kernel.org/r/20240614232014.806352-2-jeffxu@google.com Fixes: 8be7258aad44 ("mseal: add mseal syscall") Signed-off-by: Jeff Xu Acked-by: David Hildenbrand Cc: Adhemerval Zanella Cc: Jann Horn Cc: Jorge Lucangeli Obes Cc: Kees Cook Cc: Randy Dunlap Cc: Stephen Röttger Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 1 + fs/proc/task_mmu.c | 3 +++ include/linux/mm.h | 5 +++++ mm/internal.h | 5 ----- 4 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 7c3a565ffbef..82d142de3461 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -571,6 +571,7 @@ encoded manner. The codes are the following: um userfaultfd missing tracking uw userfaultfd wr-protect tracking ss shadow stack page + sl sealed == ======================================= Note that there is no guarantee that every flag and associated mnemonic will diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f8d35f993fe5..71e5039d940d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -706,6 +706,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ #ifdef CONFIG_X86_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", +#endif +#ifdef CONFIG_64BIT + [ilog2(VM_SEALED)] = "sl", #endif }; size_t i; diff --git a/include/linux/mm.h b/include/linux/mm.h index 9a5652c5fadd..eb7c96d24ac0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -406,6 +406,11 @@ extern unsigned int kobjsize(const void *objp); #define VM_ALLOW_ANY_UNCACHED VM_NONE #endif +#ifdef CONFIG_64BIT +/* VM is sealed, in vm_flags */ +#define VM_SEALED _BITUL(63) +#endif + /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) diff --git a/mm/internal.h b/mm/internal.h index c72c306761a4..6902b7dd8509 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1434,11 +1434,6 @@ void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); -#ifdef CONFIG_64BIT -/* VM is sealed, in vm_flags */ -#define VM_SEALED _BITUL(63) -#endif - #ifdef CONFIG_64BIT static inline int can_do_mseal(unsigned long flags) { -- cgit v1.2.3 From ff202303c398ed56386ca4954154de9a96eb732a Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Fri, 7 Jun 2024 13:29:53 -0700 Subject: mm: convert page type macros to enum Changing PG_slab from a page flag to a page type in commit 46df8e73a4a3 ("mm: free up PG_slab") in has the unintended consequence of removing the PG_slab constant from kernel debuginfo. The commit does add the value to the vmcoreinfo note, which allows debuggers to find the value without hardcoding it. However it's most flexible to continue representing the constant with an enum. To that end, convert the page type fields into an enum. Debuggers will now be able to detect that PG_slab's type has changed from enum pageflags to enum pagetype. Link: https://lkml.kernel.org/r/20240607202954.1198180-1-stephen.s.brennan@oracle.com Fixes: 46df8e73a4a3 ("mm: free up PG_slab") Signed-off-by: Stephen Brennan Acked-by: Vlastimil Babka Cc: David Hildenbrand Cc: Hao Ge Cc: Matthew Wilcox (Oracle) Cc: Omar Sandoval Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 104078afe0b1..b9e914e1face 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -944,15 +944,18 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) * mistaken for a page type value. */ -#define PAGE_TYPE_BASE 0xf0000000 -/* Reserve 0x0000007f to catch underflows of _mapcount */ -#define PAGE_MAPCOUNT_RESERVE -128 -#define PG_buddy 0x00000080 -#define PG_offline 0x00000100 -#define PG_table 0x00000200 -#define PG_guard 0x00000400 -#define PG_hugetlb 0x00000800 -#define PG_slab 0x00001000 +enum pagetype { + PG_buddy = 0x00000080, + PG_offline = 0x00000100, + PG_table = 0x00000200, + PG_guard = 0x00000400, + PG_hugetlb = 0x00000800, + PG_slab = 0x00001000, + + PAGE_TYPE_BASE = 0xf0000000, + /* Reserve 0x0000007f to catch underflows of _mapcount */ + PAGE_MAPCOUNT_RESERVE = -128, +}; #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) -- cgit v1.2.3 From bf14ed81f571f8dba31cd72ab2e50fbcc877cc31 Mon Sep 17 00:00:00 2001 From: yangge Date: Thu, 20 Jun 2024 08:59:50 +0800 Subject: mm/page_alloc: Separate THP PCP into movable and non-movable categories Since commit 5d0a661d808f ("mm/page_alloc: use only one PCP list for THP-sized allocations") no longer differentiates the migration type of pages in THP-sized PCP list, it's possible that non-movable allocation requests may get a CMA page from the list, in some cases, it's not acceptable. If a large number of CMA memory are configured in system (for example, the CMA memory accounts for 50% of the system memory), starting a virtual machine with device passthrough will get stuck. During starting the virtual machine, it will call pin_user_pages_remote(..., FOLL_LONGTERM, ...) to pin memory. Normally if a page is present and in CMA area, pin_user_pages_remote() will migrate the page from CMA area to non-CMA area because of FOLL_LONGTERM flag. But if non-movable allocation requests return CMA memory, migrate_longterm_unpinnable_pages() will migrate a CMA page to another CMA page, which will fail to pass the check in check_and_migrate_movable_pages() and cause migration endless. Call trace: pin_user_pages_remote --__gup_longterm_locked // endless loops in this function ----_get_user_pages_locked ----check_and_migrate_movable_pages ------migrate_longterm_unpinnable_pages --------alloc_migration_target This problem will also have a negative impact on CMA itself. For example, when CMA is borrowed by THP, and we need to reclaim it through cma_alloc() or dma_alloc_coherent(), we must move those pages out to ensure CMA's users can retrieve that contigous memory. Currently, CMA's memory is occupied by non-movable pages, meaning we can't relocate them. As a result, cma_alloc() is more likely to fail. To fix the problem above, we add one PCP list for THP, which will not introduce a new cacheline for struct per_cpu_pages. THP will have 2 PCP lists, one PCP list is used by MOVABLE allocation, and the other PCP list is used by UNMOVABLE allocation. MOVABLE allocation contains GPF_MOVABLE, and UNMOVABLE allocation contains GFP_UNMOVABLE and GFP_RECLAIMABLE. Link: https://lkml.kernel.org/r/1718845190-4456-1-git-send-email-yangge1116@126.com Fixes: 5d0a661d808f ("mm/page_alloc: use only one PCP list for THP-sized allocations") Signed-off-by: yangge Cc: Baolin Wang Cc: Barry Song <21cnbao@gmail.com> Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 9 ++++----- mm/page_alloc.c | 9 +++++++-- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8f9c9590a42c..586a8f0104d7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -654,13 +654,12 @@ enum zone_watermarks { }; /* - * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional list - * for THP which will usually be GFP_MOVABLE. Even if it is another type, - * it should not contribute to serious fragmentation causing THP allocation - * failures. + * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. Two additional lists + * are added for THP. One PCP list is used by GPF_MOVABLE, and the other PCP list + * is used by GFP_UNMOVABLE and GFP_RECLAIMABLE. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define NR_PCP_THP 1 +#define NR_PCP_THP 2 #else #define NR_PCP_THP 0 #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7300aa9f14b0..9ecf99190ea2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -504,10 +504,15 @@ out: static inline unsigned int order_to_pindex(int migratetype, int order) { + bool __maybe_unused movable; + #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (order > PAGE_ALLOC_COSTLY_ORDER) { VM_BUG_ON(order != HPAGE_PMD_ORDER); - return NR_LOWORDER_PCP_LISTS; + + movable = migratetype == MIGRATE_MOVABLE; + + return NR_LOWORDER_PCP_LISTS + movable; } #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); @@ -521,7 +526,7 @@ static inline int pindex_to_order(unsigned int pindex) int order = pindex / MIGRATE_PCPTYPES; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pindex == NR_LOWORDER_PCP_LISTS) + if (pindex >= NR_LOWORDER_PCP_LISTS) order = HPAGE_PMD_ORDER; #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); -- cgit v1.2.3 From cc8d5a2f09a54405321769abfd6ec3395482336a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 25 Jun 2024 07:58:10 +0200 Subject: Revert "printk: Save console options for add_preferred_console_match()" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit f03e8c1060f86c23eb49bafee99d9fcbd1c1bd77. Let's roll back all of the serial core and printk console changes that went into 6.10-rc1 as there still are problems with them that need to be sorted out. Link: https://lore.kernel.org/r/ZnpRozsdw6zbjqze@tlindgre-MOBL1 Reported-by: Petr Mladek Reported-by: Tony Lindgren Cc: Jiri Slaby Cc: John Ogness Cc: Sergey Senozhatsky Cc: Andy Shevchenko Cc: Ilpo Järvinen Signed-off-by: Greg Kroah-Hartman --- include/linux/printk.h | 3 - kernel/printk/Makefile | 2 +- kernel/printk/conopt.c | 146 ---------------------------------------- kernel/printk/console_cmdline.h | 6 -- kernel/printk/printk.c | 14 +--- 5 files changed, 4 insertions(+), 167 deletions(-) delete mode 100644 kernel/printk/conopt.c (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index 40afab23881a..65c5184470f1 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -60,9 +60,6 @@ static inline const char *printk_skip_headers(const char *buffer) #define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT #define CONSOLE_LOGLEVEL_QUIET CONFIG_CONSOLE_LOGLEVEL_QUIET -int add_preferred_console_match(const char *match, const char *name, - const short idx); - extern int console_printk[]; #define console_loglevel (console_printk[0]) diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 040fe7d1eda2..39a2b61c7232 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y = printk.o conopt.o +obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o nbcon.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o obj-$(CONFIG_PRINTK_INDEX) += index.o diff --git a/kernel/printk/conopt.c b/kernel/printk/conopt.c deleted file mode 100644 index 9d507bac3657..000000000000 --- a/kernel/printk/conopt.c +++ /dev/null @@ -1,146 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Kernel command line console options for hardware based addressing - * - * Copyright (C) 2023 Texas Instruments Incorporated - https://www.ti.com/ - * Author: Tony Lindgren - */ - -#include -#include -#include -#include - -#include - -#include "console_cmdline.h" - -/* - * Allow longer DEVNAME:0.0 style console naming such as abcd0000.serial:0.0 - * in addition to the legacy ttyS0 style naming. - */ -#define CONSOLE_NAME_MAX 32 - -#define CONSOLE_OPT_MAX 16 -#define CONSOLE_BRL_OPT_MAX 16 - -struct console_option { - char name[CONSOLE_NAME_MAX]; - char opt[CONSOLE_OPT_MAX]; - char brl_opt[CONSOLE_BRL_OPT_MAX]; - u8 has_brl_opt:1; -}; - -/* Updated only at console_setup() time, no locking needed */ -static struct console_option conopt[MAX_CMDLINECONSOLES]; - -/** - * console_opt_save - Saves kernel command line console option for driver use - * @str: Kernel command line console name and option - * @brl_opt: Braille console options - * - * Saves a kernel command line console option for driver subsystems to use for - * adding a preferred console during init. Called from console_setup() only. - * - * Return: 0 on success, negative error code on failure. - */ -int __init console_opt_save(const char *str, const char *brl_opt) -{ - struct console_option *con; - size_t namelen, optlen; - const char *opt; - int i; - - namelen = strcspn(str, ","); - if (namelen == 0 || namelen >= CONSOLE_NAME_MAX) - return -EINVAL; - - opt = str + namelen; - if (*opt == ',') - opt++; - - optlen = strlen(opt); - if (optlen >= CONSOLE_OPT_MAX) - return -EINVAL; - - for (i = 0; i < MAX_CMDLINECONSOLES; i++) { - con = &conopt[i]; - - if (con->name[0]) { - if (!strncmp(str, con->name, namelen)) - return 0; - continue; - } - - /* - * The name isn't terminated, only opt is. Empty opt is fine, - * but brl_opt can be either empty or NULL. For more info, see - * _braille_console_setup(). - */ - strscpy(con->name, str, namelen + 1); - strscpy(con->opt, opt, CONSOLE_OPT_MAX); - if (brl_opt) { - strscpy(con->brl_opt, brl_opt, CONSOLE_BRL_OPT_MAX); - con->has_brl_opt = 1; - } - - return 0; - } - - return -ENOMEM; -} - -static struct console_option *console_opt_find(const char *name) -{ - struct console_option *con; - int i; - - for (i = 0; i < MAX_CMDLINECONSOLES; i++) { - con = &conopt[i]; - if (!strcmp(name, con->name)) - return con; - } - - return NULL; -} - -/** - * add_preferred_console_match - Adds a preferred console if a match is found - * @match: Expected console on kernel command line, such as console=DEVNAME:0.0 - * @name: Name of the console character device to add such as ttyS - * @idx: Index for the console - * - * Allows driver subsystems to add a console after translating the command - * line name to the character device name used for the console. Options are - * added automatically based on the kernel command line. Duplicate preferred - * consoles are ignored by __add_preferred_console(). - * - * Return: 0 on success, negative error code on failure. - */ -int add_preferred_console_match(const char *match, const char *name, - const short idx) -{ - struct console_option *con; - char *brl_opt = NULL; - - if (!match || !strlen(match) || !name || !strlen(name) || - idx < 0) - return -EINVAL; - - con = console_opt_find(match); - if (!con) - return -ENOENT; - - /* - * See __add_preferred_console(). It checks for NULL brl_options to set - * the preferred_console flag. Empty brl_opt instead of NULL leads into - * the preferred_console flag not set, and CON_CONSDEV not being set, - * and the boot console won't get disabled at the end of console_setup(). - */ - if (con->has_brl_opt) - brl_opt = con->brl_opt; - - console_opt_add_preferred_console(name, idx, con->opt, brl_opt); - - return 0; -} diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h index a125e0235589..3ca74ad391d6 100644 --- a/kernel/printk/console_cmdline.h +++ b/kernel/printk/console_cmdline.h @@ -2,12 +2,6 @@ #ifndef _CONSOLE_CMDLINE_H #define _CONSOLE_CMDLINE_H -#define MAX_CMDLINECONSOLES 8 - -int console_opt_save(const char *str, const char *brl_opt); -int console_opt_add_preferred_console(const char *name, const short idx, - char *options, char *brl_options); - struct console_cmdline { char name[16]; /* Name of the driver */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b582404cd29d..dddb15f48d59 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -383,6 +383,9 @@ static int console_locked; /* * Array of consoles built from command line options (console=) */ + +#define MAX_CMDLINECONSOLES 8 + static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; @@ -2500,10 +2503,6 @@ static int __init console_setup(char *str) if (_braille_console_setup(&str, &brl_options)) return 1; - /* Save the console for driver subsystem use */ - if (console_opt_save(str, brl_options)) - return 1; - /* * Decode str into name, index, options. */ @@ -2534,13 +2533,6 @@ static int __init console_setup(char *str) } __setup("console=", console_setup); -/* Only called from add_preferred_console_match() */ -int console_opt_add_preferred_console(const char *name, const short idx, - char *options, char *brl_options) -{ - return __add_preferred_console(name, idx, options, brl_options, true); -} - /** * add_preferred_console - add a device to the list of preferred consoles. * @name: device name -- cgit v1.2.3 From 0fa8ab5f3533b307a7d0e438ab08ecd92725dad7 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 20 Jun 2024 14:47:27 +0200 Subject: linux/syscalls.h: add missing __user annotations A couple of declarations in linux/syscalls.h are missing __user annotations on their pointers, which can lead to warnings from sparse because these don't match the implementation that have the correct address space annotations. Signed-off-by: Arnd Bergmann --- include/linux/syscalls.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index ba9337709878..63424af87bba 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -322,13 +322,13 @@ asmlinkage long sys_io_pgetevents(aio_context_t ctx_id, long nr, struct io_event __user *events, struct __kernel_timespec __user *timeout, - const struct __aio_sigset *sig); + const struct __aio_sigset __user *sig); asmlinkage long sys_io_pgetevents_time32(aio_context_t ctx_id, long min_nr, long nr, struct io_event __user *events, struct old_timespec32 __user *timeout, - const struct __aio_sigset *sig); + const struct __aio_sigset __user *sig); asmlinkage long sys_io_uring_setup(u32 entries, struct io_uring_params __user *p); asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit, @@ -441,7 +441,7 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group); asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, umode_t mode); asmlinkage long sys_openat2(int dfd, const char __user *filename, - struct open_how *how, size_t size); + struct open_how __user *how, size_t size); asmlinkage long sys_close(unsigned int fd); asmlinkage long sys_close_range(unsigned int fd, unsigned int max_fd, unsigned int flags); @@ -555,7 +555,7 @@ asmlinkage long sys_get_robust_list(int pid, asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, size_t len); -asmlinkage long sys_futex_waitv(struct futex_waitv *waiters, +asmlinkage long sys_futex_waitv(struct futex_waitv __user *waiters, unsigned int nr_futexes, unsigned int flags, struct __kernel_timespec __user *timeout, clockid_t clockid); @@ -907,7 +907,7 @@ asmlinkage long sys_seccomp(unsigned int op, unsigned int flags, asmlinkage long sys_getrandom(char __user *buf, size_t count, unsigned int flags); asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags); -asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size); +asmlinkage long sys_bpf(int cmd, union bpf_attr __user *attr, unsigned int size); asmlinkage long sys_execveat(int dfd, const char __user *filename, const char __user *const __user *argv, const char __user *const __user *envp, int flags); @@ -960,11 +960,11 @@ asmlinkage long sys_cachestat(unsigned int fd, struct cachestat_range __user *cstat_range, struct cachestat __user *cstat, unsigned int flags); asmlinkage long sys_map_shadow_stack(unsigned long addr, unsigned long size, unsigned int flags); -asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx *ctx, - u32 *size, u32 flags); -asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx *ctx, +asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx __user *ctx, + u32 __user *size, u32 flags); +asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx __user *ctx, u32 size, u32 flags); -asmlinkage long sys_lsm_list_modules(u64 *ids, u32 *size, u32 flags); +asmlinkage long sys_lsm_list_modules(u64 __user *ids, u32 __user *size, u32 flags); /* * Architecture-specific system calls -- cgit v1.2.3 From 7e1f4eb9a60d40dd17a97d9b76818682a024a127 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 4 Apr 2024 12:04:54 +0200 Subject: kallsyms: rework symbol lookup return codes Building with W=1 in some configurations produces a false positive warning for kallsyms: kernel/kallsyms.c: In function '__sprint_symbol.isra': kernel/kallsyms.c:503:17: error: 'strcpy' source argument is the same as destination [-Werror=restrict] 503 | strcpy(buffer, name); | ^~~~~~~~~~~~~~~~~~~~ This originally showed up while building with -O3, but later started happening in other configurations as well, depending on inlining decisions. The underlying issue is that the local 'name' variable is always initialized to the be the same as 'buffer' in the called functions that fill the buffer, which gcc notices while inlining, though it could see that the address check always skips the copy. The calling conventions here are rather unusual, as all of the internal lookup functions (bpf_address_lookup, ftrace_mod_address_lookup, ftrace_func_address_lookup, module_address_lookup and kallsyms_lookup_buildid) already use the provided buffer and either return the address of that buffer to indicate success, or NULL for failure, but the callers are written to also expect an arbitrary other buffer to be returned. Rework the calling conventions to return the length of the filled buffer instead of its address, which is simpler and easier to follow as well as avoiding the warning. Leave only the kallsyms_lookup() calling conventions unchanged, since that is called from 16 different functions and adapting this would be a much bigger change. Link: https://lore.kernel.org/lkml/20200107214042.855757-1-arnd@arndb.de/ Link: https://lore.kernel.org/lkml/20240326130647.7bfb1d92@gandalf.local.home/ Tested-by: Geert Uytterhoeven Reviewed-by: Luis Chamberlain Acked-by: Steven Rostedt (Google) Signed-off-by: Arnd Bergmann --- include/linux/filter.h | 14 +++++++------- include/linux/ftrace.h | 6 +++--- include/linux/module.h | 14 +++++++------- kernel/bpf/core.c | 7 +++---- kernel/kallsyms.c | 23 ++++++++++++----------- kernel/module/kallsyms.c | 25 ++++++++++++------------- kernel/trace/ftrace.c | 13 +++++-------- 7 files changed, 49 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 0f12cf01070e..5669da513cd7 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1208,18 +1208,18 @@ static inline bool bpf_jit_kallsyms_enabled(void) return false; } -const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, +int __bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym); bool is_bpf_text_address(unsigned long addr); int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym); struct bpf_prog *bpf_prog_ksym_find(unsigned long addr); -static inline const char * +static inline int bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { - const char *ret = __bpf_address_lookup(addr, size, off, sym); + int ret = __bpf_address_lookup(addr, size, off, sym); if (ret && modname) *modname = NULL; @@ -1263,11 +1263,11 @@ static inline bool bpf_jit_kallsyms_enabled(void) return false; } -static inline const char * +static inline int __bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { - return NULL; + return 0; } static inline bool is_bpf_text_address(unsigned long addr) @@ -1286,11 +1286,11 @@ static inline struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) return NULL; } -static inline const char * +static inline int bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { - return NULL; + return 0; } static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 800995c425e0..b792274189a3 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -86,15 +86,15 @@ struct ftrace_hash; #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_MODULES) && \ defined(CONFIG_DYNAMIC_FTRACE) -const char * +int ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym); #else -static inline const char * +static inline int ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { - return NULL; + return 0; } #endif diff --git a/include/linux/module.h b/include/linux/module.h index ffa1c603163c..330ffb59efe5 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -931,11 +931,11 @@ int module_kallsyms_on_each_symbol(const char *modname, * least KSYM_NAME_LEN long: a pointer to namebuf is returned if * found, otherwise NULL. */ -const char *module_address_lookup(unsigned long addr, - unsigned long *symbolsize, - unsigned long *offset, - char **modname, const unsigned char **modbuildid, - char *namebuf); +int module_address_lookup(unsigned long addr, + unsigned long *symbolsize, + unsigned long *offset, + char **modname, const unsigned char **modbuildid, + char *namebuf); int lookup_module_symbol_name(unsigned long addr, char *symname); int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, @@ -964,14 +964,14 @@ static inline int module_kallsyms_on_each_symbol(const char *modname, } /* For kallsyms to ask for address resolution. NULL means not found. */ -static inline const char *module_address_lookup(unsigned long addr, +static inline int module_address_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, const unsigned char **modbuildid, char *namebuf) { - return NULL; + return 0; } static inline int lookup_module_symbol_name(unsigned long addr, char *symname) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1a6c3faa6e4a..695a0fb2cd4d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -736,11 +736,11 @@ static struct bpf_ksym *bpf_ksym_find(unsigned long addr) return n ? container_of(n, struct bpf_ksym, tnode) : NULL; } -const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, +int __bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { struct bpf_ksym *ksym; - char *ret = NULL; + int ret = 0; rcu_read_lock(); ksym = bpf_ksym_find(addr); @@ -748,9 +748,8 @@ const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long symbol_start = ksym->start; unsigned long symbol_end = ksym->end; - strscpy(sym, ksym->name, KSYM_NAME_LEN); + ret = strscpy(sym, ksym->name, KSYM_NAME_LEN); - ret = sym; if (size) *size = symbol_end - symbol_start; if (off) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 22ea19a36e6e..98b9622d372e 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -388,12 +388,12 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, !!__bpf_address_lookup(addr, symbolsize, offset, namebuf); } -static const char *kallsyms_lookup_buildid(unsigned long addr, +static int kallsyms_lookup_buildid(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, const unsigned char **modbuildid, char *namebuf) { - const char *ret; + int ret; namebuf[KSYM_NAME_LEN - 1] = 0; namebuf[0] = 0; @@ -410,7 +410,7 @@ static const char *kallsyms_lookup_buildid(unsigned long addr, if (modbuildid) *modbuildid = NULL; - ret = namebuf; + ret = strlen(namebuf); goto found; } @@ -442,8 +442,13 @@ const char *kallsyms_lookup(unsigned long addr, unsigned long *offset, char **modname, char *namebuf) { - return kallsyms_lookup_buildid(addr, symbolsize, offset, modname, - NULL, namebuf); + int ret = kallsyms_lookup_buildid(addr, symbolsize, offset, modname, + NULL, namebuf); + + if (!ret) + return NULL; + + return namebuf; } int lookup_symbol_name(unsigned long addr, char *symname) @@ -478,19 +483,15 @@ static int __sprint_symbol(char *buffer, unsigned long address, { char *modname; const unsigned char *buildid; - const char *name; unsigned long offset, size; int len; address += symbol_offset; - name = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid, + len = kallsyms_lookup_buildid(address, &size, &offset, &modname, &buildid, buffer); - if (!name) + if (!len) return sprintf(buffer, "0x%lx", address - symbol_offset); - if (name != buffer) - strcpy(buffer, name); - len = strlen(buffer); offset -= symbol_offset; if (add_offset) diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index 62fb57bb9f16..bf65e0c3c86f 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -321,14 +321,15 @@ void * __weak dereference_module_function_descriptor(struct module *mod, * For kallsyms to ask for address resolution. NULL means not found. Careful * not to lock to avoid deadlock on oopses, simply disable preemption. */ -const char *module_address_lookup(unsigned long addr, - unsigned long *size, - unsigned long *offset, - char **modname, - const unsigned char **modbuildid, - char *namebuf) +int module_address_lookup(unsigned long addr, + unsigned long *size, + unsigned long *offset, + char **modname, + const unsigned char **modbuildid, + char *namebuf) { - const char *ret = NULL; + const char *sym; + int ret = 0; struct module *mod; preempt_disable(); @@ -344,12 +345,10 @@ const char *module_address_lookup(unsigned long addr, #endif } - ret = find_kallsyms_symbol(mod, addr, size, offset); - } - /* Make a copy in here where it's safe */ - if (ret) { - strscpy(namebuf, ret, KSYM_NAME_LEN); - ret = namebuf; + sym = find_kallsyms_symbol(mod, addr, size, offset); + + if (sym) + ret = strscpy(namebuf, sym, KSYM_NAME_LEN); } preempt_enable(); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 65208d3b5ed9..eacab4020508 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6969,7 +6969,7 @@ allocate_ftrace_mod_map(struct module *mod, return mod_map; } -static const char * +static int ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, unsigned long addr, unsigned long *size, unsigned long *off, char *sym) @@ -6990,21 +6990,18 @@ ftrace_func_address_lookup(struct ftrace_mod_map *mod_map, *size = found_func->size; if (off) *off = addr - found_func->ip; - if (sym) - strscpy(sym, found_func->name, KSYM_NAME_LEN); - - return found_func->name; + return strscpy(sym, found_func->name, KSYM_NAME_LEN); } - return NULL; + return 0; } -const char * +int ftrace_mod_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { struct ftrace_mod_map *mod_map; - const char *ret = NULL; + int ret = 0; /* mod_map is freed via call_rcu() */ preempt_disable(); -- cgit v1.2.3 From f6549f538fe0b2c389e1a7037f4e21039e25137a Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Sat, 29 Jun 2024 14:42:12 +0200 Subject: ata,scsi: libata-core: Do not leak memory for ata_port struct members libsas is currently not freeing all the struct ata_port struct members, e.g. ncq_sense_buf for a driver supporting Command Duration Limits (CDL). Add a function, ata_port_free(), that is used to free a ata_port, including its struct members. It makes sense to keep the code related to freeing a ata_port in its own function, which will also free all the struct members of struct ata_port. Fixes: 18bd7718b5c4 ("scsi: ata: libata: Handle completion of CDL commands using policy 0xD") Reviewed-by: John Garry Link: https://lore.kernel.org/r/20240629124210.181537-8-cassel@kernel.org Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 24 ++++++++++++++---------- drivers/scsi/libsas/sas_ata.c | 6 +++--- drivers/scsi/libsas/sas_discover.c | 2 +- include/linux/libata.h | 1 + 4 files changed, 19 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index bdccf4ea251a..27d22bc43a95 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -5489,6 +5489,18 @@ struct ata_port *ata_port_alloc(struct ata_host *host) return ap; } +void ata_port_free(struct ata_port *ap) +{ + if (!ap) + return; + + kfree(ap->pmp_link); + kfree(ap->slave_link); + kfree(ap->ncq_sense_buf); + kfree(ap); +} +EXPORT_SYMBOL_GPL(ata_port_free); + static void ata_devres_release(struct device *gendev, void *res) { struct ata_host *host = dev_get_drvdata(gendev); @@ -5515,15 +5527,7 @@ static void ata_host_release(struct kref *kref) int i; for (i = 0; i < host->n_ports; i++) { - struct ata_port *ap = host->ports[i]; - - if (!ap) - continue; - - kfree(ap->pmp_link); - kfree(ap->slave_link); - kfree(ap->ncq_sense_buf); - kfree(ap); + ata_port_free(host->ports[i]); host->ports[i] = NULL; } kfree(host); @@ -5906,7 +5910,7 @@ int ata_host_register(struct ata_host *host, const struct scsi_host_template *sh * allocation time. */ for (i = host->n_ports; host->ports[i]; i++) - kfree(host->ports[i]); + ata_port_free(host->ports[i]); /* give ports names and add SCSI hosts */ for (i = 0; i < host->n_ports; i++) { diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c index 4c69fc63c119..cbbe43d8ef87 100644 --- a/drivers/scsi/libsas/sas_ata.c +++ b/drivers/scsi/libsas/sas_ata.c @@ -610,15 +610,15 @@ int sas_ata_init(struct domain_device *found_dev) rc = ata_sas_tport_add(ata_host->dev, ap); if (rc) - goto destroy_port; + goto free_port; found_dev->sata_dev.ata_host = ata_host; found_dev->sata_dev.ap = ap; return 0; -destroy_port: - kfree(ap); +free_port: + ata_port_free(ap); free_host: ata_host_put(ata_host); return rc; diff --git a/drivers/scsi/libsas/sas_discover.c b/drivers/scsi/libsas/sas_discover.c index 8fb7c41c0962..48d975c6dbf2 100644 --- a/drivers/scsi/libsas/sas_discover.c +++ b/drivers/scsi/libsas/sas_discover.c @@ -301,7 +301,7 @@ void sas_free_device(struct kref *kref) if (dev_is_sata(dev) && dev->sata_dev.ap) { ata_sas_tport_delete(dev->sata_dev.ap); - kfree(dev->sata_dev.ap); + ata_port_free(dev->sata_dev.ap); ata_host_put(dev->sata_dev.ata_host); dev->sata_dev.ata_host = NULL; dev->sata_dev.ap = NULL; diff --git a/include/linux/libata.h b/include/linux/libata.h index 13fb41d25da6..7d3bd7c9664a 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1249,6 +1249,7 @@ extern int ata_slave_link_init(struct ata_port *ap); extern struct ata_port *ata_sas_port_alloc(struct ata_host *, struct ata_port_info *, struct Scsi_Host *); extern void ata_port_probe(struct ata_port *ap); +extern void ata_port_free(struct ata_port *ap); extern int ata_sas_tport_add(struct device *parent, struct ata_port *ap); extern void ata_sas_tport_delete(struct ata_port *ap); int ata_sas_device_configure(struct scsi_device *sdev, struct queue_limits *lim, -- cgit v1.2.3