Diffstat (limited to 'include')
82 files changed, 830 insertions, 359 deletions
diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h
index 402a8c1c202d..a8f4b653ef4e 100644
--- a/include/asm-generic/mshyperv.h
+++ b/include/asm-generic/mshyperv.h
@@ -190,7 +190,7 @@ int hv_common_cpu_die(unsigned int cpu);
 void *hv_alloc_hyperv_page(void);
 void *hv_alloc_hyperv_zeroed_page(void);
-void hv_free_hyperv_page(unsigned long addr);
+void hv_free_hyperv_page(void *addr);
 
 /**
  * hv_cpu_number_to_vp_number() - Map CPU to VP.
diff --git a/include/asm-generic/word-at-a-time.h b/include/asm-generic/word-at-a-time.h
index 20c93f08c993..95a1d214108a 100644
--- a/include/asm-generic/word-at-a-time.h
+++ b/include/asm-generic/word-at-a-time.h
@@ -38,7 +38,7 @@ static inline long find_zero(unsigned long mask)
 	return (mask >> 8) ? byte : byte + 1;
 }
 
-static inline bool has_zero(unsigned long val, unsigned long *data, const struct word_at_a_time *c)
+static inline unsigned long has_zero(unsigned long val, unsigned long *data, const struct word_at_a_time *c)
 {
 	unsigned long rhs = val | c->low_bits;
 	*data = rhs;
diff --git a/include/drm/display/drm_dp.h b/include/drm/display/drm_dp.h
index 02f2ac4dd2df..e69cece404b3 100644
--- a/include/drm/display/drm_dp.h
+++ b/include/drm/display/drm_dp.h
@@ -1537,7 +1537,7 @@ enum drm_dp_phy {
 #define DP_BRANCH_OUI_HEADER_SIZE	0xc
 #define DP_RECEIVER_CAP_SIZE		0xf
-#define DP_DSC_RECEIVER_CAP_SIZE	0xf
+#define DP_DSC_RECEIVER_CAP_SIZE	0x10 /* DSC Capabilities 0x60 through 0x6F */
 #define EDP_PSR_RECEIVER_CAP_SIZE	2
 #define EDP_DISPLAY_CTL_CAP_SIZE	3
 #define DP_LTTPR_COMMON_CAP_SIZE	8
diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h
index 169755d3de19..48e93f909ef6 100644
--- a/include/drm/drm_edid.h
+++ b/include/drm/drm_edid.h
@@ -61,15 +61,9 @@ struct std_timing {
 	u8 vfreq_aspect;
 } __attribute__((packed));
 
-#define DRM_EDID_PT_SYNC_MASK              (3 << 3)
-# define DRM_EDID_PT_ANALOG_CSYNC          (0 << 3)
-# define DRM_EDID_PT_BIPOLAR_ANALOG_CSYNC  (1 << 3)
-# define DRM_EDID_PT_DIGITAL_CSYNC         (2 << 3)
-# define DRM_EDID_PT_CSYNC_ON_RGB          (1 << 1) /* analog csync only */
-# define DRM_EDID_PT_CSYNC_SERRATE         (1 << 2)
-# define DRM_EDID_PT_DIGITAL_SEPARATE_SYNC (3 << 3)
-# define DRM_EDID_PT_HSYNC_POSITIVE        (1 << 1) /* also digital csync */
-# define DRM_EDID_PT_VSYNC_POSITIVE        (1 << 2)
+#define DRM_EDID_PT_HSYNC_POSITIVE (1 << 1)
+#define DRM_EDID_PT_VSYNC_POSITIVE (1 << 2)
+#define DRM_EDID_PT_SEPARATE_SYNC  (3 << 3)
 #define DRM_EDID_PT_STEREO         (1 << 5)
 #define DRM_EDID_PT_INTERLACED     (1 << 7)
diff --git a/include/drm/drm_probe_helper.h b/include/drm/drm_probe_helper.h
index 4977e0ab72db..fad3c4003b2b 100644
--- a/include/drm/drm_probe_helper.h
+++ b/include/drm/drm_probe_helper.h
@@ -25,6 +25,7 @@ void drm_kms_helper_connector_hotplug_event(struct drm_connector *connector);
 void drm_kms_helper_poll_disable(struct drm_device *dev);
 void drm_kms_helper_poll_enable(struct drm_device *dev);
+void drm_kms_helper_poll_reschedule(struct drm_device *dev);
 bool drm_kms_helper_is_poll_worker(void);
 
 enum drm_mode_status drm_crtc_helper_mode_valid_fixed(struct drm_crtc *crtc,
diff --git a/include/linux/bio.h b/include/linux/bio.h
index c4f5b5228105..11984ed29cb8 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -791,7 +791,7 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page,
 static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb)
 {
 	bio->bi_opf |= REQ_POLLED;
-	if (!is_sync_kiocb(kiocb))
+	if (kiocb->ki_flags & IOCB_NOWAIT)
 		bio->bi_opf |= REQ_NOWAIT;
 }
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 0bad62cca3d0..d5c5e59ddbd2 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -52,7 +52,6 @@ struct block_device {
 	atomic_t		bd_openers;
 	spinlock_t		bd_size_lock; /* for bd_inode->i_size updates */
 	struct inode *		bd_inode;	/* will die */
-	struct super_block *	bd_super;
 	void *			bd_claiming;
 	void *			bd_holder;
 	const struct blk_holder_ops *bd_holder_ops;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ed44a997f629..83ce87354e9a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -750,7 +750,8 @@ static inline int bdev_read_only(struct block_device *bdev)
 }
 
 bool set_capacity_and_notify(struct gendisk *disk, sector_t size);
-bool disk_force_media_change(struct gendisk *disk, unsigned int events);
+void disk_force_media_change(struct gendisk *disk);
+void bdev_mark_dead(struct block_device *bdev, bool surprise);
 
 void add_disk_randomness(struct gendisk *disk) __latent_entropy;
 void rand_initialize_disk(struct gendisk *disk);
@@ -809,7 +810,6 @@ int __register_blkdev(unsigned int major, const char *name,
 void unregister_blkdev(unsigned int major, const char *name);
 
 bool disk_check_media_change(struct gendisk *disk);
-int __invalidate_device(struct block_device *bdev, bool kill_dirty);
 void set_capacity(struct gendisk *disk, sector_t size);
 
 #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
@@ -969,7 +969,6 @@ struct blk_plug {
 	bool multiple_queues;
 	bool has_elevator;
-	bool nowait;
 
 	struct list_head cb_list; /* md requires an unplug callback */
 };
@@ -1461,9 +1460,16 @@ void blkdev_show(struct seq_file *seqf, off_t offset);
 #endif
 
 struct blk_holder_ops {
-	void (*mark_dead)(struct block_device *bdev);
+	void (*mark_dead)(struct block_device *bdev, bool surprise);
+
+	/*
+	 * Sync the file system mounted on the block device.
+	 */
+	void (*sync)(struct block_device *bdev);
 };
 
+extern const struct blk_holder_ops fs_holder_ops;
+
 /*
  * Return the correct open flags for blkdev_get_by_* for super block flags
  * as stored in sb->s_flags.
@@ -1522,8 +1528,6 @@ static inline int early_lookup_bdev(const char *pathname, dev_t *dev)
 }
 #endif /* CONFIG_BLOCK */
 
-int fsync_bdev(struct block_device *bdev);
-
 int freeze_bdev(struct block_device *bdev);
 int thaw_bdev(struct block_device *bdev);
diff --git a/include/linux/clk.h b/include/linux/clk.h
index 1ef013324237..06f1b292f8a0 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -183,6 +183,39 @@ int clk_get_scaled_duty_cycle(struct clk *clk, unsigned int scale);
  */
 bool clk_is_match(const struct clk *p, const struct clk *q);
 
+/**
+ * clk_rate_exclusive_get - get exclusivity over the rate control of a
+ *                          producer
+ * @clk: clock source
+ *
+ * This function allows drivers to get exclusive control over the rate of a
+ * provider. It prevents any other consumer from executing, even indirectly,
+ * an operation which could alter the rate of the provider or cause glitches.
+ *
+ * If exclusivity is claimed more than once on clock, even by the same driver,
+ * the rate effectively gets locked as exclusivity can't be preempted.
+ *
+ * Must not be called from within atomic context.
+ *
+ * Returns success (0) or negative errno.
+ */
+int clk_rate_exclusive_get(struct clk *clk);
+
+/**
+ * clk_rate_exclusive_put - release exclusivity over the rate control of a
+ *                          producer
+ * @clk: clock source
+ *
+ * This function allows drivers to release the exclusivity it previously got
+ * from clk_rate_exclusive_get()
+ *
+ * The caller must balance the number of clk_rate_exclusive_get() and
+ * clk_rate_exclusive_put() calls.
+ *
+ * Must not be called from within atomic context.
+ */
+void clk_rate_exclusive_put(struct clk *clk);
+
 #else
 
 static inline int clk_notifier_register(struct clk *clk,
@@ -236,6 +269,13 @@ static inline bool clk_is_match(const struct clk *p, const struct clk *q)
 	return p == q;
 }
 
+static inline int clk_rate_exclusive_get(struct clk *clk)
+{
+	return 0;
+}
+
+static inline void clk_rate_exclusive_put(struct clk *clk) {}
+
 #endif
 
 #ifdef CONFIG_HAVE_CLK_PREPARE
@@ -583,38 +623,6 @@ struct clk *devm_clk_get_optional_enabled(struct device *dev, const char *id);
  */
 struct clk *devm_get_clk_from_child(struct device *dev,
 				    struct device_node *np, const char *con_id);
-/**
- * clk_rate_exclusive_get - get exclusivity over the rate control of a
- *                          producer
- * @clk: clock source
- *
- * This function allows drivers to get exclusive control over the rate of a
- * provider. It prevents any other consumer to execute, even indirectly,
- * opereation which could alter the rate of the provider or cause glitches
- *
- * If exlusivity is claimed more than once on clock, even by the same driver,
- * the rate effectively gets locked as exclusivity can't be preempted.
- *
- * Must not be called from within atomic context.
- *
- * Returns success (0) or negative errno.
- */
-int clk_rate_exclusive_get(struct clk *clk);
-
-/**
- * clk_rate_exclusive_put - release exclusivity over the rate control of a
- *                          producer
- * @clk: clock source
- *
- * This function allows drivers to release the exclusivity it previously got
- * from clk_rate_exclusive_get()
- *
- * The caller must balance the number of clk_rate_exclusive_get() and
- * clk_rate_exclusive_put() calls.
- *
- * Must not be called from within atomic context.
- */
-void clk_rate_exclusive_put(struct clk *clk);
 
 /**
  * clk_enable - inform the system when the clock source should be running.
@@ -974,14 +982,6 @@ static inline void clk_bulk_put_all(int num_clks,
 				    struct clk_bulk_data *clks) {}
 
 static inline void devm_clk_put(struct device *dev, struct clk *clk) {}
-
-static inline int clk_rate_exclusive_get(struct clk *clk)
-{
-	return 0;
-}
-
-static inline void clk_rate_exclusive_put(struct clk *clk) {}
-
 static inline int clk_enable(struct clk *clk)
 {
 	return 0;
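The clk_rate_exclusive_get()/clk_rate_exclusive_put() declarations above now exist for both the real and the stubbed-out configuration. A minimal consumer-side sketch of the intended pairing; the my_device structure, its clk field, and the 19.2 MHz rate are illustrative only:

static int my_start_transfer(struct my_device *dev)
{
	int ret;

	/* No other consumer may change the rate until the matching put. */
	ret = clk_rate_exclusive_get(dev->clk);
	if (ret)
		return ret;

	ret = clk_set_rate(dev->clk, 19200000);
	if (ret) {
		clk_rate_exclusive_put(dev->clk);
		return ret;
	}
	return 0;	/* rate stays locked until my_end_transfer() */
}

static void my_end_transfer(struct my_device *dev)
{
	/* Calls must balance: each get needs exactly one put. */
	clk_rate_exclusive_put(dev->clk);
}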
diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index 00efa35c350f..28566624f008 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -95,6 +95,19 @@
 #endif
 
 /*
+ * Optional: only supported since gcc >= 14
+ * Optional: only supported since clang >= 18
+ *
+ * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896
+ * clang: https://reviews.llvm.org/D148381
+ */
+#if __has_attribute(__counted_by__)
+# define __counted_by(member)	__attribute__((__counted_by__(member)))
+#else
+# define __counted_by(member)
+#endif
+
+/*
  * Optional: not supported by gcc
  * Optional: only supported since clang >= 14.0
  *
@@ -130,19 +143,6 @@
 #endif
 
 /*
- * Optional: only supported since gcc >= 14
- * Optional: only supported since clang >= 17
- *
- * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108896
- * clang: https://reviews.llvm.org/D148381
- */
-#if __has_attribute(__element_count__)
-# define __counted_by(member)	__attribute__((__element_count__(#member)))
-#else
-# define __counted_by(member)
-#endif
-
-/*
  * Optional: only supported since clang >= 14.0
  *
  * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-error-function-attribute
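The hunk above switches __counted_by() to the attribute spelling both compilers settled on (__counted_by__, taking the member name unquoted). A hedged sketch of how a flexible-array struct picks up the annotation; struct my_blob and n are made up, the dm-verity-loadpin hunk further down is the in-tree example:

struct my_blob {
	unsigned int	len;			/* element count for data[] */
	u8		data[] __counted_by(len);
};

static struct my_blob *my_blob_alloc(unsigned int n)
{
	/* struct_size() sizes the header plus n array elements */
	struct my_blob *b = kzalloc(struct_size(b, data, n), GFP_KERNEL);

	if (b)
		b->len = n;	/* keep the counter in sync with the allocation */
	return b;
}

With the annotation in place, FORTIFY_SOURCE and UBSAN_BOUNDS can check accesses to data[] against len at run time.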
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 547ea1ff806e..c523c6683789 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -106,6 +106,34 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { }
 #define __cold
 #endif
 
+/*
+ * On x86-64 and arm64 targets, __preserve_most changes the calling convention
+ * of a function to make the code in the caller as unintrusive as possible. This
+ * convention behaves identically to the C calling convention on how arguments
+ * and return values are passed, but uses a different set of caller- and callee-
+ * saved registers.
+ *
+ * The purpose is to alleviate the burden of saving and recovering a large
+ * register set before and after the call in the caller. This is beneficial for
+ * rarely taken slow paths, such as error-reporting functions that may be called
+ * from hot paths.
+ *
+ * Note: This may conflict with instrumentation inserted on function entry which
+ * does not use __preserve_most or equivalent convention (if in assembly). Since
+ * function tracing assumes the normal C calling convention, where the attribute
+ * is supported, __preserve_most implies notrace. It is recommended to restrict
+ * use of the attribute to functions that should or already disable tracing.
+ *
+ * Optional: not supported by gcc.
+ *
+ * clang: https://clang.llvm.org/docs/AttributeReference.html#preserve-most
+ */
+#if __has_attribute(__preserve_most__) && (defined(CONFIG_X86_64) || defined(CONFIG_ARM64))
+# define __preserve_most notrace __attribute__((__preserve_most__))
+#else
+# define __preserve_most
+#endif
+
 /* Builtins */
 
 /*
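A sketch of where __preserve_most is meant to land, following the comment above; the function names are hypothetical:

/* Rarely taken slow path, called from many hot call sites. */
extern void __preserve_most my_report_corruption(const char *what);

static __always_inline void my_check(struct my_obj *obj)
{
	if (unlikely(!my_obj_ok(obj)))
		my_report_corruption("my_obj");	/* call site stays cheap */
}

Because the callee saves the bulk of the register set, the inlined fast path does not have to spill registers around the call. The list hardening hunk in include/linux/list.h below uses exactly this pattern for the __list_*_valid_or_report() slow paths.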
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 62b32b19e0a8..fb2915676574 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -116,6 +116,7 @@ extern bool try_wait_for_completion(struct completion *x);
 extern bool completion_done(struct completion *x);
 
 extern void complete(struct completion *);
+extern void complete_on_current_cpu(struct completion *x);
 extern void complete_all(struct completion *);
 
 #endif
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 6e6e57ec69e8..e006c719182b 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -70,6 +70,10 @@ extern ssize_t cpu_show_mmio_stale_data(struct device *dev,
 					char *buf);
 extern ssize_t cpu_show_retbleed(struct device *dev,
 				 struct device_attribute *attr, char *buf);
+extern ssize_t cpu_show_spec_rstack_overflow(struct device *dev,
+					     struct device_attribute *attr, char *buf);
+extern ssize_t cpu_show_gds(struct device *dev,
+			    struct device_attribute *attr, char *buf);
 
 extern __printf(4, 5)
 struct device *cpu_device_create(struct device *parent, void *drvdata,
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 0d2e2a38b92d..f10fb87d49db 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -175,8 +175,8 @@ static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
 
 /**
  * cpumask_first_and - return the first cpu from *srcp1 & *srcp2
- * @src1p: the first input
- * @src2p: the second input
+ * @srcp1: the first input
+ * @srcp2: the second input
  *
  * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and().
  */
@@ -1197,6 +1197,10 @@ cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask,
 /**
  * cpumap_print_list_to_buf - copies the cpumask into the buffer as
  *	comma-separated list of cpus
+ * @buf: the buffer to copy into
+ * @mask: the cpumask to copy
+ * @off: in the string from which we are copying, we copy to @buf
+ * @count: the maximum number of bytes to print
  *
  * Everything is same with the above cpumap_print_bitmask_to_buf()
  * except the print format.
diff --git a/include/linux/dm-verity-loadpin.h b/include/linux/dm-verity-loadpin.h
index 552b817ab102..3ac6dbaeaa37 100644
--- a/include/linux/dm-verity-loadpin.h
+++ b/include/linux/dm-verity-loadpin.h
@@ -12,7 +12,7 @@ extern struct list_head dm_verity_loadpin_trusted_root_digests;
 struct dm_verity_loadpin_trusted_root_digest {
 	struct list_head node;
 	unsigned int len;
-	u8 data[];
+	u8 data[] __counted_by(len);
 };
 
 #if IS_ENABLED(CONFIG_SECURITY_LOADPIN_VERITY)
diff --git a/include/linux/dnotify.h b/include/linux/dnotify.h
index b1d26f9f1c9f..9f183a679277 100644
--- a/include/linux/dnotify.h
+++ b/include/linux/dnotify.h
@@ -30,7 +30,7 @@ struct dnotify_struct {
 			    FS_MOVED_FROM | FS_MOVED_TO)
 
 extern void dnotify_flush(struct file *, fl_owner_t);
-extern int fcntl_dirnotify(int, struct file *, unsigned long);
+extern int fcntl_dirnotify(int, struct file *, unsigned int);
 
 #else
 
@@ -38,7 +38,7 @@ static inline void dnotify_flush(struct file *filp, fl_owner_t id)
 {
 }
 
-static inline int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
+static inline int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
 {
 	return -EINVAL;
 }
diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index efcdd1631d9b..95e868e09e29 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -144,7 +144,7 @@ int fcntl_setlk64(unsigned int, struct file *, unsigned int,
 		struct flock64 *);
 #endif
 
-int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
+int fcntl_setlease(unsigned int fd, struct file *filp, int arg);
 int fcntl_getlease(struct file *filp);
 
 /* fs/locks.c */
@@ -167,8 +167,8 @@ bool vfs_inode_has_locks(struct inode *inode);
 int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl);
 int __break_lease(struct inode *inode, unsigned int flags, unsigned int type);
 void lease_get_mtime(struct inode *, struct timespec64 *time);
-int generic_setlease(struct file *, long, struct file_lock **, void **priv);
-int vfs_setlease(struct file *, long, struct file_lock **, void **);
+int generic_setlease(struct file *, int, struct file_lock **, void **priv);
+int vfs_setlease(struct file *, int, struct file_lock **, void **);
 int lease_modify(struct file_lock *, int, struct list_head *);
 
 struct notifier_block;
@@ -213,7 +213,7 @@ static inline int fcntl_setlk64(unsigned int fd, struct file *file,
 	return -EACCES;
 }
 #endif
-static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
+static inline int fcntl_setlease(unsigned int fd, struct file *filp, int arg)
 {
 	return -EINVAL;
 }
@@ -306,13 +306,13 @@ static inline void lease_get_mtime(struct inode *inode,
 	return;
 }
 
-static inline int generic_setlease(struct file *filp, long arg,
+static inline int generic_setlease(struct file *filp, int arg,
 				   struct file_lock **flp, void **priv)
 {
 	return -EINVAL;
 }
 
-static inline int vfs_setlease(struct file *filp, long arg,
+static inline int vfs_setlease(struct file *filp, int arg,
 			       struct file_lock **lease, void **priv)
 {
 	return -EINVAL;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6867512907d6..dda08d973639 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -338,6 +338,20 @@ enum rw_hint {
 #define IOCB_NOIO		(1 << 20)
 /* can use bio alloc cache */
 #define IOCB_ALLOC_CACHE	(1 << 21)
+/*
+ * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the
+ * iocb completion can be passed back to the owner for execution from a safe
+ * context rather than needing to be punted through a workqueue. If this
+ * flag is set, the bio completion handling may set iocb->dio_complete to a
+ * handler function and iocb->private to context information for that handler.
+ * The issuer should call the handler with that context information from task
+ * context to complete the processing of the iocb. Note that while this
+ * provides a task context for the dio_complete() callback, it should only be
+ * used on the completion side for non-IO generating completions. It's fine to
+ * call blocking functions from this callback, but they should not wait for
+ * unrelated IO (like cache flushing, new IO generation, etc).
+ */
+#define IOCB_DIO_CALLER_COMP	(1 << 22)
 
 /* for use in trace events */
 #define TRACE_IOCB_STRINGS \
@@ -351,7 +365,8 @@ enum rw_hint {
 	{ IOCB_WRITE,		"WRITE" }, \
 	{ IOCB_WAITQ,		"WAITQ" }, \
 	{ IOCB_NOIO,		"NOIO" }, \
-	{ IOCB_ALLOC_CACHE,	"ALLOC_CACHE" }
+	{ IOCB_ALLOC_CACHE,	"ALLOC_CACHE" }, \
+	{ IOCB_DIO_CALLER_COMP,	"CALLER_COMP" }
 
 struct kiocb {
 	struct file		*ki_filp;
@@ -360,7 +375,23 @@ struct kiocb {
 	void			*private;
 	int			ki_flags;
 	u16			ki_ioprio; /* See linux/ioprio.h */
-	struct wait_page_queue	*ki_waitq; /* for async buffered IO */
+	union {
+		/*
+		 * Only used for async buffered reads, where it denotes the
+		 * page waitqueue associated with completing the read. Valid
+		 * IFF IOCB_WAITQ is set.
+		 */
+		struct wait_page_queue	*ki_waitq;
+		/*
+		 * Can be used for O_DIRECT IO, where the completion handling
+		 * is punted back to the issuer of the IO. May only be set
+		 * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer
+		 * must then check for presence of this handler when ki_complete
+		 * is invoked. The data passed in to this handler must be
+		 * assigned to ->private when dio_complete is assigned.
+		 */
+		ssize_t (*dio_complete)(void *data);
+	};
 };
 
 static inline bool is_sync_kiocb(struct kiocb *kiocb)
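The issuer side of the IOCB_DIO_CALLER_COMP contract described above, as a hedged sketch; io_uring is the in-tree consumer, and the my_*() names here are invented:

static void my_ki_complete(struct kiocb *iocb, long res)
{
	/*
	 * If the bio completion installed a handler, run it from task
	 * context and let its return value become the request's result.
	 */
	if (iocb->dio_complete)
		res = iocb->dio_complete(iocb->private);
	/* ... post res to whoever waits on this request ... */
}

static void my_submit(struct kiocb *iocb)
{
	iocb->ki_flags |= IOCB_DIO_CALLER_COMP;	/* we promise a task context */
	/* ... issue the O_DIRECT IO ... */
}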
@@ -642,7 +673,7 @@ struct inode {
 	loff_t			i_size;
 	struct timespec64	i_atime;
 	struct timespec64	i_mtime;
-	struct timespec64	i_ctime;
+	struct timespec64	__i_ctime; /* use inode_*_ctime accessors! */
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
 	unsigned short		i_bytes;
 	u8			i_blkbits;
@@ -1069,7 +1100,7 @@ extern void fasync_free(struct fasync_struct *);
 extern void kill_fasync(struct fasync_struct **, int, int);
 
 extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
-extern int f_setown(struct file *filp, unsigned long arg, int force);
+extern int f_setown(struct file *filp, int who, int force);
 extern void f_delown(struct file *filp);
 extern pid_t f_getown(struct file *filp);
 extern int send_sigurg(struct fown_struct *fown);
@@ -1095,6 +1126,8 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_LAZYTIME	BIT(25)	/* Update the on-disk [acm]times lazily */
 
 /* These sb flags are internal to the kernel */
+#define SB_DEAD		BIT(21)
+#define SB_DYING	BIT(24)
 #define SB_SUBMOUNT	BIT(26)
 #define SB_FORCE	BIT(27)
 #define SB_NOSEC	BIT(28)
@@ -1147,7 +1180,8 @@ enum {
 #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)
 
 struct sb_writers {
-	int			frozen;		/* Is sb frozen? */
+	unsigned short		frozen;		/* Is sb frozen? */
+	unsigned short		freeze_holders;	/* Who froze fs? */
 	struct percpu_rw_semaphore	rw_sem[SB_FREEZE_LEVELS];
 };
 
@@ -1474,7 +1508,79 @@ static inline bool fsuidgid_has_mapping(struct super_block *sb,
 	       kgid_has_mapping(fs_userns, kgid);
 }
 
-extern struct timespec64 current_time(struct inode *inode);
+struct timespec64 current_mgtime(struct inode *inode);
+struct timespec64 current_time(struct inode *inode);
+struct timespec64 inode_set_ctime_current(struct inode *inode);
+
+/*
+ * Multigrain timestamps
+ *
+ * Conditionally use fine-grained ctime and mtime timestamps when there
+ * are users actively observing them via getattr. The primary use-case
+ * for this is NFS clients that use the ctime to distinguish between
+ * different states of the file, and that are often fooled by multiple
+ * operations that occur in the same coarse-grained timer tick.
+ *
+ * The kernel always keeps normalized struct timespec64 values in the ctime,
+ * which means that only the first 30 bits of the value are used. Use the
+ * 31st bit of the ctime's tv_nsec field as a flag to indicate that the value
+ * has been queried since it was last updated.
+ */
+#define I_CTIME_QUERIED		(1L<<30)
+
+/**
+ * inode_get_ctime - fetch the current ctime from the inode
+ * @inode: inode from which to fetch ctime
+ *
+ * Grab the current ctime tv_nsec field from the inode, mask off the
+ * I_CTIME_QUERIED flag and return it. This is mostly intended for use by
+ * internal consumers of the ctime that aren't concerned with ensuring a
+ * fine-grained update on the next change (e.g. when preparing to store
+ * the value in the backing store for later retrieval).
+ *
+ * This is safe to call regardless of whether the underlying filesystem
+ * is using multigrain timestamps.
+ */
+static inline struct timespec64 inode_get_ctime(const struct inode *inode)
+{
+	struct timespec64 ctime;
+
+	ctime.tv_sec = inode->__i_ctime.tv_sec;
+	ctime.tv_nsec = inode->__i_ctime.tv_nsec & ~I_CTIME_QUERIED;
+
+	return ctime;
+}
+
+/**
+ * inode_set_ctime_to_ts - set the ctime in the inode
+ * @inode: inode in which to set the ctime
+ * @ts: value to set in the ctime field
+ *
+ * Set the ctime in @inode to @ts
+ */
+static inline struct timespec64 inode_set_ctime_to_ts(struct inode *inode,
+						      struct timespec64 ts)
+{
+	inode->__i_ctime = ts;
+	return ts;
+}
+
+/**
+ * inode_set_ctime - set the ctime in the inode
+ * @inode: inode in which to set the ctime
+ * @sec: tv_sec value to set
+ * @nsec: tv_nsec value to set
+ *
+ * Set the ctime in @inode to { @sec, @nsec }
+ */
+static inline struct timespec64 inode_set_ctime(struct inode *inode,
+						time64_t sec, long nsec)
+{
+	struct timespec64 ts = { .tv_sec  = sec,
				 .tv_nsec = nsec };
+
+	return inode_set_ctime_to_ts(inode, ts);
+}
 
 /*
  * Snapshotting support.
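With i_ctime renamed to __i_ctime, filesystems are expected to go through the accessors above rather than touch the field directly. A short sketch; my_touch() is a made-up helper:

static void my_touch(struct inode *dir, struct inode *inode)
{
	/*
	 * inode_set_ctime_current() picks coarse- or fine-grained time as
	 * appropriate and returns what it stored, so mtime can follow it.
	 */
	inode->i_mtime = inode_set_ctime_current(inode);

	/* wholesale copies go through the _to_ts variant (cf. fs_stack.h below) */
	inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
}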
@@ -1770,6 +1876,7 @@ struct dir_context {
 
 struct iov_iter;
 struct io_uring_cmd;
+struct offset_ctx;
 
 struct file_operations {
 	struct module *owner;
@@ -1780,7 +1887,6 @@ struct file_operations {
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *,
 			unsigned int flags);
-	int (*iterate) (struct file *, struct dir_context *);
 	int (*iterate_shared) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
@@ -1799,7 +1905,7 @@ struct file_operations {
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
 	void (*splice_eof)(struct file *file);
-	int (*setlease)(struct file *, long, struct file_lock **, void **);
+	int (*setlease)(struct file *, int, struct file_lock **, void **);
 	long (*fallocate)(struct file *file, int mode, loff_t offset,
 			  loff_t len);
 	void (*show_fdinfo)(struct seq_file *m, struct file *f);
@@ -1817,6 +1923,13 @@ struct file_operations {
 				unsigned int poll_flags);
 } __randomize_layout;
 
+/* Wrap a directory iterator that needs exclusive inode access */
+int wrap_directory_iterator(struct file *, struct dir_context *,
+			    int (*) (struct file *, struct dir_context *));
+#define WRAP_DIR_ITER(x) \
+	static int shared_##x(struct file *file, struct dir_context *ctx) \
+	{ return wrap_directory_iterator(file, ctx, x); }
+
 struct inode_operations {
 	struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
 	const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
@@ -1844,7 +1957,7 @@ struct inode_operations {
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
 		      u64 len);
-	int (*update_time)(struct inode *, struct timespec64 *, int);
+	int (*update_time)(struct inode *, int);
 	int (*atomic_open)(struct inode *, struct dentry *,
 			   struct file *, unsigned open_flag,
 			   umode_t create_mode);
@@ -1857,6 +1970,7 @@ struct inode_operations {
 	int (*fileattr_set)(struct mnt_idmap *idmap,
 			    struct dentry *dentry, struct fileattr *fa);
 	int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
+	struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
 } ____cacheline_aligned;
 
 static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
@@ -1902,6 +2016,10 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
 					struct file *dst_file, loff_t dst_pos,
 					loff_t len, unsigned int remap_flags);
 
+enum freeze_holder {
+	FREEZE_HOLDER_KERNEL	= (1U << 0),
+	FREEZE_HOLDER_USERSPACE	= (1U << 1),
+};
 
 struct super_operations {
 	struct inode *(*alloc_inode)(struct super_block *sb);
@@ -1914,9 +2032,9 @@ struct super_operations {
 	void (*evict_inode) (struct inode *);
 	void (*put_super) (struct super_block *);
 	int (*sync_fs)(struct super_block *sb, int wait);
-	int (*freeze_super) (struct super_block *);
+	int (*freeze_super) (struct super_block *, enum freeze_holder who);
 	int (*freeze_fs) (struct super_block *);
-	int (*thaw_super) (struct super_block *);
+	int (*thaw_super) (struct super_block *, enum freeze_holder who);
 	int (*unfreeze_fs) (struct super_block *);
 	int (*statfs) (struct dentry *, struct kstatfs *);
 	int (*remount_fs) (struct super_block *, int *, char *);
@@ -2194,7 +2312,7 @@ enum file_time_flags {
 
 extern bool atime_needs_update(const struct path *, struct inode *);
 extern void touch_atime(const struct path *);
-int inode_update_time(struct inode *inode, struct timespec64 *time, int flags);
+int inode_update_time(struct inode *inode, int flags);
 
 static inline void file_accessed(struct file *file)
 {
@@ -2216,6 +2334,7 @@ struct file_system_type {
 #define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
 #define FS_DISALLOW_NOTIFY_PERM	16	/* Disable fanotify permission events */
 #define FS_ALLOW_IDMAP		32	/* FS has been updated to handle vfs idmappings. */
+#define FS_MGTIME		64	/* FS uses multigrain timestamps */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
 	int (*init_fs_context)(struct fs_context *);
 	const struct fs_parameter_spec *parameters;
@@ -2239,6 +2358,17 @@ struct file_system_type {
 
 #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)
 
+/**
+ * is_mgtime: is this inode using multigrain timestamps
+ * @inode: inode to test for multigrain timestamps
+ *
+ * Return true if the inode uses multigrain timestamps, false otherwise.
+ */
+static inline bool is_mgtime(const struct inode *inode)
+{
+	return inode->i_sb->s_type->fs_flags & FS_MGTIME;
+}
+
 extern struct dentry *mount_bdev(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data,
 	int (*fill_super)(struct super_block *, void *, int));
@@ -2290,8 +2420,8 @@ extern int unregister_filesystem(struct file_system_type *);
 extern int vfs_statfs(const struct path *, struct kstatfs *);
 extern int user_statfs(const char __user *, struct kstatfs *);
 extern int fd_statfs(int, struct kstatfs *);
-extern int freeze_super(struct super_block *super);
-extern int thaw_super(struct super_block *super);
+int freeze_super(struct super_block *super, enum freeze_holder who);
+int thaw_super(struct super_block *super, enum freeze_holder who);
 extern __printf(2, 3)
 int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
 extern int super_setup_bdi(struct super_block *sb);
@@ -2300,7 +2430,8 @@ extern int current_umask(void);
 
 extern void ihold(struct inode * inode);
 extern void iput(struct inode *);
-extern int generic_update_time(struct inode *, struct timespec64 *, int);
+int inode_update_timestamps(struct inode *inode, int flags);
+int generic_update_time(struct inode *, int);
 
 /* /sys/fs */
 extern struct kobject *fs_kobj;
@@ -2539,6 +2670,13 @@ static inline bool inode_wrong_type(const struct inode *inode, umode_t mode)
 	return (inode->i_mode ^ mode) & S_IFMT;
 }
 
+/**
+ * file_start_write - get write access to a superblock for regular file io
+ * @file: the file we want to write to
+ *
+ * This is a variant of sb_start_write() which is a no-op on non-regular files.
+ * Should be matched with a call to file_end_write().
+ */
 static inline void file_start_write(struct file *file)
 {
 	if (!S_ISREG(file_inode(file)->i_mode))
@@ -2553,11 +2691,53 @@ static inline bool file_start_write_trylock(struct file *file)
 	return sb_start_write_trylock(file_inode(file)->i_sb);
 }
 
+/**
+ * file_end_write - drop write access to a superblock of a regular file
+ * @file: the file we wrote to
+ *
+ * Should be matched with a call to file_start_write().
+ */
 static inline void file_end_write(struct file *file)
 {
 	if (!S_ISREG(file_inode(file)->i_mode))
 		return;
-	__sb_end_write(file_inode(file)->i_sb, SB_FREEZE_WRITE);
+	sb_end_write(file_inode(file)->i_sb);
+}
+
+/**
+ * kiocb_start_write - get write access to a superblock for async file io
+ * @iocb: the io context we want to submit the write with
+ *
+ * This is a variant of sb_start_write() for async io submission.
+ * Should be matched with a call to kiocb_end_write().
+ */
+static inline void kiocb_start_write(struct kiocb *iocb)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	sb_start_write(inode->i_sb);
+	/*
+	 * Fool lockdep by telling it the lock got released so that it
+	 * doesn't complain about the held lock when we return to userspace.
+	 */
+	__sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
+}
+
+/**
+ * kiocb_end_write - drop write access to a superblock after async file io
+ * @iocb: the io context we submitted the write with
+ *
+ * Should be matched with a call to kiocb_start_write().
+ */
+static inline void kiocb_end_write(struct kiocb *iocb)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	/*
+	 * Tell lockdep we inherited freeze protection from submission thread.
+	 */
+	__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
+	sb_end_write(inode->i_sb);
+}
 
 /*
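How the kiocb_{start,end}_write() pair above is meant to be used from an async write path; a sketch with invented my_*() names, where freeze protection travels from the submitting task to the completion:

static int my_async_write(struct kiocb *iocb, struct iov_iter *from)
{
	int ret;

	kiocb_start_write(iocb);
	ret = my_issue_write(iocb, from);	/* hypothetical backend */
	if (ret < 0)
		kiocb_end_write(iocb);		/* nothing in flight, undo */
	return ret;
}

/* runs on the completion side, possibly in another task */
static void my_write_done(struct kiocb *iocb, long res)
{
	kiocb_end_write(iocb);
	iocb->ki_complete(iocb, res);
}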
@@ -2607,8 +2787,7 @@ static inline bool inode_is_open_for_write(const struct inode *inode)
 #if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
 static inline void i_readcount_dec(struct inode *inode)
 {
-	BUG_ON(!atomic_read(&inode->i_readcount));
-	atomic_dec(&inode->i_readcount);
+	BUG_ON(atomic_dec_return(&inode->i_readcount) < 0);
 }
 static inline void i_readcount_inc(struct inode *inode)
 {
@@ -2874,7 +3053,8 @@ extern void page_put_link(void *);
 extern int page_symlink(struct inode *inode, const char *symname, int len);
 extern const struct inode_operations page_symlink_inode_operations;
 extern void kfree_link(void *);
-void generic_fillattr(struct mnt_idmap *, struct inode *, struct kstat *);
+void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode);
+void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *);
 void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
 extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
 extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
@@ -2913,7 +3093,6 @@ extern int vfs_readlink(struct dentry *, char __user *, int);
 extern struct file_system_type *get_filesystem(struct file_system_type *fs);
 extern void put_filesystem(struct file_system_type *fs);
 extern struct file_system_type *get_fs_type(const char *name);
-extern struct super_block *get_super(struct block_device *);
 extern struct super_block *get_active_super(struct block_device *bdev);
 extern void drop_super(struct super_block *sb);
 extern void drop_super_exclusive(struct super_block *sb);
@@ -2934,6 +3113,8 @@ extern int simple_open(struct inode *inode, struct file *file);
 extern int simple_link(struct dentry *, struct inode *, struct dentry *);
 extern int simple_unlink(struct inode *, struct dentry *);
 extern int simple_rmdir(struct inode *, struct dentry *);
+void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
+			     struct inode *new_dir, struct dentry *new_dentry);
 extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
 				  struct inode *new_dir, struct dentry *new_dentry);
 extern int simple_rename(struct mnt_idmap *, struct inode *,
@@ -2950,7 +3131,7 @@ extern int simple_write_begin(struct file *file, struct address_space *mapping,
 extern const struct address_space_operations ram_aops;
 extern int always_delete_dentry(const struct dentry *);
 extern struct inode *alloc_anon_inode(struct super_block *);
-extern int simple_nosetlease(struct file *, long, struct file_lock **, void **);
+extern int simple_nosetlease(struct file *, int, struct file_lock **, void **);
 extern const struct dentry_operations simple_dentry_operations;
 
 extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags);
@@ -2971,6 +3152,22 @@ extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
 extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
 		const void __user *from, size_t count);
 
+struct offset_ctx {
+	struct xarray	xa;
+	u32		next_offset;
+};
+
+void simple_offset_init(struct offset_ctx *octx);
+int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry);
+void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry);
+int simple_offset_rename_exchange(struct inode *old_dir,
+				  struct dentry *old_dentry,
+				  struct inode *new_dir,
+				  struct dentry *new_dentry);
+void simple_offset_destroy(struct offset_ctx *octx);
+
+extern const struct file_operations simple_offset_dir_operations;
+
 extern int __generic_file_fsync(struct file *, loff_t, loff_t, int);
 extern int generic_file_fsync(struct file *, loff_t, loff_t, int);
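Wiring for the new simple_offset API above, sketched for a hypothetical libfs-style filesystem; MY_FS_I() and the ops table are illustrative, tmpfs is the real user (see the shmem_fs.h hunk below):

static struct offset_ctx *my_get_offset_ctx(struct inode *inode)
{
	return &MY_FS_I(inode)->dir_offsets;	/* init with simple_offset_init() */
}

static const struct inode_operations my_dir_inode_operations = {
	.lookup		= simple_lookup,
	.get_offset_ctx	= my_get_offset_ctx,
};

/* directory inodes then use the libfs-provided file operations:
 *	inode->i_op  = &my_dir_inode_operations;
 *	inode->i_fop = &simple_offset_dir_operations;
 * and call simple_offset_add()/simple_offset_remove() from the
 * create/unlink paths.
 */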
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index ff6341e09925..96332db693d5 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -109,6 +109,7 @@ struct fs_context {
 	bool			need_free:1;	/* Need to call ops->free() */
 	bool			global:1;	/* Goes into &init_user_ns */
 	bool			oldapi:1;	/* Coming from mount(2) */
+	bool			exclusive:1;	/* create new superblock, reject existing one */
 };
 
 struct fs_context_operations {
@@ -150,14 +151,13 @@ extern int get_tree_nodev(struct fs_context *fc,
 extern int get_tree_single(struct fs_context *fc,
 			 int (*fill_super)(struct super_block *sb,
 					   struct fs_context *fc));
-extern int get_tree_single_reconf(struct fs_context *fc,
-			 int (*fill_super)(struct super_block *sb,
-					   struct fs_context *fc));
 extern int get_tree_keyed(struct fs_context *fc,
 			 int (*fill_super)(struct super_block *sb,
 					   struct fs_context *fc),
 			 void *key);
 
+int setup_bdev_super(struct super_block *sb, int sb_flags,
+		struct fs_context *fc);
 extern int get_tree_bdev(struct fs_context *fc,
 			       int (*fill_super)(struct super_block *sb,
 						 struct fs_context *fc));
diff --git a/include/linux/fs_stack.h b/include/linux/fs_stack.h
index 54210a42c30d..010d39d0dc1c 100644
--- a/include/linux/fs_stack.h
+++ b/include/linux/fs_stack.h
@@ -24,7 +24,7 @@ static inline void fsstack_copy_attr_times(struct inode *dest,
 {
 	dest->i_atime = src->i_atime;
 	dest->i_mtime = src->i_mtime;
-	dest->i_ctime = src->i_ctime;
+	inode_set_ctime_to_ts(dest, inode_get_ctime(src));
 }
 
 #endif /* _LINUX_FS_STACK_H */
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 20284387b841..e718dbe928ba 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -25,9 +25,6 @@ static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
 #endif
 
 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf);
-struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
-				   unsigned long addr, pmd_t *pmd,
-				   unsigned int flags);
 bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			   pmd_t *pmd, unsigned long addr, unsigned long next);
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd,
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index bfbc37ce223b..3ac3974b3c78 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1239,9 +1239,6 @@ extern int vmbus_recvpacket_raw(struct vmbus_channel *channel,
 				     u32 *buffer_actual_len,
 				     u64 *requestid);
 
-
-extern void vmbus_ontimer(unsigned long data);
-
 /* Base driver object */
 struct hv_driver {
 	const char *name;
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index e6936cb25047..33f21bd85dbf 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -100,10 +100,16 @@ struct rapl_package;
 
 #define RAPL_DOMAIN_NAME_LENGTH 16
 
+union rapl_reg {
+	void __iomem *mmio;
+	u32 msr;
+	u64 val;
+};
+
 struct rapl_domain {
 	char name[RAPL_DOMAIN_NAME_LENGTH];
 	enum rapl_domain_type id;
-	u64 regs[RAPL_DOMAIN_REG_MAX];
+	union rapl_reg regs[RAPL_DOMAIN_REG_MAX];
 	struct powercap_zone power_zone;
 	struct rapl_domain_data rdd;
 	struct rapl_power_limit rpl[NR_POWER_LIMITS];
@@ -116,7 +122,7 @@ struct rapl_domain {
 };
 
 struct reg_action {
-	u64 reg;
+	union rapl_reg reg;
 	u64 mask;
 	u64 value;
 	int err;
@@ -143,8 +149,8 @@ struct rapl_if_priv {
 	enum rapl_if_type type;
 	struct powercap_control_type *control_type;
 	enum cpuhp_state pcap_rapl_online;
-	u64 reg_unit;
-	u64 regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX];
+	union rapl_reg reg_unit;
+	union rapl_reg regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX];
 	int limits[RAPL_DOMAIN_MAX];
 	int (*read_raw)(int id, struct reg_action *ra);
 	int (*write_raw)(int id, struct reg_action *ra);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e2b836c2e119..fdc6e64f49d6 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -261,9 +261,10 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
 int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
 void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
 bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
-struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos);
+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
 bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
 void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
+bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
 int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
 		const struct iomap_ops *ops);
 int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
diff --git a/include/linux/list.h b/include/linux/list.h
index f10344dbad4d..164b4d0e9d2a 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -38,11 +38,92 @@ static inline void INIT_LIST_HEAD(struct list_head *list)
 	WRITE_ONCE(list->prev, list);
 }
 
+#ifdef CONFIG_LIST_HARDENED
+
 #ifdef CONFIG_DEBUG_LIST
-extern bool __list_add_valid(struct list_head *new,
-			      struct list_head *prev,
-			      struct list_head *next);
-extern bool __list_del_entry_valid(struct list_head *entry);
+# define __list_valid_slowpath
+#else
+# define __list_valid_slowpath __cold __preserve_most
+#endif
+
+/*
+ * Performs the full set of list corruption checks before __list_add().
+ * On list corruption reports a warning, and returns false.
+ */
+extern bool __list_valid_slowpath __list_add_valid_or_report(struct list_head *new,
+							     struct list_head *prev,
+							     struct list_head *next);
+
+/*
+ * Performs list corruption checks before __list_add(). Returns false if a
+ * corruption is detected, true otherwise.
+ *
+ * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking
+ * inline to catch non-faulting corruptions, and only if a corruption is
+ * detected calls the reporting function __list_add_valid_or_report().
+ */
+static __always_inline bool __list_add_valid(struct list_head *new,
+					     struct list_head *prev,
+					     struct list_head *next)
+{
+	bool ret = true;
+
+	if (!IS_ENABLED(CONFIG_DEBUG_LIST)) {
+		/*
+		 * With the hardening version, elide checking if next and prev
+		 * are NULL, since the immediate dereference of them below would
+		 * result in a fault if NULL.
+		 *
+		 * With the reduced set of checks, we can afford to inline the
+		 * checks, which also gives the compiler a chance to elide some
+		 * of them completely if they can be proven at compile-time. If
+		 * one of the pre-conditions does not hold, the slow-path will
+		 * show a report which pre-condition failed.
+		 */
+		if (likely(next->prev == prev && prev->next == next && new != prev && new != next))
+			return true;
+		ret = false;
+	}
+
+	ret &= __list_add_valid_or_report(new, prev, next);
+	return ret;
+}
+
+/*
+ * Performs the full set of list corruption checks before __list_del_entry().
+ * On list corruption reports a warning, and returns false.
+ */
+extern bool __list_valid_slowpath __list_del_entry_valid_or_report(struct list_head *entry);
+
+/*
+ * Performs list corruption checks before __list_del_entry(). Returns false if a
+ * corruption is detected, true otherwise.
+ *
+ * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking
+ * inline to catch non-faulting corruptions, and only if a corruption is
+ * detected calls the reporting function __list_del_entry_valid_or_report().
+ */
+static __always_inline bool __list_del_entry_valid(struct list_head *entry)
+{
+	bool ret = true;
+
+	if (!IS_ENABLED(CONFIG_DEBUG_LIST)) {
+		struct list_head *prev = entry->prev;
+		struct list_head *next = entry->next;
+
+		/*
+		 * With the hardening version, elide checking if next and prev
+		 * are NULL, LIST_POISON1 or LIST_POISON2, since the immediate
+		 * dereference of them below would result in a fault.
+		 */
+		if (likely(prev->next == entry && next->prev == entry))
+			return true;
+		ret = false;
+	}
+
+	ret &= __list_del_entry_valid_or_report(entry);
+	return ret;
+}
 #else
 static inline bool __list_add_valid(struct list_head *new,
 				struct list_head *prev,
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 7308a1a7599b..af796986baee 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -54,6 +54,7 @@ LSM_HOOK(int, 0, bprm_creds_from_file, struct linux_binprm *bprm, struct file *f
 LSM_HOOK(int, 0, bprm_check_security, struct linux_binprm *bprm)
 LSM_HOOK(void, LSM_RET_VOID, bprm_committing_creds, struct linux_binprm *bprm)
 LSM_HOOK(void, LSM_RET_VOID, bprm_committed_creds, struct linux_binprm *bprm)
+LSM_HOOK(int, 0, fs_context_submount, struct fs_context *fc, struct super_block *reference)
 LSM_HOOK(int, 0, fs_context_dup, struct fs_context *fc, struct fs_context *src_sc)
 LSM_HOOK(int, -ENOPARAM, fs_context_parse_param, struct fs_context *fc,
 	 struct fs_parameter *param)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 406ab9ea818f..34f9dba17c1a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3421,15 +3421,24 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
  * Indicates whether GUP can follow a PROT_NONE mapped page, or whether
  * a (NUMA hinting) fault is required.
  */
-static inline bool gup_can_follow_protnone(unsigned int flags)
+static inline bool gup_can_follow_protnone(struct vm_area_struct *vma,
+					   unsigned int flags)
 {
 	/*
-	 * FOLL_FORCE has to be able to make progress even if the VMA is
-	 * inaccessible. Further, FOLL_FORCE access usually does not represent
-	 * application behaviour and we should avoid triggering NUMA hinting
-	 * faults.
+	 * If callers don't want to honor NUMA hinting faults, no need to
+	 * determine if we would actually have to trigger a NUMA hinting fault.
 	 */
-	return flags & FOLL_FORCE;
+	if (!(flags & FOLL_HONOR_NUMA_FAULT))
+		return true;
+
+	/*
+	 * NUMA hinting faults don't apply in inaccessible (PROT_NONE) VMAs.
+	 *
+	 * Requiring a fault here even for inaccessible VMAs would mean that
+	 * FOLL_FORCE cannot make any progress, because handle_mm_fault()
+	 * refuses to process NUMA hinting faults in inaccessible VMAs.
+	 */
+	return !vma_is_accessible(vma);
 }
 
 typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5e74ce4a28cd..7d30dc4ff0ff 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1286,6 +1286,15 @@ enum {
 	FOLL_PCI_P2PDMA = 1 << 10,
 	/* allow interrupts from generic signals */
 	FOLL_INTERRUPTIBLE = 1 << 11,
+	/*
+	 * Always honor (trigger) NUMA hinting faults.
+	 *
+	 * FOLL_WRITE implicitly honors NUMA hinting faults because a
+	 * PROT_NONE-mapped page is not writable (exceptions with FOLL_FORCE
+	 * apply). get_user_pages_fast_only() always implicitly honors NUMA
+	 * hinting faults.
+	 */
+	FOLL_HONOR_NUMA_FAULT = 1 << 12,
 
 	/* See also internal only FOLL flags in mm/internal.h */
 };
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index 86544707236a..45702bdcbceb 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -73,9 +73,7 @@ struct raw_notifier_head {
 
 struct srcu_notifier_head {
 	struct mutex mutex;
-#ifdef CONFIG_TREE_SRCU
 	struct srcu_usage srcuu;
-#endif
 	struct srcu_struct srcu;
 	struct notifier_block __rcu *head;
 };
@@ -106,7 +104,6 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
 #define RAW_NOTIFIER_INIT(name)	{				\
 		.head = NULL }
 
-#ifdef CONFIG_TREE_SRCU
 #define SRCU_NOTIFIER_INIT(name, pcpu)				\
 	{							\
 		.mutex = __MUTEX_INITIALIZER(name.mutex),	\
@@ -114,14 +111,6 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
 		.srcuu = __SRCU_USAGE_INIT(name.srcuu),		\
 		.srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \
 	}
-#else
-#define SRCU_NOTIFIER_INIT(name, pcpu)				\
-	{							\
-		.mutex = __MUTEX_INITIALIZER(name.mutex),	\
-		.head = NULL,					\
-		.srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \
-	}
-#endif
 
 #define ATOMIC_NOTIFIER_HEAD(name)				\
 	struct atomic_notifier_head name =			\
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index fee881cded01..771cb0285872 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -29,7 +29,7 @@ struct fs_struct;
  * nsproxy is copied.
  */
 struct nsproxy {
-	atomic_t count;
+	refcount_t count;
 	struct uts_namespace *uts_ns;
 	struct ipc_namespace *ipc_ns;
 	struct mnt_namespace *mnt_ns;
@@ -102,14 +102,13 @@ int __init nsproxy_cache_init(void);
 
 static inline void put_nsproxy(struct nsproxy *ns)
 {
-	if (atomic_dec_and_test(&ns->count)) {
+	if (refcount_dec_and_test(&ns->count))
 		free_nsproxy(ns);
-	}
 }
 
 static inline void get_nsproxy(struct nsproxy *ns)
{
-	atomic_inc(&ns->count);
+	refcount_inc(&ns->count);
 }
 
 #endif
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 716953ee1ebd..d87840acbfb2 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -470,6 +470,19 @@ static inline void *detach_page_private(struct page *page)
 	return folio_detach_private(page_folio(page));
 }
 
+/*
+ * There are some parts of the kernel which assume that PMD entries
+ * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then,
+ * limit the maximum allocation order to PMD size. I'm not aware of any
+ * assumptions about maximum order if THP are disabled, but 8 seems like
+ * a good order (that's 1MB if you're using 4kB pages)
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define MAX_PAGECACHE_ORDER	HPAGE_PMD_ORDER
+#else
+#define MAX_PAGECACHE_ORDER	8
+#endif
+
 #ifdef CONFIG_NUMA
 struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order);
 #else
@@ -501,22 +514,69 @@ pgoff_t page_cache_next_miss(struct address_space *mapping,
 pgoff_t page_cache_prev_miss(struct address_space *mapping,
 			     pgoff_t index, unsigned long max_scan);
 
-#define FGP_ACCESSED		0x00000001
-#define FGP_LOCK		0x00000002
-#define FGP_CREAT		0x00000004
-#define FGP_WRITE		0x00000008
-#define FGP_NOFS		0x00000010
-#define FGP_NOWAIT		0x00000020
-#define FGP_FOR_MMAP		0x00000040
-#define FGP_STABLE		0x00000080
+/**
+ * typedef fgf_t - Flags for getting folios from the page cache.
+ *
+ * Most users of the page cache will not need to use these flags;
+ * there are convenience functions such as filemap_get_folio() and
+ * filemap_lock_folio(). For users which need more control over exactly
+ * what is done with the folios, these flags to __filemap_get_folio()
+ * are available.
+ *
+ * * %FGP_ACCESSED - The folio will be marked accessed.
+ * * %FGP_LOCK - The folio is returned locked.
+ * * %FGP_CREAT - If no folio is present then a new folio is allocated,
+ *   added to the page cache and the VM's LRU list. The folio is
+ *   returned locked.
+ * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
+ *   folio is already in cache. If the folio was allocated, unlock it
+ *   before returning so the caller can do the same dance.
+ * * %FGP_WRITE - The folio will be written to by the caller.
+ * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
+ * * %FGP_NOWAIT - Don't block on the folio lock.
+ * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
+ * * %FGP_WRITEBEGIN - The flags to use in a filesystem write_begin()
+ *   implementation.
+ */
+typedef unsigned int __bitwise fgf_t;
+
+#define FGP_ACCESSED		((__force fgf_t)0x00000001)
+#define FGP_LOCK		((__force fgf_t)0x00000002)
+#define FGP_CREAT		((__force fgf_t)0x00000004)
+#define FGP_WRITE		((__force fgf_t)0x00000008)
+#define FGP_NOFS		((__force fgf_t)0x00000010)
+#define FGP_NOWAIT		((__force fgf_t)0x00000020)
+#define FGP_FOR_MMAP		((__force fgf_t)0x00000040)
+#define FGP_STABLE		((__force fgf_t)0x00000080)
+#define FGF_GET_ORDER(fgf)	(((__force unsigned)fgf) >> 26)	/* top 6 bits */
 
 #define FGP_WRITEBEGIN		(FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)
 
+/**
+ * fgf_set_order - Encode a length in the fgf_t flags.
+ * @size: The suggested size of the folio to create.
+ *
+ * The caller of __filemap_get_folio() can use this to suggest a preferred
+ * size for the folio that is created. If there is already a folio at
+ * the index, it will be returned, no matter what its size. If a folio
+ * is freshly created, it may be of a different size than requested
+ * due to alignment constraints, memory pressure, or the presence of
+ * other folios at nearby indices.
+ */
+static inline fgf_t fgf_set_order(size_t size)
+{
+	unsigned int shift = ilog2(size);
+
+	if (shift <= PAGE_SHIFT)
+		return 0;
+	return (__force fgf_t)((shift - PAGE_SHIFT) << 26);
+}
+
 void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp);
+		fgf_t fgp_flags, gfp_t gfp);
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp);
+		fgf_t fgp_flags, gfp_t gfp);
 
 /**
  * filemap_get_folio - Find and get a folio.
@@ -590,7 +650,7 @@ static inline struct page *find_get_page(struct address_space *mapping,
 }
 
 static inline struct page *find_get_page_flags(struct address_space *mapping,
-					pgoff_t offset, int fgp_flags)
+					pgoff_t offset, fgf_t fgp_flags)
 {
 	return pagecache_get_page(mapping, offset, fgp_flags, 0);
 }
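What fgf_set_order() looks like at a call site; a sketch of a write_begin()-style lookup, where my_get_large_folio() is an invented name:

static struct folio *my_get_large_folio(struct address_space *mapping,
					loff_t pos, size_t len)
{
	fgf_t fgp = FGP_WRITEBEGIN | fgf_set_order(len);

	/*
	 * The returned folio may be smaller than the suggested order;
	 * __filemap_get_folio() returns an ERR_PTR() on failure.
	 */
	return __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp,
				   mapping_gfp_mask(mapping));
}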
*/ +enum page_walk_lock { + /* mmap_lock should be locked for read to stabilize the vma tree */ + PGWALK_RDLOCK = 0, + /* vma will be write-locked during the walk */ + PGWALK_WRLOCK = 1, + /* vma is expected to be already write-locked during the walk */ + PGWALK_WRLOCK_VERIFY = 2, +}; + /** * struct mm_walk_ops - callbacks for walk_page_range * @pgd_entry: if set, called for each non-empty PGD (top-level) entry @@ -66,6 +76,7 @@ struct mm_walk_ops { int (*pre_vma)(unsigned long start, unsigned long end, struct mm_walk *walk); void (*post_vma)(struct mm_walk *walk); + enum page_walk_lock walk_lock; }; /* diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 02e0086b10f6..608a9eb86bff 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -269,10 +269,10 @@ bool pipe_is_unprivileged_user(void); /* for F_SETPIPE_SZ and F_GETPIPE_SZ */ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots); -long pipe_fcntl(struct file *, unsigned int, unsigned long arg); +long pipe_fcntl(struct file *, unsigned int, unsigned int arg); struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice); int create_pipe_files(struct file **, int); -unsigned int round_pipe_size(unsigned long size); +unsigned int round_pipe_size(unsigned int size); #endif diff --git a/include/linux/prefetch.h b/include/linux/prefetch.h index b83a3f944f28..b068e2e60939 100644 --- a/include/linux/prefetch.h +++ b/include/linux/prefetch.h @@ -25,11 +25,10 @@ struct page; prefetch() should be defined by the architecture, if not, the #define below provides a no-op define. - There are 3 prefetch() macros: + There are 2 prefetch() macros: prefetch(x) - prefetches the cacheline at "x" for read prefetchw(x) - prefetches the cacheline at "x" for write - spin_lock_prefetch(x) - prefetches the spinlock *x for taking there is also PREFETCH_STRIDE which is the architecure-preferred "lookahead" size for prefetching streamed operations. 
@@ -44,10 +43,6 @@ struct page; #define prefetchw(x) __builtin_prefetch(x,1) #endif -#ifndef ARCH_HAS_SPINLOCK_PREFETCH -#define spin_lock_prefetch(x) prefetchw(x) -#endif - #ifndef PREFETCH_STRIDE #define PREFETCH_STRIDE (4*L1_CACHE_BYTES) #endif diff --git a/include/linux/raid_class.h b/include/linux/raid_class.h index 6a9b177d5c41..e50416ba9cd9 100644 --- a/include/linux/raid_class.h +++ b/include/linux/raid_class.h @@ -77,7 +77,3 @@ DEFINE_RAID_ATTRIBUTE(enum raid_state, state) struct raid_template *raid_class_attach(struct raid_function_template *); void raid_class_release(struct raid_template *); - -int __must_check raid_component_add(struct raid_template *, struct device *, - struct device *); - diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index ba4c00dd8005..89186c499dd4 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -101,7 +101,7 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n, { struct hlist_nulls_node *first = h->first; - n->next = first; + WRITE_ONCE(n->next, first); WRITE_ONCE(n->pprev, &h->first); rcu_assign_pointer(hlist_nulls_first_rcu(h), n); if (!is_a_nulls(first)) @@ -137,7 +137,7 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, last = i; if (last) { - n->next = last->next; + WRITE_ONCE(n->next, last->next); n->pprev = &last->next; rcu_assign_pointer(hlist_nulls_next_rcu(last), n); } else { diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index 9bc8cbb33340..eda493200663 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -87,6 +87,7 @@ static inline void rcu_read_unlock_trace(void) void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); void synchronize_rcu_tasks_trace(void); void rcu_barrier_tasks_trace(void); +struct task_struct *get_rcu_tasks_trace_gp_kthread(void); #else /* * The BPF JIT forms these addresses even when it doesn't call these diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h index 699b938358bf..5e0f74f2f8ca 100644 --- a/include/linux/rcupdate_wait.h +++ b/include/linux/rcupdate_wait.h @@ -42,6 +42,11 @@ do { \ * call_srcu() function, with this wrapper supplying the pointer to the * corresponding srcu_struct. * + * Note that call_rcu_hurry() should be used instead of call_rcu() + * because in kernels built with CONFIG_RCU_LAZY=y the delay between the + * invocation of call_rcu() and that of the corresponding RCU callback + * can be multiple seconds. + * * The first argument tells Tiny RCU's _wait_rcu_gp() not to * bother waiting for RCU. 
The reason for this is that anywhere * synchronize_rcu_mult() can be called is automatically already a full diff --git a/include/linux/security.h b/include/linux/security.h index 32828502f09e..bac98ea18f78 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -293,6 +293,7 @@ int security_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file); int security_bprm_check(struct linux_binprm *bprm); void security_bprm_committing_creds(struct linux_binprm *bprm); void security_bprm_committed_creds(struct linux_binprm *bprm); +int security_fs_context_submount(struct fs_context *fc, struct super_block *reference); int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc); int security_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param); int security_sb_alloc(struct super_block *sb); @@ -629,6 +630,11 @@ static inline void security_bprm_committed_creds(struct linux_binprm *bprm) { } +static inline int security_fs_context_submount(struct fs_context *fc, + struct super_block *reference) +{ + return 0; +} static inline int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) { diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index bd023dd38ae6..386ab580b839 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -249,18 +249,19 @@ static inline void seq_show_option(struct seq_file *m, const char *name, /** * seq_show_option_n - display mount options with appropriate escapes - * where @value must be a specific length. + * where @value must be a specific length (i.e. + * not NUL-terminated). * @m: the seq_file handle * @name: the mount option name * @value: the mount option name's value, cannot be NULL - * @length: the length of @value to display + * @length: the exact length of @value to display, must be a constant expression * * This is a macro since this uses "length" to define the size of the * stack buffer.
*/ #define seq_show_option_n(m, name, value, length) { \ char val_buf[length + 1]; \ - strncpy(val_buf, value, length); \ + memcpy(val_buf, value, length); \ val_buf[length] = '\0'; \ seq_show_option(m, name, val_buf); \ } diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 6d58c57acdaa..a156d2ed8d9e 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -459,7 +459,8 @@ struct uart_port { struct serial_rs485 *rs485); int (*iso7816_config)(struct uart_port *, struct serial_iso7816 *iso7816); - int ctrl_id; /* optional serial core controller id */ + unsigned int ctrl_id; /* optional serial core controller id */ + unsigned int port_id; /* optional serial core port id */ unsigned int irq; /* irq number */ unsigned long irqflags; /* irq flags */ unsigned int uartclk; /* base uart clock */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 9029abd29b1c..6b0c626620f5 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -13,6 +13,10 @@ /* inode in-kernel data */ +#ifdef CONFIG_TMPFS_QUOTA +#define SHMEM_MAXQUOTAS 2 +#endif + struct shmem_inode_info { spinlock_t lock; unsigned int seals; /* shmem seals */ @@ -27,6 +31,10 @@ struct shmem_inode_info { atomic_t stop_eviction; /* hold when working on inode */ struct timespec64 i_crtime; /* file creation time */ unsigned int fsflags; /* flags for FS_IOC_[SG]ETFLAGS */ +#ifdef CONFIG_TMPFS_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; +#endif + struct offset_ctx dir_offsets; /* stable entry offsets */ struct inode vfs_inode; }; @@ -35,11 +43,18 @@ struct shmem_inode_info { (FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL) #define SHMEM_FL_INHERITED (FS_NODUMP_FL | FS_NOATIME_FL) +struct shmem_quota_limits { + qsize_t usrquota_bhardlimit; /* Default user quota block hard limit */ + qsize_t usrquota_ihardlimit; /* Default user quota inode hard limit */ + qsize_t grpquota_bhardlimit; /* Default group quota block hard limit */ + qsize_t grpquota_ihardlimit; /* Default group quota inode hard limit */ +}; + struct shmem_sb_info { unsigned long max_blocks; /* How many blocks are allowed */ struct percpu_counter used_blocks; /* How many are allocated */ unsigned long max_inodes; /* How many inodes are allowed */ - unsigned long free_inodes; /* How many are left for allocation */ + unsigned long free_ispace; /* How much ispace left for allocation */ raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ umode_t mode; /* Mount mode for root directory */ unsigned char huge; /* Whether to try for hugepages */ @@ -53,6 +68,7 @@ struct shmem_sb_info { spinlock_t shrinklist_lock; /* Protects shrinklist */ struct list_head shrinklist; /* List of shrinkable inodes */ unsigned long shrinklist_len; /* Length of shrinklist */ + struct shmem_quota_limits qlimits; /* Default quota limits */ }; static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) @@ -172,4 +188,17 @@ extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd, #endif /* CONFIG_SHMEM */ #endif /* CONFIG_USERFAULTFD */ +/* + * Used space is stored as unsigned 64-bit value in bytes but + * quota core supports only signed 64-bit values so use that + * as a limit + */ +#define SHMEM_QUOTA_MAX_SPC_LIMIT 0x7fffffffffffffffLL /* 2^63-1 */ +#define SHMEM_QUOTA_MAX_INO_LIMIT 0x7fffffffffffffffLL + +#ifdef CONFIG_TMPFS_QUOTA +extern const struct dquot_operations shmem_quota_operations; +extern struct quota_format_type shmem_quota_format; +#endif /* CONFIG_TMPFS_QUOTA */ + #endif diff --git a/include/linux/skmsg.h
b/include/linux/skmsg.h index 054d7911bfc9..c1637515a8a4 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -62,6 +62,7 @@ struct sk_psock_progs { enum sk_psock_state_bits { SK_PSOCK_TX_ENABLED, + SK_PSOCK_RX_STRP_ENABLED, }; struct sk_psock_link { diff --git a/include/linux/spi/corgi_lcd.h b/include/linux/spi/corgi_lcd.h index 0b857616919c..fc6c1515dc54 100644 --- a/include/linux/spi/corgi_lcd.h +++ b/include/linux/spi/corgi_lcd.h @@ -15,4 +15,6 @@ struct corgi_lcd_platform_data { void (*kick_battery)(void); }; +void corgi_lcd_limit_intensity(int limit); + #endif /* __LINUX_SPI_CORGI_LCD_H */ diff --git a/include/linux/spi/spi-mem.h b/include/linux/spi/spi-mem.h index 8e984d75f5b6..6b0a7dc48a4b 100644 --- a/include/linux/spi/spi-mem.h +++ b/include/linux/spi/spi-mem.h @@ -101,6 +101,7 @@ struct spi_mem_op { u8 nbytes; u8 buswidth; u8 dtr : 1; + u8 __pad : 7; u16 opcode; } cmd; @@ -108,6 +109,7 @@ struct spi_mem_op { u8 nbytes; u8 buswidth; u8 dtr : 1; + u8 __pad : 7; u64 val; } addr; @@ -115,12 +117,14 @@ struct spi_mem_op { u8 nbytes; u8 buswidth; u8 dtr : 1; + u8 __pad : 7; } dummy; struct { u8 buswidth; u8 dtr : 1; u8 ecc : 1; + u8 __pad : 6; enum spi_mem_data_dir dir; unsigned int nbytes; union { diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index ebd72491af99..447133171d95 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -48,6 +48,10 @@ void srcu_drive_gp(struct work_struct *wp); #define DEFINE_STATIC_SRCU(name) \ static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name) +// Dummy structure for srcu_notifier_head. +struct srcu_usage { }; +#define __SRCU_USAGE_INIT(name) { } + void synchronize_srcu(struct srcu_struct *ssp); /* diff --git a/include/linux/swait.h b/include/linux/swait.h index 6a8c22b8c2a5..d324419482a0 100644 --- a/include/linux/swait.h +++ b/include/linux/swait.h @@ -146,7 +146,7 @@ static inline bool swq_has_sleeper(struct swait_queue_head *wq) extern void swake_up_one(struct swait_queue_head *q); extern void swake_up_all(struct swait_queue_head *q); -extern void swake_up_locked(struct swait_queue_head *q); +extern void swake_up_locked(struct swait_queue_head *q, int wake_flags); extern void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state); extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 03e3d0121d5e..c0cb22cd607d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -284,22 +284,6 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event) #endif /* - * Called before coming back to user-mode. Returning to user-mode with an - * address limit different than USER_DS can allow to overwrite kernel memory. - */ -static inline void addr_limit_user_check(void) -{ -#ifdef TIF_FSCHECK - if (!test_thread_flag(TIF_FSCHECK)) - return; -#endif - -#ifdef TIF_FSCHECK - clear_thread_flag(TIF_FSCHECK); -#endif -} - -/* * These syscall function prototypes are kept in the same order as * include/uapi/asm-generic/unistd.h. Architecture specific entries go below, * followed by deprecated or obsolete system calls. 
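A note on the spi-mem hunk above: the new __pad bitfields plausibly exist so that every bit of each flag byte is a named member. With no unnamed bits left in the byte, an initializer is guaranteed to zero them, so the byte's contents are fully determined, which matters if ops are ever compared or hashed as raw bytes. The following standalone C sketch demonstrates that property with a simplified stand-in struct, not the real spi_mem_op:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* simplified stand-in for the cmd sub-struct; 3 bytes, no padding */
struct cmd {
	uint8_t nbytes;
	uint8_t buswidth;
	uint8_t dtr : 1;
	uint8_t __pad : 7;	/* names the remaining 7 bits of the byte */
};

int main(void)
{
	struct cmd a = { .nbytes = 1, .dtr = 1 };
	struct cmd b = { .nbytes = 1, .dtr = 1 };

	/* every bit is a named member, so both objects are bit-identical */
	printf("memcmp: %d\n", memcmp(&a, &b, sizeof(a)));
	return 0;
}

Without the named __pad, the upper seven bits of that byte are padding with unspecified content, and a byte-wise comparison of two otherwise-equal ops could spuriously differ.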
@@ -438,8 +422,10 @@ asmlinkage long sys_chdir(const char __user *filename); asmlinkage long sys_fchdir(unsigned int fd); asmlinkage long sys_chroot(const char __user *filename); asmlinkage long sys_fchmod(unsigned int fd, umode_t mode); -asmlinkage long sys_fchmodat(int dfd, const char __user * filename, +asmlinkage long sys_fchmodat(int dfd, const char __user *filename, umode_t mode); +asmlinkage long sys_fchmodat2(int dfd, const char __user *filename, + umode_t mode, unsigned int flags); asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group); diff --git a/include/linux/torture.h b/include/linux/torture.h index 7038104463e4..bb466eec01e4 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -108,12 +108,15 @@ bool torture_must_stop(void); bool torture_must_stop_irq(void); void torture_kthread_stopping(char *title); int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, - char *f, struct task_struct **tp); + char *f, struct task_struct **tp, void (*cbf)(struct task_struct *tp)); void _torture_stop_kthread(char *m, struct task_struct **tp); #define torture_create_kthread(n, arg, tp) \ _torture_create_kthread(n, (arg), #n, "Creating " #n " task", \ - "Failed to create " #n, &(tp)) + "Failed to create " #n, &(tp), NULL) +#define torture_create_kthread_cb(n, arg, tp, cbf) \ + _torture_create_kthread(n, (arg), #n, "Creating " #n " task", \ + "Failed to create " #n, &(tp), cbf) #define torture_stop_kthread(n, tp) \ _torture_stop_kthread("Stopping " #n " task", &(tp)) diff --git a/include/linux/tpm.h b/include/linux/tpm.h index 6a1e8f157255..4ee9d13749ad 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -283,6 +283,7 @@ enum tpm_chip_flags { TPM_CHIP_FLAG_FIRMWARE_POWER_MANAGED = BIT(6), TPM_CHIP_FLAG_FIRMWARE_UPGRADE = BIT(7), TPM_CHIP_FLAG_SUSPENDED = BIT(8), + TPM_CHIP_FLAG_HWRNG_DISABLED = BIT(9), }; #define to_tpm_chip(d) container_of(d, struct tpm_chip, dev) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 3930e676436c..1e8bbdb8da90 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -59,6 +59,17 @@ int trace_raw_output_prep(struct trace_iterator *iter, extern __printf(2, 3) void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...); +/* Used to find the offset and length of dynamic fields in trace events */ +struct trace_dynamic_info { +#ifdef CONFIG_CPU_BIG_ENDIAN + u16 offset; + u16 len; +#else + u16 len; + u16 offset; +#endif +}; + /* * The trace entry - the most basic unit of tracing. 
This is what * is printed in the end as a single line in the trace output, such as: diff --git a/include/linux/uio.h b/include/linux/uio.h index ff81e5ccaef2..42bce38a8e87 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -163,7 +163,7 @@ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs) return ret; } -size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, +size_t copy_page_from_iter_atomic(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); void iov_iter_advance(struct iov_iter *i, size_t bytes); void iov_iter_revert(struct iov_iter *i, size_t bytes); @@ -184,6 +184,13 @@ static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset, { return copy_page_to_iter(&folio->page, offset, bytes, i); } + +static inline size_t copy_folio_from_iter_atomic(struct folio *folio, + size_t offset, size_t bytes, struct iov_iter *i) +{ + return copy_page_from_iter_atomic(&folio->page, offset, bytes, i); +} + size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i); diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index bdf8de2cdd93..7b4dd69555e4 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -155,6 +155,10 @@ retry: if (gso_type & SKB_GSO_UDP) nh_off -= thlen; + /* Kernel has special handling for GSO_BY_FRAGS. */ + if (gso_size == GSO_BY_FRAGS) + return -EINVAL; + /* Too small packets are not really GSO ones. */ if (skb->len - nh_off > gso_size) { shinfo->gso_size = gso_size; diff --git a/include/linux/wait.h b/include/linux/wait.h index a0307b516b09..5ec7739400f4 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -210,6 +210,7 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq } int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); +void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, unsigned int mode, void *key, wait_queue_entry_t *bookmark); @@ -237,6 +238,8 @@ void __wake_up_pollfree(struct wait_queue_head *wq_head); #define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m)) #define wake_up_poll(x, m) \ __wake_up(x, TASK_NORMAL, 1, poll_to_key(m)) +#define wake_up_poll_on_current_cpu(x, m) \ + __wake_up_on_current_cpu(x, TASK_NORMAL, poll_to_key(m)) #define wake_up_locked_poll(x, m) \ __wake_up_locked_key((x), TASK_NORMAL, poll_to_key(m)) #define wake_up_interruptible_poll(x, m) \ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index fba937999fbf..083387c00f0c 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -375,11 +375,6 @@ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio); -void folio_account_redirty(struct folio *folio); -static inline void account_page_redirty(struct page *page) -{ - folio_account_redirty(page_folio(page)); -} bool folio_redirty_for_writepage(struct writeback_control *, struct folio *); bool redirty_page_for_writepage(struct writeback_control *, struct page *); diff --git a/include/linux/xattr.h b/include/linux/xattr.h index d591ef59aa98..d20051865800 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -114,13 +114,15 @@ struct simple_xattr { };
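On the virtio_net hunk above: GSO_BY_FRAGS is the reserved sentinel value 0xFFFF (65535) from linux/skbuff.h used internally by the GSO stack, so a guest-controlled gso_size equal to it has to be rejected before it is ever stored into shinfo->gso_size. A hedged userspace sketch of that validation pattern follows; the constant matches the kernel's, but the helper itself is illustrative:

#include <stdint.h>
#include <stdio.h>

#define GSO_BY_FRAGS 0xFFFF	/* kernel sentinel from linux/skbuff.h */

/* illustrative stand-in for the check added to virtio_net_hdr parsing */
static int validate_gso_size(uint16_t gso_size)
{
	if (gso_size == GSO_BY_FRAGS)
		return -1;	/* the kernel returns -EINVAL here */
	return 0;
}

int main(void)
{
	printf("%d\n", validate_gso_size(1448));		/* 0: accepted */
	printf("%d\n", validate_gso_size(GSO_BY_FRAGS));	/* -1: rejected */
	return 0;
}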
void simple_xattrs_init(struct simple_xattrs *xattrs); -void simple_xattrs_free(struct simple_xattrs *xattrs); +void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space); +size_t simple_xattr_space(const char *name, size_t size); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); +void simple_xattr_free(struct simple_xattr *xattr); int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size); -int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, - const void *value, size_t size, int flags, - ssize_t *removed_size); +struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, + const char *name, const void *value, + size_t size, int flags); ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size); void simple_xattr_add(struct simple_xattrs *xattrs, diff --git a/include/net/bonding.h b/include/net/bonding.h index 30ac427cf0c6..5b8b1b644a2d 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -722,23 +722,14 @@ static inline struct slave *bond_slave_has_mac(struct bonding *bond, } /* Caller must hold rcu_read_lock() for read */ -static inline bool bond_slave_has_mac_rx(struct bonding *bond, const u8 *mac) +static inline bool bond_slave_has_mac_rcu(struct bonding *bond, const u8 *mac) { struct list_head *iter; struct slave *tmp; - struct netdev_hw_addr *ha; bond_for_each_slave_rcu(bond, tmp, iter) if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr)) return true; - - if (netdev_uc_empty(bond->dev)) - return false; - - netdev_for_each_uc_addr(ha, bond->dev) - if (ether_addr_equal_64bits(mac, ha->addr)) - return true; - return false; } diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7c7d03aa9d06..d6fa7c8767ad 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -562,6 +562,9 @@ ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband, if (WARN_ON(iftype >= NL80211_IFTYPE_MAX)) return NULL; + if (iftype == NL80211_IFTYPE_AP_VLAN) + iftype = NL80211_IFTYPE_AP; + for (i = 0; i < sband->n_iftype_data; i++) { const struct ieee80211_sband_iftype_data *data = &sband->iftype_data[i]; diff --git a/include/net/gro.h b/include/net/gro.h index 75efa6fb8441..88644b3ca660 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -452,6 +452,49 @@ static inline void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, gro_normal_list(napi); } +/* This function is an alternative to the 'inet_iif' and 'inet_sdif' + * functions in case we cannot rely on fields of IPCB. + * + * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized. + * The caller must hold the RCU read lock. + */ +static inline void inet_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif) +{ + *iif = inet_iif(skb) ?: skb->dev->ifindex; + *sdif = 0; + +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (netif_is_l3_slave(skb->dev)) { + struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev); + + *sdif = *iif; + *iif = master ? master->ifindex : 0; + } +#endif +} + +/* This function is an alternative to the 'inet6_iif' and 'inet6_sdif' + * functions in case we cannot rely on fields of IP6CB. + * + * The caller must verify skb_valid_dst(skb) is false and skb->dev is initialized. + * The caller must hold the RCU read lock.
+ */ +static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int *sdif) +{ + /* using skb->dev->ifindex because skb_dst(skb) is not initialized */ + *iif = skb->dev->ifindex; + *sdif = 0; + +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (netif_is_l3_slave(skb->dev)) { + struct net_device *master = netdev_master_upper_dev_get_rcu(skb->dev); + + *sdif = *iif; + *iif = master ? master->ifindex : 0; + } +#endif +} + extern struct list_head offload_base; #endif /* _NET_IPV6_GRO_H */ diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index caa20a905531..491ceb7ebe5d 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -107,11 +107,12 @@ static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb) { - if (!sk->sk_mark && - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept)) + u32 mark = READ_ONCE(sk->sk_mark); + + if (!mark && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept)) return skb->mark; - return sk->sk_mark; + return mark; } static inline int inet_request_bound_dev_if(const struct sock *sk, @@ -221,8 +222,8 @@ struct inet_sock { __s16 uc_ttl; __u16 cmsg_flags; struct ip_options_rcu __rcu *inet_opt; + atomic_t inet_id; __be16 inet_sport; - __u16 inet_id; __u8 tos; __u8 min_ttl; diff --git a/include/net/ip.h b/include/net/ip.h index 50d435855ae2..19adacd5ece0 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -93,7 +93,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, { ipcm_init(ipcm); - ipcm->sockc.mark = inet->sk.sk_mark; + ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark); ipcm->sockc.tsflags = inet->sk.sk_tsflags; ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); ipcm->addr = inet->inet_saddr; @@ -538,8 +538,19 @@ static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb, * generator as much as we can. */ if (sk && inet_sk(sk)->inet_daddr) { - iph->id = htons(inet_sk(sk)->inet_id); - inet_sk(sk)->inet_id += segs; + int val; + + /* avoid atomic operations for TCP, + * as we hold socket lock at this point. + */ + if (sk_is_tcp(sk)) { + sock_owned_by_me(sk); + val = atomic_read(&inet_sk(sk)->inet_id); + atomic_set(&inet_sk(sk)->inet_id, val + segs); + } else { + val = atomic_add_return(segs, &inet_sk(sk)->inet_id); + } + iph->id = htons(val); return; } if ((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) { diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 3a8a2d2c58c3..2a55ae932c56 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -6612,6 +6612,7 @@ void ieee80211_stop_rx_ba_session(struct ieee80211_vif *vif, u16 ba_rx_bitmap, * marks frames marked in the bitmap as having been filtered. Afterwards, it * checks if any frames in the window starting from @ssn can now be released * (in case they were only waiting for frames that were filtered.) 
+ * (Only works correctly if @max_rx_aggregation_subframes <= 64 frames) */ void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid, u16 ssn, u64 filtered, diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 640441a2f926..dd40c75011d2 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -512,6 +512,7 @@ struct nft_set_elem_expr { * * @list: table set list node * @bindings: list of set bindings + * @refs: internal refcounting for async set destruction * @table: table this set belongs to * @net: netnamespace this set belongs to * @name: name of the set @@ -533,6 +534,7 @@ struct nft_set_elem_expr { * @expr: stateful expression * @ops: set ops * @flags: set flags + * @dead: set will be freed, never cleared * @genmask: generation mask * @klen: key length * @dlen: data length @@ -541,6 +543,7 @@ struct nft_set_elem_expr { struct nft_set { struct list_head list; struct list_head bindings; + refcount_t refs; struct nft_table *table; possible_net_t net; char *name; @@ -562,7 +565,8 @@ struct nft_set { struct list_head pending_update; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; - u16 flags:14, + u16 flags:13, + dead:1, genmask:2; u8 klen; u8 dlen; @@ -583,6 +587,11 @@ static inline void *nft_set_priv(const struct nft_set *set) return (void *)set->data; } +static inline bool nft_set_gc_is_pending(const struct nft_set *s) +{ + return refcount_read(&s->refs) != 1; +} + static inline struct nft_set *nft_set_container_of(const void *priv) { return (void *)priv - offsetof(struct nft_set, data); @@ -596,7 +605,6 @@ struct nft_set *nft_set_lookup_global(const struct net *net, struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, const struct nft_set *set); -void *nft_set_catchall_gc(const struct nft_set *set); static inline unsigned long nft_set_gc_interval(const struct nft_set *set) { @@ -813,62 +821,6 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, const struct nft_set *set, void *elem); -/** - * struct nft_set_gc_batch_head - nf_tables set garbage collection batch - * - * @rcu: rcu head - * @set: set the elements belong to - * @cnt: count of elements - */ -struct nft_set_gc_batch_head { - struct rcu_head rcu; - const struct nft_set *set; - unsigned int cnt; -}; - -#define NFT_SET_GC_BATCH_SIZE ((PAGE_SIZE - \ - sizeof(struct nft_set_gc_batch_head)) / \ - sizeof(void *)) - -/** - * struct nft_set_gc_batch - nf_tables set garbage collection batch - * - * @head: GC batch head - * @elems: garbage collection elements - */ -struct nft_set_gc_batch { - struct nft_set_gc_batch_head head; - void *elems[NFT_SET_GC_BATCH_SIZE]; -}; - -struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, - gfp_t gfp); -void nft_set_gc_batch_release(struct rcu_head *rcu); - -static inline void nft_set_gc_batch_complete(struct nft_set_gc_batch *gcb) -{ - if (gcb != NULL) - call_rcu(&gcb->head.rcu, nft_set_gc_batch_release); -} - -static inline struct nft_set_gc_batch * -nft_set_gc_batch_check(const struct nft_set *set, struct nft_set_gc_batch *gcb, - gfp_t gfp) -{ - if (gcb != NULL) { - if (gcb->head.cnt + 1 < ARRAY_SIZE(gcb->elems)) - return gcb; - nft_set_gc_batch_complete(gcb); - } - return nft_set_gc_batch_alloc(set, gfp); -} - -static inline void nft_set_gc_batch_add(struct nft_set_gc_batch *gcb, - void *elem) -{ - gcb->elems[gcb->head.cnt++] = elem; -} - struct nft_expr_ops; /** * 
struct nft_expr_type - nf_tables expression type @@ -1557,39 +1509,30 @@ static inline void nft_set_elem_change_active(const struct net *net, #endif /* IS_ENABLED(CONFIG_NF_TABLES) */ -/* - * We use a free bit in the genmask field to indicate the element - * is busy, meaning it is currently being processed either by - * the netlink API or GC. - * - * Even though the genmask is only a single byte wide, this works - * because the extension structure if fully constant once initialized, - * so there are no non-atomic write accesses unless it is already - * marked busy. - */ -#define NFT_SET_ELEM_BUSY_MASK (1 << 2) +#define NFT_SET_ELEM_DEAD_MASK (1 << 2) #if defined(__LITTLE_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_BUSY_BIT 2 +#define NFT_SET_ELEM_DEAD_BIT 2 #elif defined(__BIG_ENDIAN_BITFIELD) -#define NFT_SET_ELEM_BUSY_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) +#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) #else #error #endif -static inline int nft_set_elem_mark_busy(struct nft_set_ext *ext) +static inline void nft_set_elem_dead(struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); - return test_and_set_bit(NFT_SET_ELEM_BUSY_BIT, word); + set_bit(NFT_SET_ELEM_DEAD_BIT, word); } -static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) +static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext) { unsigned long *word = (unsigned long *)ext; - clear_bit(NFT_SET_ELEM_BUSY_BIT, word); + BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); + return test_bit(NFT_SET_ELEM_DEAD_BIT, word); } /** @@ -1732,6 +1675,38 @@ struct nft_trans_flowtable { #define nft_trans_flowtable_flags(trans) \ (((struct nft_trans_flowtable *)trans->data)->flags) +#define NFT_TRANS_GC_BATCHCOUNT 256 + +struct nft_trans_gc { + struct list_head list; + struct net *net; + struct nft_set *set; + u32 seq; + u8 count; + void *priv[NFT_TRANS_GC_BATCHCOUNT]; + struct rcu_head rcu; +}; + +struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, + unsigned int gc_seq, gfp_t gfp); +void nft_trans_gc_destroy(struct nft_trans_gc *trans); + +struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, + unsigned int gc_seq, gfp_t gfp); +void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc); + +struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp); +void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); + +void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv); + +struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, + unsigned int gc_seq); + +void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, + struct nft_set_elem *elem); + int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); @@ -1758,6 +1733,8 @@ struct nftables_pernet { struct mutex commit_mutex; u64 table_handle; unsigned int base_seq; + unsigned int gc_seq; + u8 validate_state; }; extern unsigned int nf_tables_net_id; diff --git a/include/net/route.h b/include/net/route.h index 5a5c726472bd..8c2a8e7d8f8e 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -168,7 +168,7 @@ static inline struct rtable *ip_route_output_ports(struct net *net, struct flowi __be16 dport, __be16 sport, __u8 proto, __u8 tos, int oif) { - flowi4_init_output(fl4, oif, sk ? sk->sk_mark : 0, tos, + flowi4_init_output(fl4, oif, sk ? READ_ONCE(sk->sk_mark) : 0, tos, RT_SCOPE_UNIVERSE, proto, sk ? 
inet_sk_flowi_flags(sk) : 0, daddr, saddr, dport, sport, sock_net_uid(net, sk)); @@ -301,7 +301,7 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, if (inet_sk(sk)->transparent) flow_flags |= FLOWI_FLAG_ANYSRC; - flowi4_init_output(fl4, oif, sk->sk_mark, ip_sock_rt_tos(sk), + flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), protocol, flow_flags, dst, src, dport, sport, sk->sk_uid); } diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index d9076a7a430c..6506221c5fe3 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -190,8 +190,8 @@ int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr * int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, u32 portid, const struct nlmsghdr *nlh); -int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len, - struct netlink_ext_ack *exterr); +int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, + struct netlink_ext_ack *exterr); struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid); #define MODULE_ALIAS_RTNL_LINK(kind) MODULE_ALIAS("rtnl-link-" kind) diff --git a/include/net/sock.h b/include/net/sock.h index 2eb916d1ff64..690e22139543 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1323,6 +1323,7 @@ struct proto { /* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. + * Make sure to use READ_ONCE()/WRITE_ONCE() for all reads/writes. * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ @@ -1420,6 +1421,12 @@ static inline bool sk_has_memory_pressure(const struct sock *sk) return sk->sk_prot->memory_pressure != NULL; } +static inline bool sk_under_global_memory_pressure(const struct sock *sk) +{ + return sk->sk_prot->memory_pressure && + !!READ_ONCE(*sk->sk_prot->memory_pressure); +} + static inline bool sk_under_memory_pressure(const struct sock *sk) { if (!sk->sk_prot->memory_pressure) @@ -1429,7 +1436,7 @@ static inline bool sk_under_memory_pressure(const struct sock *sk) mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; - return !!*sk->sk_prot->memory_pressure; + return !!READ_ONCE(*sk->sk_prot->memory_pressure); } static inline long @@ -1506,7 +1513,7 @@ proto_memory_pressure(struct proto *prot) { if (!prot->memory_pressure) return false; - return !!*prot->memory_pressure; + return !!READ_ONCE(*prot->memory_pressure); } diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 1648240c9668..6a9f8a5f387c 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -556,12 +556,12 @@ static inline void vxlan_flag_attr_error(int attrtype, } static inline bool vxlan_fdb_nh_path_select(struct nexthop *nh, - int hash, + u32 hash, struct vxlan_rdst *rdst) { struct fib_nh_common *nhc; - nhc = nexthop_path_fdb_result(nh, hash); + nhc = nexthop_path_fdb_result(nh, hash >> 1); if (unlikely(!nhc)) return false; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 151ca95dd08d..363c7d510554 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1984,6 +1984,7 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x) if (dev->xfrmdev_ops->xdo_dev_state_free) dev->xfrmdev_ops->xdo_dev_state_free(x); xso->dev = NULL; + xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; netdev_put(dev, &xso->dev_tracker); } } diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 75b2235b99e2..b9230b6add04 
100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -194,6 +194,7 @@ struct scsi_device { unsigned no_start_on_add:1; /* do not issue start on add */ unsigned allow_restart:1; /* issue START_UNIT in error handler */ unsigned manage_start_stop:1; /* Let HLD (sd) manage start/stop */ + unsigned no_start_on_resume:1; /* Do not issue START_STOP_UNIT on resume */ unsigned start_stop_pwr_cond:1; /* Set power cond. in START_STOP_UNIT */ unsigned no_uld_attach:1; /* disable connecting to upper level drivers */ unsigned select_no_atn:1; diff --git a/include/soc/tegra/mc.h b/include/soc/tegra/mc.h index fc3001483e62..a5ef84944a06 100644 --- a/include/soc/tegra/mc.h +++ b/include/soc/tegra/mc.h @@ -175,6 +175,9 @@ struct tegra_mc_icc_ops { int (*get_bw)(struct icc_node *node, u32 *avg, u32 *peak); }; +struct icc_node *tegra_mc_icc_xlate(struct of_phandle_args *spec, void *data); +extern const struct tegra_mc_icc_ops tegra_mc_icc_ops; + struct tegra_mc_ops { /* * @probe: Callback to set up SoC-specific bits of the memory controller. This is called diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index a8206f5332e9..b2db2c2f1c57 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -38,7 +38,6 @@ struct find_free_extent_ctl; __print_symbolic(type, \ { BTRFS_TREE_BLOCK_REF_KEY, "TREE_BLOCK_REF" }, \ { BTRFS_EXTENT_DATA_REF_KEY, "EXTENT_DATA_REF" }, \ - { BTRFS_EXTENT_REF_V0_KEY, "EXTENT_REF_V0" }, \ { BTRFS_SHARED_BLOCK_REF_KEY, "SHARED_BLOCK_REF" }, \ { BTRFS_SHARED_DATA_REF_KEY, "SHARED_DATA_REF" }) @@ -2482,7 +2481,7 @@ DECLARE_EVENT_CLASS(btrfs_raid56_bio, __entry->offset, __entry->opf, __entry->physical, __entry->len) ); -DEFINE_EVENT(btrfs_raid56_bio, raid56_read_partial, +DEFINE_EVENT(btrfs_raid56_bio, raid56_read, TP_PROTO(const struct btrfs_raid_bio *rbio, const struct bio *bio, const struct raid56_bio_trace_info *trace_info), @@ -2490,32 +2489,7 @@ DEFINE_EVENT(btrfs_raid56_bio, raid56_read_partial, TP_ARGS(rbio, bio, trace_info) ); -DEFINE_EVENT(btrfs_raid56_bio, raid56_write_stripe, - TP_PROTO(const struct btrfs_raid_bio *rbio, - const struct bio *bio, - const struct raid56_bio_trace_info *trace_info), - - TP_ARGS(rbio, bio, trace_info) -); - - -DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_write_stripe, - TP_PROTO(const struct btrfs_raid_bio *rbio, - const struct bio *bio, - const struct raid56_bio_trace_info *trace_info), - - TP_ARGS(rbio, bio, trace_info) -); - -DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_read, - TP_PROTO(const struct btrfs_raid_bio *rbio, - const struct bio *bio, - const struct raid56_bio_trace_info *trace_info), - - TP_ARGS(rbio, bio, trace_info) -); - -DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_read_recover, +DEFINE_EVENT(btrfs_raid56_bio, raid56_write, TP_PROTO(const struct btrfs_raid_bio *rbio, const struct bio *bio, const struct raid56_bio_trace_info *trace_info), diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h index 71dbe8bfa7db..e18684b02c3d 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -80,11 +80,11 @@ TRACE_EVENT(erofs_fill_inode, __entry->blkaddr, __entry->ofs) ); -TRACE_EVENT(erofs_readpage, +TRACE_EVENT(erofs_read_folio, - TP_PROTO(struct page *page, bool raw), + TP_PROTO(struct folio *folio, bool raw), - TP_ARGS(page, raw), + TP_ARGS(folio, raw), TP_STRUCT__entry( __field(dev_t, dev ) @@ -96,11 +96,11 @@ TRACE_EVENT(erofs_readpage, ), TP_fast_assign( - __entry->dev = page->mapping->host->i_sb->s_dev; - __entry->nid = 
EROFS_I(page->mapping->host)->nid; - __entry->dir = S_ISDIR(page->mapping->host->i_mode); - __entry->index = page->index; - __entry->uptodate = PageUptodate(page); + __entry->dev = folio->mapping->host->i_sb->s_dev; + __entry->nid = EROFS_I(folio->mapping->host)->nid; + __entry->dir = S_ISDIR(folio->mapping->host->i_mode); + __entry->index = folio->index; + __entry->uptodate = folio_test_uptodate(folio); __entry->raw = raw; ), diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index bf06db8d2046..7b1ddffa3dfc 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -381,6 +381,7 @@ TRACE_EVENT(tcp_cong_state_set, __field(const void *, skaddr) __field(__u16, sport) __field(__u16, dport) + __field(__u16, family) __array(__u8, saddr, 4) __array(__u8, daddr, 4) __array(__u8, saddr_v6, 16) @@ -396,6 +397,7 @@ TRACE_EVENT(tcp_cong_state_set, __entry->sport = ntohs(inet->inet_sport); __entry->dport = ntohs(inet->inet_dport); + __entry->family = sk->sk_family; p32 = (__be32 *) __entry->saddr; *p32 = inet->inet_saddr; @@ -409,7 +411,8 @@ TRACE_EVENT(tcp_cong_state_set, __entry->cong_state = ca_state; ), - TP_printk("sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c cong_state=%u", + TP_printk("family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c cong_state=%u", + show_family_name(__entry->family), __entry->sport, __entry->dport, __entry->saddr, __entry->daddr, __entry->saddr_v6, __entry->daddr_v6, diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index fd6c1cb585db..abe087c53b4b 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -820,8 +820,11 @@ __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) #define __NR_cachestat 451 __SYSCALL(__NR_cachestat, sys_cachestat) +#define __NR_fchmodat2 452 +__SYSCALL(__NR_fchmodat2, sys_fchmodat2) + #undef __NR_syscalls -#define __NR_syscalls 452 +#define __NR_syscalls 453 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index ab38d0f411fa..fc3c32186d7e 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -220,7 +220,11 @@ #define BTRFS_EXTENT_DATA_REF_KEY 178 -#define BTRFS_EXTENT_REF_V0_KEY 180 +/* + * Obsolete key. Definition removed in 6.6, value may be reused in the future.
+ * + * #define BTRFS_EXTENT_REF_V0_KEY 180 + */ #define BTRFS_SHARED_BLOCK_REF_KEY 182 diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 0c8cf359ea5b..e0e159138331 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -443,7 +443,6 @@ typedef struct elf64_shdr { #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ #define NT_MIPS_FP_MODE 0x801 /* MIPS floating-point mode */ #define NT_MIPS_MSA 0x802 /* MIPS SIMD registers */ -#define NT_RISCV_VECTOR 0x900 /* RISC-V vector registers */ #define NT_LOONGARCH_CPUCFG 0xa00 /* LoongArch CPU config registers */ #define NT_LOONGARCH_CSR 0xa01 /* LoongArch control and status registers */ #define NT_LOONGARCH_LSX 0xa02 /* LoongArch Loongson SIMD Extension registers */ diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 8eb0d7b758d2..bb242fdcfe6b 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -100,8 +100,9 @@ enum fsconfig_command { FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by path */ FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ - FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ + FSCONFIG_CMD_CREATE = 6, /* Create new or reuse existing superblock */ FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ + FSCONFIG_CMD_CREATE_EXCL = 8, /* Create new superblock, fail if reusing existing superblock */ }; /* diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 7865f5a9885b..4f3932bb712d 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -710,9 +710,11 @@ enum { TCA_FLOWER_KEY_CFM_OPT_UNSPEC, TCA_FLOWER_KEY_CFM_MD_LEVEL, TCA_FLOWER_KEY_CFM_OPCODE, - TCA_FLOWER_KEY_CFM_OPT_MAX, + __TCA_FLOWER_KEY_CFM_OPT_MAX, }; +#define TCA_FLOWER_KEY_CFM_OPT_MAX (__TCA_FLOWER_KEY_CFM_OPT_MAX - 1) + #define TCA_FLOWER_MASK_FLAGS_RANGE (1 << 0) /* Range-based match */ /* Match-all classifier */ diff --git a/include/uapi/linux/quota.h b/include/uapi/linux/quota.h index f17c9636a859..52090105b828 100644 --- a/include/uapi/linux/quota.h +++ b/include/uapi/linux/quota.h @@ -77,6 +77,7 @@ #define QFMT_VFS_V0 2 #define QFMT_OCFS2 3 #define QFMT_VFS_V1 4 +#define QFMT_SHMEM 5 /* Size of block in which space limits are passed through the quota * interface */ diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 0fdc6ef02b94..dbfc9b37fcae 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -115,6 +115,8 @@ struct seccomp_notif_resp { __u32 flags; }; +#define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0) + /* valid flags for seccomp_notif_addfd */ #define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */ #define SECCOMP_ADDFD_FLAG_SEND (1UL << 1) /* Addfd and return it, atomically */ @@ -150,4 +152,6 @@ struct seccomp_notif_addfd { #define SECCOMP_IOCTL_NOTIF_ADDFD SECCOMP_IOW(3, \ struct seccomp_notif_addfd) +#define SECCOMP_IOCTL_NOTIF_SET_FLAGS SECCOMP_IOW(4, __u64) + #endif /* _UAPI_LINUX_SECCOMP_H */ diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h index 7837ba4fe728..7c3fc3980881 100644 --- a/include/uapi/linux/stddef.h +++ b/include/uapi/linux/stddef.h @@ -45,3 +45,7 @@ TYPE NAME[]; \ } #endif + +#ifndef __counted_by +#define __counted_by(m) +#endif diff --git a/include/video/kyro.h b/include/video/kyro.h index b958c2e9c915..418eef6c5523 100644 --- a/include/video/kyro.h +++ 
b/include/video/kyro.h @@ -38,18 +38,6 @@ struct kyrofb_info { int wc_cookie; }; -extern int kyro_dev_init(void); -extern void kyro_dev_reset(void); - -extern unsigned char *kyro_dev_physical_fb_ptr(void); -extern unsigned char *kyro_dev_virtual_fb_ptr(void); -extern void *kyro_dev_physical_regs_ptr(void); -extern void *kyro_dev_virtual_regs_ptr(void); -extern unsigned int kyro_dev_fb_size(void); -extern unsigned int kyro_dev_regs_size(void); - -extern u32 kyro_dev_overlay_offset(void); - /* * benedict.gaster@superh.com * Added the following IOCTLs for the creation of overlay services...
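One addition above worth a closing note: the empty __counted_by(m) fallback added to linux/stddef.h lets UAPI headers annotate a flexible-array member with the field holding its element count on compilers that understand the counted_by attribute, while expanding to nothing everywhere else. A hedged sketch of how such an annotation reads, using a made-up struct rather than any real UAPI type:

#include <stdio.h>
#include <stdlib.h>

#ifndef __counted_by
#define __counted_by(m)		/* no-op fallback, as in the stddef.h hunk */
#endif

struct item_list {
	unsigned int nr;		/* number of valid entries in items[] */
	int items[] __counted_by(nr);	/* bounds-checkable where supported */
};

int main(void)
{
	struct item_list *l = malloc(sizeof(*l) + 4 * sizeof(int));

	if (!l)
		return 1;
	l->nr = 4;
	for (unsigned int i = 0; i < l->nr; i++)
		l->items[i] = (int)(i * i);
	printf("last=%d\n", l->items[l->nr - 1]);
	free(l);
	return 0;
}

With a compiler that implements counted_by (recent clang; gcc gained it later), out-of-bounds accesses to items[] relative to nr can be caught by fortification and sanitizer checks; with the fallback the annotation simply disappears and the code compiles as plain C.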