author    Ingo Molnar <mingo@kernel.org>  2025-03-25 10:17:25 +0300
committer Ingo Molnar <mingo@kernel.org>  2025-03-25 10:17:25 +0300
commit    2487b6b9bf2874cfca7efb59c95650c5b1d88d43 (patch)
tree      8bd7a32686e006b1cbeeb80511114c63ac442c87 /include/linux
parent    c8c81458863ab686cda4fe1e603fccaae0f12460 (diff)
parent    2df0c02dab829dd89360d98a8a1abaa026ef5798 (diff)
download  linux-2487b6b9bf2874cfca7efb59c95650c5b1d88d43.tar.xz
Merge branch 'linus' into x86/urgent, to pick up fixes and refresh the branch
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'include/linux')
-rw-r--r--  include/linux/binfmts.h | 2
-rw-r--r--  include/linux/bitmap.h | 8
-rw-r--r--  include/linux/bits.h | 2
-rw-r--r--  include/linux/blkdev.h | 8
-rw-r--r--  include/linux/cgroup-defs.h | 5
-rw-r--r--  include/linux/cgroup.h | 3
-rw-r--r--  include/linux/compiler.h | 18
-rw-r--r--  include/linux/compiler_types.h | 23
-rw-r--r--  include/linux/cpumask.h | 71
-rw-r--r--  include/linux/cpuset.h | 11
-rw-r--r--  include/linux/damon.h | 5
-rw-r--r--  include/linux/dcache.h | 39
-rw-r--r--  include/linux/energy_model.h | 2
-rw-r--r--  include/linux/eventpoll.h | 4
-rw-r--r--  include/linux/fanotify.h | 12
-rw-r--r--  include/linux/file_ref.h | 48
-rw-r--r--  include/linux/fs.h | 62
-rw-r--r--  include/linux/fs_context.h | 2
-rw-r--r--  include/linux/fscrypt.h | 12
-rw-r--r--  include/linux/fsnotify.h | 20
-rw-r--r--  include/linux/fsnotify_backend.h | 42
-rw-r--r--  include/linux/idr.h | 17
-rw-r--r--  include/linux/interrupt.h | 16
-rw-r--r--  include/linux/iomap.h | 116
-rw-r--r--  include/linux/kexec.h | 2
-rw-r--r--  include/linux/key.h | 1
-rw-r--r--  include/linux/kstrtox.h | 1
-rw-r--r--  include/linux/libata.h | 2
-rw-r--r--  include/linux/misc_cgroup.h | 6
-rw-r--r--  include/linux/mm.h | 10
-rw-r--r--  include/linux/mnt_idmapping.h | 5
-rw-r--r--  include/linux/namei.h | 45
-rw-r--r--  include/linux/nfs_xdr.h | 2
-rw-r--r--  include/linux/nmi.h | 4
-rw-r--r--  include/linux/nodemask.h | 8
-rw-r--r--  include/linux/nodemask_types.h | 11
-rw-r--r--  include/linux/numa.h | 17
-rw-r--r--  include/linux/objpool.h | 7
-rw-r--r--  include/linux/page-flags.h | 18
-rw-r--r--  include/linux/pagemap.h | 48
-rw-r--r--  include/linux/percpu-rwsem.h | 8
-rw-r--r--  include/linux/perf_event.h | 92
-rw-r--r--  include/linux/pid.h | 7
-rw-r--r--  include/linux/pidfs.h | 1
-rw-r--r--  include/linux/pipe_fs_i.h | 2
-rw-r--r--  include/linux/preempt.h | 2
-rw-r--r--  include/linux/printk.h | 6
-rw-r--r--  include/linux/proc_fs.h | 7
-rw-r--r--  include/linux/rcupdate.h | 58
-rw-r--r--  include/linux/rcupdate_wait.h | 3
-rw-r--r--  include/linux/rcutiny.h | 36
-rw-r--r--  include/linux/rcutree.h | 5
-rw-r--r--  include/linux/sched.h | 7
-rw-r--r--  include/linux/sched/deadline.h | 4
-rw-r--r--  include/linux/sched/debug.h | 2
-rw-r--r--  include/linux/sched/ext.h | 1
-rw-r--r--  include/linux/sched/idle.h | 23
-rw-r--r--  include/linux/sched/mm.h | 7
-rw-r--r--  include/linux/sched/topology.h | 14
-rw-r--r--  include/linux/seccomp.h | 12
-rw-r--r--  include/linux/slab.h | 16
-rw-r--r--  include/linux/srcu.h | 102
-rw-r--r--  include/linux/srcutiny.h | 29
-rw-r--r--  include/linux/srcutree.h | 98
-rw-r--r--  include/linux/string.h | 16
-rw-r--r--  include/linux/string_choices.h | 24
-rw-r--r--  include/linux/swap_cgroup.h | 4
-rw-r--r--  include/linux/syscalls.h | 8
-rw-r--r--  include/linux/sysv_fs.h | 214
-rw-r--r--  include/linux/thread_info.h | 48
-rw-r--r--  include/linux/topology.h | 30
-rw-r--r--  include/linux/torture.h | 1
-rw-r--r--  include/linux/uaccess.h | 2
-rw-r--r--  include/linux/ucopysize.h | 63
-rw-r--r--  include/linux/uidgid.h | 6
-rw-r--r--  include/linux/uio.h | 2
-rw-r--r--  include/linux/uprobes.h | 3
-rw-r--r--  include/linux/vfsdebug.h | 45
-rw-r--r--  include/linux/vmcore_info.h | 3
-rw-r--r--  include/linux/wait.h | 3
80 files changed, 1057 insertions, 692 deletions
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 60d674af3080..1625c8529e70 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -64,7 +64,7 @@ struct linux_binprm {
const char *fdpath; /* generated filename for execveat */
unsigned interp_flags;
int execfd; /* File descriptor of the executable */
- unsigned long loader, exec;
+ unsigned long exec;
struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 2026953e2c4e..595217b7a6e7 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -560,9 +560,9 @@ void bitmap_replace(unsigned long *dst,
* ...0..11...0..10
* dst: 0000001100000010
*
- * A relationship exists between bitmap_scatter() and bitmap_gather().
+ * A relationship exists between bitmap_scatter() and bitmap_gather(). See
+ * bitmap_gather() for the bitmap gather detailed operations. TL;DR:
* bitmap_gather() can be seen as the 'reverse' bitmap_scatter() operation.
- * See bitmap_scatter() for details related to this relationship.
*/
static __always_inline
void bitmap_scatter(unsigned long *dst, const unsigned long *src,
@@ -608,7 +608,9 @@ void bitmap_scatter(unsigned long *dst, const unsigned long *src,
* dst: 0000000000011010
*
* A relationship exists between bitmap_gather() and bitmap_scatter(). See
- * bitmap_scatter() for the bitmap scatter detailed operations.
+ * bitmap_scatter() for the bitmap scatter detailed operations. TL;DR:
+ * bitmap_scatter() can be seen as the 'reverse' bitmap_gather() operation.
+ *
* Suppose scattered computed using bitmap_scatter(scattered, src, mask, n).
* The operation bitmap_gather(result, scattered, mask, n) leads to a result
* equal or equivalent to src.
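
As a worked illustration of the relationship documented above, a minimal sketch (kernel context assumed; the mask and source values are arbitrary):

#include <linux/bitmap.h>

/* Sketch: bitmap_gather() undoes bitmap_scatter() over the same mask. */
static void bitmap_scatter_gather_demo(void)
{
	DECLARE_BITMAP(src, 16)       = { 0x0302 };
	DECLARE_BITMAP(mask, 16)      = { 0x1313 };
	DECLARE_BITMAP(scattered, 16) = { 0 };
	DECLARE_BITMAP(result, 16)    = { 0 };

	bitmap_scatter(scattered, src, mask, 16);
	bitmap_gather(result, scattered, mask, 16);
	/* result now equals src in its low bitmap_weight(mask, 16) bits */
}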
diff --git a/include/linux/bits.h b/include/linux/bits.h
index 61a75d3f294b..14fd0ca9a6cd 100644
--- a/include/linux/bits.h
+++ b/include/linux/bits.h
@@ -40,7 +40,7 @@
* Missing asm support
*
* __GENMASK_U128() depends on _BIT128() which would not work
- * in the asm code, as it shifts an 'unsigned __init128' data
+ * in the asm code, as it shifts an 'unsigned __int128' data
* type instead of direct representation of 128 bit constants
* such as long and unsigned long. The fundamental problem is
* that a 128 bit constant will get silently truncated by the
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d37751789bf5..1c0cf6af392c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -268,10 +268,16 @@ static inline dev_t disk_devt(struct gendisk *disk)
return MKDEV(disk->major, disk->first_minor);
}
+/*
+ * We should strive for 1 << (PAGE_SHIFT + MAX_PAGECACHE_ORDER);
+ * however, we constrain this to what we can validate and test.
+ */
+#define BLK_MAX_BLOCK_SIZE SZ_64K
+
/* blk_validate_limits() validates bsize, so drivers don't usually need to */
static inline int blk_validate_block_size(unsigned long bsize)
{
- if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize))
+ if (bsize < 512 || bsize > BLK_MAX_BLOCK_SIZE || !is_power_of_2(bsize))
return -EINVAL;
return 0;
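
For instance, a block driver validating a configurable logical block size against the new cap might do the following (a sketch; the surrounding driver is hypothetical):

#include <linux/blkdev.h>

/* Sketch: accept only power-of-2 sizes in [512, BLK_MAX_BLOCK_SIZE]. */
static int demo_set_block_size(struct gendisk *disk, unsigned long bsize)
{
	int ret = blk_validate_block_size(bsize);

	if (ret)
		return ret;	/* -EINVAL: out of range or not a power of 2 */
	/* ... propagate bsize into the queue limits ... */
	return 0;
}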
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 17960a1e858d..485b651869d9 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -619,9 +619,8 @@ struct cgroup_root {
*/
struct cftype {
/*
- * By convention, the name should begin with the name of the
- * subsystem, followed by a period. Zero length string indicates
- * end of cftype array.
+ * Name of the subsystem is prepended in cgroup_file_name().
+ * Zero length string indicates end of cftype array.
*/
char name[MAX_CFTYPE_NAME];
unsigned long private;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f8ef47f8a634..28e999f2c642 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -113,6 +113,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
+int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_rm_cftypes(struct cftype *cfts);
void cgroup_file_notify(struct cgroup_file *cfile);
void cgroup_file_show(struct cgroup_file *cfile, bool show);
@@ -689,8 +690,6 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
*/
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
void cgroup_rstat_flush(struct cgroup *cgrp);
-void cgroup_rstat_flush_hold(struct cgroup *cgrp);
-void cgroup_rstat_flush_release(struct cgroup *cgrp);
/*
* Basic resource stats.
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index e80d0f04d78f..9fc30b6b80c9 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -206,9 +206,25 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
#define __must_be_byte_array(a) __BUILD_BUG_ON_ZERO_MSG(!__is_byte_array(a), \
"must be byte array")
+/*
+ * If the "nonstring" attribute isn't available, we have to return true
+ * so the __must_*() checks pass when "nonstring" isn't supported.
+ */
+#if __has_attribute(__nonstring__) && defined(__annotated)
+#define __is_cstr(a) (!__annotated(a, nonstring))
+#define __is_noncstr(a) (__annotated(a, nonstring))
+#else
+#define __is_cstr(a) (true)
+#define __is_noncstr(a) (true)
+#endif
+
/* Require C Strings (i.e. NUL-terminated) lack the "nonstring" attribute. */
#define __must_be_cstr(p) \
- __BUILD_BUG_ON_ZERO_MSG(__annotated(p, nonstring), "must be cstr (NUL-terminated)")
+ __BUILD_BUG_ON_ZERO_MSG(!__is_cstr(p), \
+ "must be C-string (NUL-terminated)")
+#define __must_be_noncstr(p) \
+ __BUILD_BUG_ON_ZERO_MSG(!__is_noncstr(p), \
+ "must be non-C-string (not NUL-terminated)")
#endif /* __KERNEL__ */
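
A sketch of how the new check pairs with the __nonstring attribute (illustrative names; the check degrades to a no-op where the attribute or __builtin_has_attribute is unavailable, as described above):

#include <linux/compiler.h>
#include <linux/string.h>

/* A fixed-size tag that is deliberately not NUL-terminated. */
static const char demo_tag[4] __nonstring = "ABCD";

static void demo_copy_tag(char *buf)
{
	/* Passes: the source is annotated nonstring. */
	(void)__must_be_noncstr(demo_tag);

	/*
	 * (void)__must_be_cstr(demo_tag); would fail the build wherever
	 * the annotation can actually be checked.
	 */
	memcpy(buf, demo_tag, sizeof(demo_tag));
}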
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 981cc3d7e3aa..e09d323be845 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -349,6 +349,18 @@ struct ftrace_likely_data {
#endif
/*
+ * Optional: only supported since gcc >= 15
+ * Optional: not supported by Clang
+ *
+ * gcc: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117178
+ */
+#ifdef CONFIG_CC_HAS_MULTIDIMENSIONAL_NONSTRING
+# define __nonstring_array __attribute__((__nonstring__))
+#else
+# define __nonstring_array
+#endif
+
+/*
* Apply __counted_by() when the Endianness matches to increase test coverage.
*/
#ifdef __LITTLE_ENDIAN
@@ -360,7 +372,7 @@ struct ftrace_likely_data {
#endif
/* Do not trap wrapping arithmetic within an annotated function. */
-#ifdef CONFIG_UBSAN_SIGNED_WRAP
+#ifdef CONFIG_UBSAN_INTEGER_WRAP
# define __signed_wrap __attribute__((no_sanitize("signed-integer-overflow")))
#else
# define __signed_wrap
@@ -446,11 +458,14 @@ struct ftrace_likely_data {
#define __member_size(p) __builtin_object_size(p, 1)
#endif
-/* Determine if an attribute has been applied to a variable. */
+/*
+ * Determine if an attribute has been applied to a variable.
+ * Using __annotated needs to check for __annotated being available,
+ * or negative tests may fail when annotation cannot be checked. For
+ * example, see the definition of __is_cstr().
+ */
#if __has_builtin(__builtin_has_attribute)
#define __annotated(var, attr) __builtin_has_attribute(var, attr)
-#else
-#define __annotated(var, attr) (false)
#endif
/*
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 36a890d0dd57..f9a868384083 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -81,7 +81,7 @@ static __always_inline void set_nr_cpu_ids(unsigned int nr)
*
* cpu_possible_mask- has bit 'cpu' set iff cpu is populatable
* cpu_present_mask - has bit 'cpu' set iff cpu is populated
- * cpu_enabled_mask - has bit 'cpu' set iff cpu can be brought online
+ * cpu_enabled_mask - has bit 'cpu' set iff cpu can be brought online
* cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler
* cpu_active_mask - has bit 'cpu' set iff cpu available to migration
*
@@ -285,35 +285,52 @@ unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
}
/**
- * for_each_cpu - iterate over every cpu in a mask
- * @cpu: the (optionally unsigned) integer iterator
- * @mask: the cpumask pointer
+ * cpumask_next_and_wrap - get the next cpu in *src1p & *src2p, starting from
+ * @n+1. If nothing found, wrap around and start from
+ * the beginning
+ * @n: the cpu prior to the place to search (i.e. search starts from @n+1)
+ * @src1p: the first cpumask pointer
+ * @src2p: the second cpumask pointer
*
- * After the loop, cpu is >= nr_cpu_ids.
+ * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src1p & @src2p is empty.
*/
-#define for_each_cpu(cpu, mask) \
- for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)
-
-#if NR_CPUS == 1
static __always_inline
-unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap)
+unsigned int cpumask_next_and_wrap(int n, const struct cpumask *src1p,
+ const struct cpumask *src2p)
{
- cpumask_check(start);
+ /* -1 is a legal arg here. */
if (n != -1)
cpumask_check(n);
+ return find_next_and_bit_wrap(cpumask_bits(src1p), cpumask_bits(src2p),
+ small_cpumask_bits, n + 1);
+}
- /*
- * Return the first available CPU when wrapping, or when starting before cpu0,
- * since there is only one valid option.
- */
- if (wrap && n >= 0)
- return nr_cpumask_bits;
-
- return cpumask_first(mask);
+/**
+ * cpumask_next_wrap - get the next cpu in *src, starting from @n+1. If nothing
+ * found, wrap around and start from the beginning
+ * @n: the cpu prior to the place to search (i.e. search starts from @n+1)
+ * @src: cpumask pointer
+ *
+ * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src is empty.
+ */
+static __always_inline
+unsigned int cpumask_next_wrap(int n, const struct cpumask *src)
+{
+ /* -1 is a legal arg here. */
+ if (n != -1)
+ cpumask_check(n);
+ return find_next_bit_wrap(cpumask_bits(src), small_cpumask_bits, n + 1);
}
-#else
-unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
-#endif
+
+/**
+ * for_each_cpu - iterate over every cpu in a mask
+ * @cpu: the (optionally unsigned) integer iterator
+ * @mask: the cpumask pointer
+ *
+ * After the loop, cpu is >= nr_cpu_ids.
+ */
+#define for_each_cpu(cpu, mask) \
+ for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)
/**
* for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
@@ -1033,11 +1050,21 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS);
#define for_each_possible_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++)
#define for_each_online_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++)
#define for_each_present_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++)
+
+#define for_each_possible_cpu_wrap(cpu, start) \
+ for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++)
+#define for_each_online_cpu_wrap(cpu, start) \
+ for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++)
#else
#define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask)
#define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask)
#define for_each_enabled_cpu(cpu) for_each_cpu((cpu), cpu_enabled_mask)
#define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask)
+
+#define for_each_possible_cpu_wrap(cpu, start) \
+ for_each_cpu_wrap((cpu), cpu_possible_mask, (start))
+#define for_each_online_cpu_wrap(cpu, start) \
+ for_each_cpu_wrap((cpu), cpu_online_mask, (start))
#endif
/* Wrappers for arch boot code to manipulate normally-constant masks */
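
A sketch of the simplified wrap helper in use, e.g. for round-robin CPU selection (prev is hypothetical caller state):

#include <linux/cpumask.h>
#include <linux/smp.h>

/* Sketch: pick the next online CPU after @prev, wrapping past the end. */
static unsigned int demo_pick_next_cpu(int prev)
{
	unsigned int cpu = cpumask_next_wrap(prev, cpu_online_mask);

	if (cpu >= nr_cpu_ids)		/* mask unexpectedly empty */
		cpu = raw_smp_processor_id();
	return cpu;
}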
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 835e7b793f6a..5466c96a33db 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -125,9 +125,11 @@ static inline int cpuset_do_page_mem_spread(void)
extern bool current_cpuset_is_being_rebound(void);
+extern void dl_rebuild_rd_accounting(void);
extern void rebuild_sched_domains(void);
extern void cpuset_print_current_mems_allowed(void);
+extern void cpuset_reset_sched_domains(void);
/*
* read_mems_allowed_begin is required when making decisions involving
@@ -259,11 +261,20 @@ static inline bool current_cpuset_is_being_rebound(void)
return false;
}
+static inline void dl_rebuild_rd_accounting(void)
+{
+}
+
static inline void rebuild_sched_domains(void)
{
partition_sched_domains(1, NULL, NULL);
}
+static inline void cpuset_reset_sched_domains(void)
+{
+ partition_sched_domains(1, NULL, NULL);
+}
+
static inline void cpuset_print_current_mems_allowed(void)
{
}
diff --git a/include/linux/damon.h b/include/linux/damon.h
index af525252b853..c9074d569596 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -470,6 +470,11 @@ struct damos {
unsigned long next_apply_sis;
/* informs if ongoing DAMOS walk for this scheme is finished */
bool walk_completed;
+ /*
+ * If the current region in the filtering stage is allowed by core
+ * layer-handled filters. If true, operations layer allows it, too.
+ */
+ bool core_filters_allowed;
/* public: */
struct damos_quota quota;
struct damos_watermarks wmarks;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 4afb60365675..45bff10d3773 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -203,34 +203,34 @@ struct dentry_operations {
#define DCACHE_NFSFS_RENAMED BIT(12)
/* this dentry has been "silly renamed" and has to be deleted on the last
* dput() */
-#define DCACHE_FSNOTIFY_PARENT_WATCHED BIT(14)
+#define DCACHE_FSNOTIFY_PARENT_WATCHED BIT(13)
/* Parent inode is watched by some fsnotify listener */
-#define DCACHE_DENTRY_KILLED BIT(15)
+#define DCACHE_DENTRY_KILLED BIT(14)
-#define DCACHE_MOUNTED BIT(16) /* is a mountpoint */
-#define DCACHE_NEED_AUTOMOUNT BIT(17) /* handle automount on this dir */
-#define DCACHE_MANAGE_TRANSIT BIT(18) /* manage transit from this dirent */
+#define DCACHE_MOUNTED BIT(15) /* is a mountpoint */
+#define DCACHE_NEED_AUTOMOUNT BIT(16) /* handle automount on this dir */
+#define DCACHE_MANAGE_TRANSIT BIT(17) /* manage transit from this dirent */
#define DCACHE_MANAGED_DENTRY \
(DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT)
-#define DCACHE_LRU_LIST BIT(19)
+#define DCACHE_LRU_LIST BIT(18)
-#define DCACHE_ENTRY_TYPE (7 << 20) /* bits 20..22 are for storing type: */
-#define DCACHE_MISS_TYPE (0 << 20) /* Negative dentry */
-#define DCACHE_WHITEOUT_TYPE (1 << 20) /* Whiteout dentry (stop pathwalk) */
-#define DCACHE_DIRECTORY_TYPE (2 << 20) /* Normal directory */
-#define DCACHE_AUTODIR_TYPE (3 << 20) /* Lookupless directory (presumed automount) */
-#define DCACHE_REGULAR_TYPE (4 << 20) /* Regular file type */
-#define DCACHE_SPECIAL_TYPE (5 << 20) /* Other file type */
-#define DCACHE_SYMLINK_TYPE (6 << 20) /* Symlink */
+#define DCACHE_ENTRY_TYPE (7 << 19) /* bits 19..21 are for storing type: */
+#define DCACHE_MISS_TYPE (0 << 19) /* Negative dentry */
+#define DCACHE_WHITEOUT_TYPE (1 << 19) /* Whiteout dentry (stop pathwalk) */
+#define DCACHE_DIRECTORY_TYPE (2 << 19) /* Normal directory */
+#define DCACHE_AUTODIR_TYPE (3 << 19) /* Lookupless directory (presumed automount) */
+#define DCACHE_REGULAR_TYPE (4 << 19) /* Regular file type */
+#define DCACHE_SPECIAL_TYPE (5 << 19) /* Other file type */
+#define DCACHE_SYMLINK_TYPE (6 << 19) /* Symlink */
-#define DCACHE_NOKEY_NAME BIT(25) /* Encrypted name encoded without key */
-#define DCACHE_OP_REAL BIT(26)
+#define DCACHE_NOKEY_NAME BIT(22) /* Encrypted name encoded without key */
+#define DCACHE_OP_REAL BIT(23)
-#define DCACHE_PAR_LOOKUP BIT(28) /* being looked up (with parent locked shared) */
-#define DCACHE_DENTRY_CURSOR BIT(29)
-#define DCACHE_NORCU BIT(30) /* No RCU delay for freeing */
+#define DCACHE_PAR_LOOKUP BIT(24) /* being looked up (with parent locked shared) */
+#define DCACHE_DENTRY_CURSOR BIT(25)
+#define DCACHE_NORCU BIT(26) /* No RCU delay for freeing */
extern seqlock_t rename_lock;
@@ -253,7 +253,6 @@ extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
extern bool d_same_name(const struct dentry *dentry, const struct dentry *parent,
const struct qstr *name);
-extern struct dentry * d_exact_alias(struct dentry *, struct inode *);
extern struct dentry *d_find_any_alias(struct inode *inode);
extern struct dentry * d_obtain_alias(struct inode *);
extern struct dentry * d_obtain_root(struct inode *);
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 78318d49276d..65efc0f5ea2e 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -240,9 +240,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
struct em_perf_state *ps;
int i;
-#ifdef CONFIG_SCHED_DEBUG
WARN_ONCE(!rcu_read_lock_held(), "EM: rcu read lock needed\n");
-#endif
if (!sum_util)
return 0;
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 0c0d00fcd131..ccb478eb174b 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -25,6 +25,10 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long t
/* Used to release the epoll bits inside the "struct file" */
void eventpoll_release_file(struct file *file);
+/* Copy ready events to userspace */
+int epoll_sendevents(struct file *file, struct epoll_event __user *events,
+ int maxevents);
+
/*
* This is called from inside fs/file_table.c:__fput() to unlink files
* from the eventpoll interface. We need to have this facility to cleanup
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 78f660ebc318..3c817dc6292e 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -25,7 +25,7 @@
#define FANOTIFY_FID_BITS (FAN_REPORT_DFID_NAME_TARGET)
-#define FANOTIFY_INFO_MODES (FANOTIFY_FID_BITS | FAN_REPORT_PIDFD)
+#define FANOTIFY_INFO_MODES (FANOTIFY_FID_BITS | FAN_REPORT_PIDFD | FAN_REPORT_MNT)
/*
* fanotify_init() flags that require CAP_SYS_ADMIN.
@@ -38,7 +38,8 @@
FAN_REPORT_PIDFD | \
FAN_REPORT_FD_ERROR | \
FAN_UNLIMITED_QUEUE | \
- FAN_UNLIMITED_MARKS)
+ FAN_UNLIMITED_MARKS | \
+ FAN_REPORT_MNT)
/*
* fanotify_init() flags that are allowed for user without CAP_SYS_ADMIN.
@@ -58,7 +59,7 @@
#define FANOTIFY_INTERNAL_GROUP_FLAGS (FANOTIFY_UNPRIV)
#define FANOTIFY_MARK_TYPE_BITS (FAN_MARK_INODE | FAN_MARK_MOUNT | \
- FAN_MARK_FILESYSTEM)
+ FAN_MARK_FILESYSTEM | FAN_MARK_MNTNS)
#define FANOTIFY_MARK_CMD_BITS (FAN_MARK_ADD | FAN_MARK_REMOVE | \
FAN_MARK_FLUSH)
@@ -109,10 +110,13 @@
/* Events that can only be reported with data type FSNOTIFY_EVENT_ERROR */
#define FANOTIFY_ERROR_EVENTS (FAN_FS_ERROR)
+#define FANOTIFY_MOUNT_EVENTS (FAN_MNT_ATTACH | FAN_MNT_DETACH)
+
/* Events that user can request to be notified on */
#define FANOTIFY_EVENTS (FANOTIFY_PATH_EVENTS | \
FANOTIFY_INODE_EVENTS | \
- FANOTIFY_ERROR_EVENTS)
+ FANOTIFY_ERROR_EVENTS | \
+ FANOTIFY_MOUNT_EVENTS)
/* Extra flags that may be reported with event or control handling of events */
#define FANOTIFY_EVENT_FLAGS (FAN_EVENT_ON_CHILD | FAN_ONDIR)
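
From userspace, the new mount-namespace marks would be requested roughly as follows (a sketch assuming uapi headers that already define FAN_REPORT_MNT, FAN_MARK_MNTNS and the FAN_MNT_* events; error handling trimmed):

#include <fcntl.h>
#include <sys/fanotify.h>

/* Sketch: watch a mount namespace fd for mount attach/detach events. */
static int demo_watch_mntns(int mntns_fd)
{
	int fd = fanotify_init(FAN_REPORT_MNT | FAN_CLASS_NOTIF, 0);

	if (fd < 0)
		return -1;
	/* With FAN_MARK_MNTNS, the "dirfd" argument is a mount namespace fd. */
	if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_MNTNS,
			  FAN_MNT_ATTACH | FAN_MNT_DETACH, mntns_fd, NULL) < 0)
		return -1;
	return fd;
}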
diff --git a/include/linux/file_ref.h b/include/linux/file_ref.h
index 9b3a8d9b17ab..7db62fbc0500 100644
--- a/include/linux/file_ref.h
+++ b/include/linux/file_ref.h
@@ -61,6 +61,7 @@ static inline void file_ref_init(file_ref_t *ref, unsigned long cnt)
atomic_long_set(&ref->refcnt, cnt - 1);
}
+bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt);
bool __file_ref_put(file_ref_t *ref, unsigned long cnt);
/**
@@ -161,6 +162,39 @@ static __always_inline __must_check bool file_ref_put(file_ref_t *ref)
}
/**
+ * file_ref_put_close - drop a reference expecting it would transition to FILE_REF_NOREF
+ * @ref: Pointer to the reference count
+ *
+ * Semantically it is equivalent to calling file_ref_put(), but it trades lower
+ * performance in face of other CPUs also modifying the refcount for higher
+ * performance when this happens to be the last reference.
+ *
+ * For the last reference file_ref_put() issues 2 atomics. One to drop the
+ * reference and another to transition it to FILE_REF_DEAD. This routine does
+ * the work in one step, but in order to do so it has to pre-read the variable,
+ * which decreases scalability.
+ *
+ * Use with close() et al.; stick to file_ref_put() by default.
+ */
+static __always_inline __must_check bool file_ref_put_close(file_ref_t *ref)
+{
+ long old, new;
+
+ old = atomic_long_read(&ref->refcnt);
+ do {
+ if (unlikely(old < 0))
+ return __file_ref_put_badval(ref, old);
+
+ if (old == FILE_REF_ONEREF)
+ new = FILE_REF_DEAD;
+ else
+ new = old - 1;
+ } while (!atomic_long_try_cmpxchg(&ref->refcnt, &old, new));
+
+ return new == FILE_REF_DEAD;
+}
+
+/**
* file_ref_read - Read the number of file references
* @ref: Pointer to the reference count
*
@@ -174,4 +208,18 @@ static inline unsigned long file_ref_read(file_ref_t *ref)
return c >= FILE_REF_RELEASED ? 0 : c + 1;
}
+/*
+ * __file_ref_read_raw - Return the value stored in ref->refcnt
+ * @ref: Pointer to the reference count
+ *
+ * Return: The raw value found in the counter
+ *
+ * A hack for file_needs_f_pos_lock(); you probably want to use
+ * file_ref_read() instead.
+ */
+static inline unsigned long __file_ref_read_raw(file_ref_t *ref)
+{
+ return atomic_long_read(&ref->refcnt);
+}
+
#endif
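
A sketch of the intended split between the two put variants (f_ref as in struct file; free_file() is a hypothetical final-release helper):

/* Sketch: close() uses the one-step variant, everything else does not. */
static void demo_close(struct file *f)
{
	if (file_ref_put_close(&f->f_ref))
		free_file(f);		/* hypothetical final release */
}

static void demo_fput(struct file *f)
{
	if (file_ref_put(&f->f_ref))
		free_file(f);
}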
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2788df98080f..1a0e23a5d02d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_FS_H
#define _LINUX_FS_H
+#include <linux/vfsdebug.h>
#include <linux/linkage.h>
#include <linux/wait_bit.h>
#include <linux/kdev_t.h>
@@ -790,19 +791,8 @@ struct inode {
static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen)
{
- int testlen;
-
- /*
- * TODO: patch it into a debug-only check if relevant macros show up.
- * In the meantime, since we are suffering strlen even on production kernels
- * to find the right length, do a fixup if the wrong value got passed.
- */
- testlen = strlen(link);
- if (testlen != linklen) {
- WARN_ONCE(1, "bad length passed for symlink [%s] (got %d, expected %d)",
- link, linklen, testlen);
- linklen = testlen;
- }
+ VFS_WARN_ON_INODE(strlen(link) != linklen, inode);
+ VFS_WARN_ON_INODE(inode->i_opflags & IOP_CACHED_LINK, inode);
inode->i_link = link;
inode->i_linklen = linklen;
inode->i_opflags |= IOP_CACHED_LINK;
@@ -1067,7 +1057,6 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
/**
* struct file - Represents a file
- * @f_ref: reference count
* @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context.
* @f_mode: FMODE_* flags often used in hotpaths
* @f_op: file operations
@@ -1077,12 +1066,12 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
* @f_flags: file flags
* @f_iocb_flags: iocb flags
* @f_cred: stashed credentials of creator/opener
+ * @f_owner: file owner
* @f_path: path of the file
* @f_pos_lock: lock protecting file position
* @f_pipe: specific to pipes
* @f_pos: file position
* @f_security: LSM security context of this file
- * @f_owner: file owner
* @f_wb_err: writeback error
* @f_sb_err: per sb writeback errors
* @f_ep: link of all epoll hooks for this file
@@ -1090,9 +1079,9 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
* @f_llist: work queue entrypoint
* @f_ra: file's readahead state
* @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.)
+ * @f_ref: reference count
*/
struct file {
- file_ref_t f_ref;
spinlock_t f_lock;
fmode_t f_mode;
const struct file_operations *f_op;
@@ -1102,6 +1091,7 @@ struct file {
unsigned int f_flags;
unsigned int f_iocb_flags;
const struct cred *f_cred;
+ struct fown_struct *f_owner;
/* --- cacheline 1 boundary (64 bytes) --- */
struct path f_path;
union {
@@ -1115,7 +1105,6 @@ struct file {
void *f_security;
#endif
/* --- cacheline 2 boundary (128 bytes) --- */
- struct fown_struct *f_owner;
errseq_t f_wb_err;
errseq_t f_sb_err;
#ifdef CONFIG_EPOLL
@@ -1127,6 +1116,7 @@ struct file {
struct file_ra_state f_ra;
freeptr_t f_freeptr;
};
+ file_ref_t f_ref;
/* --- cacheline 3 boundary (192 bytes) --- */
} __randomize_layout
__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
@@ -1981,8 +1971,8 @@ bool inode_owner_or_capable(struct mnt_idmap *idmap,
*/
int vfs_create(struct mnt_idmap *, struct inode *,
struct dentry *, umode_t, bool);
-int vfs_mkdir(struct mnt_idmap *, struct inode *,
- struct dentry *, umode_t);
+struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *,
+ struct dentry *, umode_t);
int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *,
umode_t, dev_t);
int vfs_symlink(struct mnt_idmap *, struct inode *,
@@ -2039,7 +2029,7 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group);
int vfs_fchmod(struct file *file, umode_t mode);
int vfs_utimes(const struct path *path, struct timespec64 *times);
-extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
#ifdef CONFIG_COMPAT
extern long compat_ptr_ioctl(struct file *file, unsigned int cmd,
@@ -2211,8 +2201,8 @@ struct inode_operations {
int (*unlink) (struct inode *,struct dentry *);
int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *,
const char *);
- int (*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *,
- umode_t);
+ struct dentry *(*mkdir) (struct mnt_idmap *, struct inode *,
+ struct dentry *, umode_t);
int (*rmdir) (struct inode *,struct dentry *);
int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *,
umode_t,dev_t);
@@ -2616,6 +2606,7 @@ struct file_system_type {
#define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */
#define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */
#define FS_MGTIME 64 /* FS uses multigrain timestamps */
+#define FS_LBS 128 /* FS supports LBS */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
int (*init_fs_context)(struct fs_context *);
const struct fs_parameter_spec *parameters;
@@ -2653,9 +2644,6 @@ static inline bool is_mgtime(const struct inode *inode)
extern struct dentry *mount_bdev(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data,
int (*fill_super)(struct super_block *, void *, int));
-extern struct dentry *mount_single(struct file_system_type *fs_type,
- int flags, void *data,
- int (*fill_super)(struct super_block *, void *, int));
extern struct dentry *mount_nodev(struct file_system_type *fs_type,
int flags, void *data,
int (*fill_super)(struct super_block *, void *, int));
@@ -2794,13 +2782,13 @@ static inline bool is_idmapped_mnt(const struct vfsmount *mnt)
return mnt_idmap(mnt) != &nop_mnt_idmap;
}
-extern long vfs_truncate(const struct path *, loff_t);
+int vfs_truncate(const struct path *, loff_t);
int do_truncate(struct mnt_idmap *, struct dentry *, loff_t start,
unsigned int time_attrs, struct file *filp);
extern int vfs_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
-extern long do_sys_open(int dfd, const char __user *filename, int flags,
- umode_t mode);
+int do_sys_open(int dfd, const char __user *filename, int flags,
+ umode_t mode);
extern struct file *file_open_name(struct filename *, int, umode_t);
extern struct file *filp_open(const char *, int, umode_t);
extern struct file *file_open_root(const struct path *,
@@ -2851,7 +2839,10 @@ extern int filp_close(struct file *, fl_owner_t id);
extern struct filename *getname_flags(const char __user *, int);
extern struct filename *getname_uflags(const char __user *, int);
-extern struct filename *getname(const char __user *);
+static inline struct filename *getname(const char __user *name)
+{
+ return getname_flags(name, 0);
+}
extern struct filename *getname_kernel(const char *);
extern struct filename *__getname_maybe_null(const char __user *);
static inline struct filename *getname_maybe_null(const char __user *name, int flags)
@@ -2864,6 +2855,13 @@ static inline struct filename *getname_maybe_null(const char __user *name, int f
return __getname_maybe_null(name);
}
extern void putname(struct filename *name);
+DEFINE_FREE(putname, struct filename *, if (!IS_ERR_OR_NULL(_T)) putname(_T))
+
+static inline struct filename *refname(struct filename *name)
+{
+ atomic_inc(&name->refcnt);
+ return name;
+}
extern int finish_open(struct file *file, struct dentry *dentry,
int (*open)(struct inode *, struct file *));
@@ -3297,7 +3295,11 @@ static inline void __iget(struct inode *inode)
extern void iget_failed(struct inode *);
extern void clear_inode(struct inode *);
extern void __destroy_inode(struct inode *);
-extern struct inode *new_inode_pseudo(struct super_block *sb);
+struct inode *alloc_inode(struct super_block *sb);
+static inline struct inode *new_inode_pseudo(struct super_block *sb)
+{
+ return alloc_inode(sb);
+}
extern struct inode *new_inode(struct super_block *sb);
extern void free_inode_nonrcu(struct inode *inode);
extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *);
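
The new DEFINE_FREE(putname, ...) class pairs with __free() from <linux/cleanup.h>; a sketch of an auto-cleaned name lookup (the syscall body is illustrative):

#include <linux/cleanup.h>
#include <linux/fs.h>

/* Sketch: the filename is put automatically on every return path. */
static int demo_lookup(const char __user *uname)
{
	struct filename *name __free(putname) = getname(uname);

	if (IS_ERR(name))
		return PTR_ERR(name);	/* IS_ERR values skip putname() */
	/* ... work with name->name ... */
	return 0;
}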
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 4b4bfef6f053..a19e4bd32e4d 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -144,8 +144,6 @@ extern void put_fs_context(struct fs_context *fc);
extern int vfs_parse_fs_param_source(struct fs_context *fc,
struct fs_parameter *param);
extern void fc_drop_locked(struct fs_context *fc);
-int reconfigure_single(struct super_block *s,
- int flags, void *data);
extern int get_tree_nodev(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
index 18855cb44b1c..56fad33043d5 100644
--- a/include/linux/fscrypt.h
+++ b/include/linux/fscrypt.h
@@ -310,10 +310,8 @@ static inline void fscrypt_prepare_dentry(struct dentry *dentry,
/* crypto.c */
void fscrypt_enqueue_decrypt_work(struct work_struct *);
-struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
- unsigned int len,
- unsigned int offs,
- gfp_t gfp_flags);
+struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio,
+ size_t len, size_t offs, gfp_t gfp_flags);
int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
unsigned int len, unsigned int offs,
u64 lblk_num, gfp_t gfp_flags);
@@ -480,10 +478,8 @@ static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work)
{
}
-static inline struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
- unsigned int len,
- unsigned int offs,
- gfp_t gfp_flags)
+static inline struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio,
+ size_t len, size_t offs, gfp_t gfp_flags)
{
return ERR_PTR(-EOPNOTSUPP);
}
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 83d3ac97f826..454d8e466958 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -320,6 +320,11 @@ static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt)
__fsnotify_vfsmount_delete(mnt);
}
+static inline void fsnotify_mntns_delete(struct mnt_namespace *mntns)
+{
+ __fsnotify_mntns_delete(mntns);
+}
+
/*
* fsnotify_inoderemove - an inode is going away
*/
@@ -528,4 +533,19 @@ static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode,
NULL, NULL, NULL, 0);
}
+static inline void fsnotify_mnt_attach(struct mnt_namespace *ns, struct vfsmount *mnt)
+{
+ fsnotify_mnt(FS_MNT_ATTACH, ns, mnt);
+}
+
+static inline void fsnotify_mnt_detach(struct mnt_namespace *ns, struct vfsmount *mnt)
+{
+ fsnotify_mnt(FS_MNT_DETACH, ns, mnt);
+}
+
+static inline void fsnotify_mnt_move(struct mnt_namespace *ns, struct vfsmount *mnt)
+{
+ fsnotify_mnt(FS_MNT_MOVE, ns, mnt);
+}
+
#endif /* _LINUX_FS_NOTIFY_H */
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 0d24a21a8e60..6cd8d1d28b8b 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -59,6 +59,10 @@
#define FS_PRE_ACCESS 0x00100000 /* Pre-content access hook */
+#define FS_MNT_ATTACH 0x01000000 /* Mount was attached */
+#define FS_MNT_DETACH 0x02000000 /* Mount was detached */
+#define FS_MNT_MOVE (FS_MNT_ATTACH | FS_MNT_DETACH)
+
/*
* Set on inode mark that cares about things that happen to its children.
* Always set for dnotify and inotify.
@@ -80,6 +84,9 @@
*/
#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME)
+/* Mount namespace events */
+#define FSNOTIFY_MNT_EVENTS (FS_MNT_ATTACH | FS_MNT_DETACH)
+
/* Content events can be used to inspect file content */
#define FSNOTIFY_CONTENT_PERM_EVENTS (FS_OPEN_PERM | FS_OPEN_EXEC_PERM | \
FS_ACCESS_PERM)
@@ -108,6 +115,7 @@
/* Events that can be reported to backends */
#define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \
+ FSNOTIFY_MNT_EVENTS | \
FS_EVENTS_POSS_ON_CHILD | \
FS_DELETE_SELF | FS_MOVE_SELF | \
FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \
@@ -298,6 +306,7 @@ enum fsnotify_data_type {
FSNOTIFY_EVENT_PATH,
FSNOTIFY_EVENT_INODE,
FSNOTIFY_EVENT_DENTRY,
+ FSNOTIFY_EVENT_MNT,
FSNOTIFY_EVENT_ERROR,
};
@@ -318,6 +327,11 @@ static inline const struct path *file_range_path(const struct file_range *range)
return range->path;
}
+struct fsnotify_mnt {
+ const struct mnt_namespace *ns;
+ u64 mnt_id;
+};
+
static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
{
switch (data_type) {
@@ -383,6 +397,24 @@ static inline struct super_block *fsnotify_data_sb(const void *data,
}
}
+static inline const struct fsnotify_mnt *fsnotify_data_mnt(const void *data,
+ int data_type)
+{
+ switch (data_type) {
+ case FSNOTIFY_EVENT_MNT:
+ return data;
+ default:
+ return NULL;
+ }
+}
+
+static inline u64 fsnotify_data_mnt_id(const void *data, int data_type)
+{
+ const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type);
+
+ return mnt_data ? mnt_data->mnt_id : 0;
+}
+
static inline struct fs_error_report *fsnotify_data_error_report(
const void *data,
int data_type)
@@ -420,6 +452,7 @@ enum fsnotify_iter_type {
FSNOTIFY_ITER_TYPE_SB,
FSNOTIFY_ITER_TYPE_PARENT,
FSNOTIFY_ITER_TYPE_INODE2,
+ FSNOTIFY_ITER_TYPE_MNTNS,
FSNOTIFY_ITER_TYPE_COUNT
};
@@ -429,6 +462,7 @@ enum fsnotify_obj_type {
FSNOTIFY_OBJ_TYPE_INODE,
FSNOTIFY_OBJ_TYPE_VFSMOUNT,
FSNOTIFY_OBJ_TYPE_SB,
+ FSNOTIFY_OBJ_TYPE_MNTNS,
FSNOTIFY_OBJ_TYPE_COUNT,
FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT
};
@@ -613,8 +647,10 @@ extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data
extern void __fsnotify_inode_delete(struct inode *inode);
extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt);
extern void fsnotify_sb_delete(struct super_block *sb);
+extern void __fsnotify_mntns_delete(struct mnt_namespace *mntns);
extern void fsnotify_sb_free(struct super_block *sb);
extern u32 fsnotify_get_cookie(void);
+extern void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt);
static inline __u32 fsnotify_parent_needed_mask(__u32 mask)
{
@@ -928,6 +964,9 @@ static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
static inline void fsnotify_sb_delete(struct super_block *sb)
{}
+static inline void __fsnotify_mntns_delete(struct mnt_namespace *mntns)
+{}
+
static inline void fsnotify_sb_free(struct super_block *sb)
{}
@@ -942,6 +981,9 @@ static inline u32 fsnotify_get_cookie(void)
static inline void fsnotify_unmount_inodes(struct super_block *sb)
{}
+static inline void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt)
+{}
+
#endif /* CONFIG_FSNOTIFY */
#endif /* __KERNEL__ */
diff --git a/include/linux/idr.h b/include/linux/idr.h
index da5f5fa4a3a6..cd729be369b3 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -15,6 +15,7 @@
#include <linux/radix-tree.h>
#include <linux/gfp.h>
#include <linux/percpu.h>
+#include <linux/cleanup.h>
struct idr {
struct radix_tree_root idr_rt;
@@ -124,6 +125,22 @@ void *idr_get_next_ul(struct idr *, unsigned long *nextid);
void *idr_replace(struct idr *, void *, unsigned long id);
void idr_destroy(struct idr *);
+struct __class_idr {
+ struct idr *idr;
+ int id;
+};
+
+#define idr_null ((struct __class_idr){ NULL, -1 })
+#define take_idr_id(id) __get_and_null(id, idr_null)
+
+DEFINE_CLASS(idr_alloc, struct __class_idr,
+ if (_T.id >= 0) idr_remove(_T.idr, _T.id),
+ ((struct __class_idr){
+ .idr = idr,
+ .id = idr_alloc(idr, ptr, start, end, gfp),
+ }),
+ struct idr *idr, void *ptr, int start, int end, gfp_t gfp);
+
/**
* idr_init_base() - Initialise an IDR.
* @idr: IDR handle.
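
A sketch of the scoped-allocation class above: the ID is removed automatically on early exit unless ownership is taken (demo names are illustrative):

#include <linux/idr.h>

static DEFINE_IDR(demo_idr);

/* Sketch: an error return undoes the allocation; take_idr_id() keeps it. */
static int demo_register(void *ptr)
{
	CLASS(idr_alloc, entry)(&demo_idr, ptr, 0, 0, GFP_KERNEL);

	if (entry.id < 0)
		return entry.id;
	/* ... any early return up to here triggers idr_remove() ... */
	return take_idr_id(entry).id;	/* disarm the cleanup, keep the ID */
}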
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 8cd9327e4e78..c782a74d2a30 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -448,7 +448,7 @@ irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
static inline void disable_irq_nosync_lockdep(unsigned int irq)
{
disable_irq_nosync(irq);
-#ifdef CONFIG_LOCKDEP
+#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT)
local_irq_disable();
#endif
}
@@ -456,22 +456,14 @@ static inline void disable_irq_nosync_lockdep(unsigned int irq)
static inline void disable_irq_nosync_lockdep_irqsave(unsigned int irq, unsigned long *flags)
{
disable_irq_nosync(irq);
-#ifdef CONFIG_LOCKDEP
+#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT)
local_irq_save(*flags);
#endif
}
-static inline void disable_irq_lockdep(unsigned int irq)
-{
- disable_irq(irq);
-#ifdef CONFIG_LOCKDEP
- local_irq_disable();
-#endif
-}
-
static inline void enable_irq_lockdep(unsigned int irq)
{
-#ifdef CONFIG_LOCKDEP
+#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT)
local_irq_enable();
#endif
enable_irq(irq);
@@ -479,7 +471,7 @@ static inline void enable_irq_lockdep(unsigned int irq)
static inline void enable_irq_lockdep_irqrestore(unsigned int irq, unsigned long *flags)
{
-#ifdef CONFIG_LOCKDEP
+#if defined(CONFIG_LOCKDEP) && !defined(CONFIG_PREEMPT_RT)
local_irq_restore(*flags);
#endif
enable_irq(irq);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 75bf54e76f3b..02fe001feebb 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -56,6 +56,13 @@ struct vm_fault;
*
* IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must
* never be merged with the mapping before it.
+ *
+ * IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block
+ * assigned to it yet and the file system will do that in the bio submission
+ * handler, splitting the I/O as needed.
+ *
+ * IOMAP_F_ATOMIC_BIO indicates that (write) I/O will be issued as an atomic
+ * bio, i.e. set REQ_ATOMIC.
*/
#define IOMAP_F_NEW (1U << 0)
#define IOMAP_F_DIRTY (1U << 1)
@@ -68,6 +75,8 @@ struct vm_fault;
#endif /* CONFIG_BUFFER_HEAD */
#define IOMAP_F_XATTR (1U << 5)
#define IOMAP_F_BOUNDARY (1U << 6)
+#define IOMAP_F_ANON_WRITE (1U << 7)
+#define IOMAP_F_ATOMIC_BIO (1U << 8)
/*
* Flags set by the core iomap code during operations:
@@ -111,6 +120,8 @@ struct iomap {
static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos)
{
+ if (iomap->flags & IOMAP_F_ANON_WRITE)
+ return U64_MAX; /* invalid */
return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
}
@@ -182,7 +193,8 @@ struct iomap_folio_ops {
#else
#define IOMAP_DAX 0
#endif /* CONFIG_FS_DAX */
-#define IOMAP_ATOMIC (1 << 9)
+#define IOMAP_ATOMIC (1 << 9) /* torn-write protection */
+#define IOMAP_DONTCACHE (1 << 10)
struct iomap_ops {
/*
@@ -211,8 +223,10 @@ struct iomap_ops {
* calls to iomap_iter(). Treat as read-only in the body.
* @len: The remaining length of the file segment we're operating on.
* It is updated at the same time as @pos.
- * @processed: The number of bytes processed by the body in the most recent
- * iteration, or a negative errno. 0 causes the iteration to stop.
+ * @iter_start_pos: The original start pos for the current iomap. Used for
+ * incremental iter advance.
+ * @status: Status of the most recent iteration. Zero on success or a negative
+ * errno on error.
* @flags: Zero or more of the iomap_begin flags above.
* @iomap: Map describing the I/O iteration
* @srcmap: Source map for COW operations
@@ -221,7 +235,8 @@ struct iomap_iter {
struct inode *inode;
loff_t pos;
u64 len;
- s64 processed;
+ loff_t iter_start_pos;
+ int status;
unsigned flags;
struct iomap iomap;
struct iomap srcmap;
@@ -229,20 +244,46 @@ struct iomap_iter {
};
int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops);
+int iomap_iter_advance(struct iomap_iter *iter, u64 *count);
/**
- * iomap_length - length of the current iomap iteration
+ * iomap_length_trim - trimmed length of the current iomap iteration
* @iter: iteration structure
+ * @pos: File position to trim from.
+ * @len: Length of the mapping to trim to.
*
- * Returns the length that the operation applies to for the current iteration.
+ * Returns a trimmed length that the operation applies to for the current
+ * iteration.
*/
-static inline u64 iomap_length(const struct iomap_iter *iter)
+static inline u64 iomap_length_trim(const struct iomap_iter *iter, loff_t pos,
+ u64 len)
{
u64 end = iter->iomap.offset + iter->iomap.length;
if (iter->srcmap.type != IOMAP_HOLE)
end = min(end, iter->srcmap.offset + iter->srcmap.length);
- return min(iter->len, end - iter->pos);
+ return min(len, end - pos);
+}
+
+/**
+ * iomap_length - length of the current iomap iteration
+ * @iter: iteration structure
+ *
+ * Returns the length that the operation applies to for the current iteration.
+ */
+static inline u64 iomap_length(const struct iomap_iter *iter)
+{
+ return iomap_length_trim(iter, iter->pos, iter->len);
+}
+
+/**
+ * iomap_iter_advance_full - advance by the full length of current map
+ */
+static inline int iomap_iter_advance_full(struct iomap_iter *iter)
+{
+ u64 length = iomap_length(iter);
+
+ return iomap_iter_advance(iter, &length);
}
/**
@@ -306,12 +347,11 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
const struct iomap_ops *ops);
int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
- bool *did_zero, const struct iomap_ops *ops);
+ bool *did_zero, const struct iomap_ops *ops, void *private);
int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
- const struct iomap_ops *ops);
-vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf,
- const struct iomap_ops *ops);
-
+ const struct iomap_ops *ops, void *private);
+vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
+ void *private);
typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length,
struct iomap *iomap);
void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
@@ -328,16 +368,42 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
const struct iomap_ops *ops);
/*
+ * Flags for iomap_ioend->io_flags.
+ */
+/* shared COW extent */
+#define IOMAP_IOEND_SHARED (1U << 0)
+/* unwritten extent */
+#define IOMAP_IOEND_UNWRITTEN (1U << 1)
+/* don't merge into previous ioend */
+#define IOMAP_IOEND_BOUNDARY (1U << 2)
+/* is direct I/O */
+#define IOMAP_IOEND_DIRECT (1U << 3)
+
+/*
+ * Flags that if set on either ioend prevent the merge of two ioends.
+ * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way)
+ */
+#define IOMAP_IOEND_NOMERGE_FLAGS \
+ (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT)
+
+/*
* Structure for writeback I/O completions.
+ *
+ * File systems implementing ->submit_ioend (for buffered I/O) or ->submit_io
+ * (for direct I/O) can split a bio generated by iomap. In that case the parent
+ * ioend it was split from is recorded in ioend->io_parent.
*/
struct iomap_ioend {
struct list_head io_list; /* next ioend in chain */
- u16 io_type;
- u16 io_flags; /* IOMAP_F_* */
+ u16 io_flags; /* IOMAP_IOEND_* */
struct inode *io_inode; /* file being written to */
- size_t io_size; /* size of data within eof */
+ size_t io_size; /* size of the extent */
+ atomic_t io_remaining; /* completion defer count */
+ int io_error; /* stashed away status */
+ struct iomap_ioend *io_parent; /* parent for completions */
loff_t io_offset; /* offset in the file */
sector_t io_sector; /* start sector of ioend */
+ void *io_private; /* file system private data */
struct bio io_bio; /* MUST BE LAST! */
};
@@ -362,12 +428,14 @@ struct iomap_writeback_ops {
loff_t offset, unsigned len);
/*
- * Optional, allows the file systems to perform actions just before
- * submitting the bio and/or override the bio end_io handler for complex
- * operations like copy on write extent manipulation or unwritten extent
- * conversions.
+ * Optional, allows the file systems to hook into bio submission,
+ * including overriding the bi_end_io handler.
+ *
+ * Returns 0 if the bio was successfully submitted, or a negative
+ * error code if status was non-zero or another error happened and
+ * the bio could not be submitted.
*/
- int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
+ int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
/*
* Optional, allows the file system to discard state on a page where
@@ -383,6 +451,10 @@ struct iomap_writepage_ctx {
u32 nr_folios; /* folios added to the ioend */
};
+struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio,
+ loff_t file_offset, u16 ioend_flags);
+struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend,
+ unsigned int max_len, bool is_append);
void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
void iomap_ioend_try_merge(struct iomap_ioend *ioend,
struct list_head *more_ioends);
@@ -454,4 +526,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
# define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO)
#endif /* CONFIG_SWAP */
+extern struct bio_set iomap_ioend_bioset;
+
#endif /* LINUX_IOMAP_H */
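
With @processed replaced by @status plus explicit advance, an iteration loop now reads roughly like this (a sketch; demo_op() stands in for the per-mapping work):

/* Sketch of the new contract: record status, advance the iter explicitly. */
static int demo_iomap_walk(struct inode *inode, loff_t pos, u64 len,
			   const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode	= inode,
		.pos	= pos,
		.len	= len,
	};
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0) {
		/* ... demo_op() would consume iomap_length(&iter) bytes ... */
		iter.status = iomap_iter_advance_full(&iter);
	}
	return ret;
}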
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index f0e9f8eda7a3..c840431eadda 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -68,8 +68,6 @@ extern note_buf_t __percpu *crash_notes;
#define KEXEC_CRASH_MEM_ALIGN PAGE_SIZE
#endif
-#define KEXEC_CORE_NOTE_NAME CRASH_CORE_NOTE_NAME
-
/*
* This structure is used to hold the arguments that are used when loading
* kernel binaries.
diff --git a/include/linux/key.h b/include/linux/key.h
index 074dca3222b9..ba05de8579ec 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -236,6 +236,7 @@ struct key {
#define KEY_FLAG_ROOT_CAN_INVAL 7 /* set if key can be invalidated by root without permission */
#define KEY_FLAG_KEEP 8 /* set if key should not be removed */
#define KEY_FLAG_UID_KEYRING 9 /* set if key is a user or user session keyring */
+#define KEY_FLAG_FINAL_PUT 10 /* set if final put has happened on key */
/* the key type and key description string
* - the desc is used to match a key against search criteria
diff --git a/include/linux/kstrtox.h b/include/linux/kstrtox.h
index 7fcf29a4e0de..6ea897222af1 100644
--- a/include/linux/kstrtox.h
+++ b/include/linux/kstrtox.h
@@ -143,6 +143,7 @@ static inline int __must_check kstrtos32_from_user(const char __user *s, size_t
*/
extern unsigned long simple_strtoul(const char *,char **,unsigned int);
+extern unsigned long simple_strntoul(const char *,char **,unsigned int,size_t);
extern long simple_strtol(const char *,char **,unsigned int);
extern unsigned long long simple_strtoull(const char *,char **,unsigned int);
extern long long simple_strtoll(const char *,char **,unsigned int);
diff --git a/include/linux/libata.h b/include/linux/libata.h
index c1c57f814b98..e5695998acb0 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -88,6 +88,7 @@ enum ata_quirks {
__ATA_QUIRK_MAX_SEC_1024, /* Limit max sects to 1024 */
__ATA_QUIRK_MAX_TRIM_128M, /* Limit max trim size to 128M */
__ATA_QUIRK_NO_NCQ_ON_ATI, /* Disable NCQ on ATI chipset */
+ __ATA_QUIRK_NO_LPM_ON_ATI, /* Disable LPM on ATI chipset */
__ATA_QUIRK_NO_ID_DEV_LOG, /* Identify device log missing */
__ATA_QUIRK_NO_LOG_DIR, /* Do not read log directory */
__ATA_QUIRK_NO_FUA, /* Do not use FUA */
@@ -432,6 +433,7 @@ enum {
ATA_QUIRK_MAX_SEC_1024 = (1U << __ATA_QUIRK_MAX_SEC_1024),
ATA_QUIRK_MAX_TRIM_128M = (1U << __ATA_QUIRK_MAX_TRIM_128M),
ATA_QUIRK_NO_NCQ_ON_ATI = (1U << __ATA_QUIRK_NO_NCQ_ON_ATI),
+ ATA_QUIRK_NO_LPM_ON_ATI = (1U << __ATA_QUIRK_NO_LPM_ON_ATI),
ATA_QUIRK_NO_ID_DEV_LOG = (1U << __ATA_QUIRK_NO_ID_DEV_LOG),
ATA_QUIRK_NO_LOG_DIR = (1U << __ATA_QUIRK_NO_LOG_DIR),
ATA_QUIRK_NO_FUA = (1U << __ATA_QUIRK_NO_FUA),
diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
index 49eef10c8e59..4bf261d41a6d 100644
--- a/include/linux/misc_cgroup.h
+++ b/include/linux/misc_cgroup.h
@@ -60,7 +60,6 @@ struct misc_cg {
struct misc_res res[MISC_CG_RES_TYPES];
};
-u64 misc_cg_res_total_usage(enum misc_res_type type);
int misc_cg_set_capacity(enum misc_res_type type, u64 capacity);
int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount);
void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount);
@@ -104,11 +103,6 @@ static inline void put_misc_cg(struct misc_cg *cg)
#else /* !CONFIG_CGROUP_MISC */
-static inline u64 misc_cg_res_total_usage(enum misc_res_type type)
-{
- return 0;
-}
-
static inline int misc_cg_set_capacity(enum misc_res_type type, u64 capacity)
{
return 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8483e09aeb2c..2edb8d14d165 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1458,7 +1458,10 @@ static inline void folio_get(struct folio *folio)
static inline void get_page(struct page *page)
{
- folio_get(page_folio(page));
+ struct folio *folio = page_folio(page);
+ if (WARN_ON_ONCE(folio_test_slab(folio)))
+ return;
+ folio_get(folio);
}
static inline __must_check bool try_get_page(struct page *page)
@@ -1552,6 +1555,9 @@ static inline void put_page(struct page *page)
{
struct folio *folio = page_folio(page);
+ if (folio_test_slab(folio))
+ return;
+
/*
* For some devmap managed pages we need to catch refcount transition
* from 2 to 1:
@@ -2549,7 +2555,7 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
struct task_struct *task, bool bypass_rlim);
struct kvec;
-struct page *get_dump_page(unsigned long addr);
+struct page *get_dump_page(unsigned long addr, int *locked);
bool folio_mark_dirty(struct folio *folio);
bool folio_mark_dirty_lock(struct folio *folio);
diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h
index b1b219bc3422..e71a6070a8f8 100644
--- a/include/linux/mnt_idmapping.h
+++ b/include/linux/mnt_idmapping.h
@@ -25,6 +25,11 @@ static_assert(sizeof(vfsgid_t) == sizeof(kgid_t));
static_assert(offsetof(vfsuid_t, val) == offsetof(kuid_t, val));
static_assert(offsetof(vfsgid_t, val) == offsetof(kgid_t, val));
+static inline bool is_valid_mnt_idmap(const struct mnt_idmap *idmap)
+{
+ return idmap != &nop_mnt_idmap && idmap != &invalid_mnt_idmap;
+}
+
#ifdef CONFIG_MULTIUSER
static inline uid_t __vfsuid_val(vfsuid_t uid)
{
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 8ec8fed3bce8..e3042176cdf4 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -18,35 +18,36 @@ enum { MAX_NESTED_LINKS = 8 };
enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT};
/* pathwalk mode */
-#define LOOKUP_FOLLOW 0x0001 /* follow links at the end */
-#define LOOKUP_DIRECTORY 0x0002 /* require a directory */
-#define LOOKUP_AUTOMOUNT 0x0004 /* force terminal automount */
-#define LOOKUP_EMPTY 0x4000 /* accept empty path [user_... only] */
-#define LOOKUP_DOWN 0x8000 /* follow mounts in the starting point */
-#define LOOKUP_MOUNTPOINT 0x0080 /* follow mounts in the end */
-
-#define LOOKUP_REVAL 0x0020 /* tell ->d_revalidate() to trust no cache */
-#define LOOKUP_RCU 0x0040 /* RCU pathwalk mode; semi-internal */
+#define LOOKUP_FOLLOW BIT(0) /* follow links at the end */
+#define LOOKUP_DIRECTORY BIT(1) /* require a directory */
+#define LOOKUP_AUTOMOUNT BIT(2) /* force terminal automount */
+#define LOOKUP_EMPTY BIT(3) /* accept empty path [user_... only] */
+#define LOOKUP_LINKAT_EMPTY BIT(4) /* Linkat request with empty path. */
+#define LOOKUP_DOWN BIT(5) /* follow mounts in the starting point */
+#define LOOKUP_MOUNTPOINT BIT(6) /* follow mounts in the end */
+#define LOOKUP_REVAL BIT(7) /* tell ->d_revalidate() to trust no cache */
+#define LOOKUP_RCU BIT(8) /* RCU pathwalk mode; semi-internal */
+#define LOOKUP_CACHED BIT(9) /* Only do cached lookup */
+#define LOOKUP_PARENT BIT(10) /* Looking up final parent in path */
+/* 5 spare bits for pathwalk */
/* These tell filesystem methods that we are dealing with the final component... */
-#define LOOKUP_OPEN 0x0100 /* ... in open */
-#define LOOKUP_CREATE 0x0200 /* ... in object creation */
-#define LOOKUP_EXCL 0x0400 /* ... in exclusive creation */
-#define LOOKUP_RENAME_TARGET 0x0800 /* ... in destination of rename() */
+#define LOOKUP_OPEN BIT(16) /* ... in open */
+#define LOOKUP_CREATE BIT(17) /* ... in object creation */
+#define LOOKUP_EXCL BIT(18) /* ... in target must not exist */
+#define LOOKUP_RENAME_TARGET BIT(19) /* ... in destination of rename() */
-/* internal use only */
-#define LOOKUP_PARENT 0x0010
+/* 4 spare bits for intent */
/* Scoping flags for lookup. */
-#define LOOKUP_NO_SYMLINKS 0x010000 /* No symlink crossing. */
-#define LOOKUP_NO_MAGICLINKS 0x020000 /* No nd_jump_link() crossing. */
-#define LOOKUP_NO_XDEV 0x040000 /* No mountpoint crossing. */
-#define LOOKUP_BENEATH 0x080000 /* No escaping from starting point. */
-#define LOOKUP_IN_ROOT 0x100000 /* Treat dirfd as fs root. */
-#define LOOKUP_CACHED 0x200000 /* Only do cached lookup */
-#define LOOKUP_LINKAT_EMPTY 0x400000 /* Linkat request with empty path. */
+#define LOOKUP_NO_SYMLINKS BIT(24) /* No symlink crossing. */
+#define LOOKUP_NO_MAGICLINKS BIT(25) /* No nd_jump_link() crossing. */
+#define LOOKUP_NO_XDEV BIT(26) /* No mountpoint crossing. */
+#define LOOKUP_BENEATH BIT(27) /* No escaping from starting point. */
+#define LOOKUP_IN_ROOT BIT(28) /* Treat dirfd as fs root. */
/* LOOKUP_* flags which do scope-related checks based on the dirfd. */
#define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT)
+/* 3 spare bits for scoping */
extern int path_pts(struct path *path);
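BIT(n) expands to (1UL << (n)), so the conversion above renumbers most raw flag values while keeping the symbolic names stable; only code that hard-coded the old hex values would notice. A few spot checks of the new encoding (illustrative static_asserts, assuming only the definitions above):

static_assert(LOOKUP_FOLLOW      == 0x0001);		/* unchanged */
static_assert(LOOKUP_EMPTY       == 0x0008);		/* was 0x4000 */
static_assert(LOOKUP_OPEN        == 0x00010000);	/* intent group starts at bit 16 */
static_assert(LOOKUP_NO_SYMLINKS == 0x01000000);	/* scoping group starts at bit 24 */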
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 9155a6ffc370..d66c61cbbd1d 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1802,7 +1802,7 @@ struct nfs_rpc_ops {
int (*link) (struct inode *, struct inode *, const struct qstr *);
int (*symlink) (struct inode *, struct dentry *, struct folio *,
unsigned int, struct iattr *);
- int (*mkdir) (struct inode *, struct dentry *, struct iattr *);
+ struct dentry *(*mkdir) (struct inode *, struct dentry *, struct iattr *);
int (*rmdir) (struct inode *, const struct qstr *);
int (*readdir) (struct nfs_readdir_arg *, struct nfs_readdir_res *);
int (*mknod) (struct inode *, struct dentry *, struct iattr *,
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index a8dfb38c9bb6..e78fa535f61d 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -17,7 +17,6 @@
void lockup_detector_init(void);
void lockup_detector_retry_init(void);
void lockup_detector_soft_poweroff(void);
-void lockup_detector_cleanup(void);
extern int watchdog_user_enabled;
extern int watchdog_thresh;
@@ -37,7 +36,6 @@ extern int sysctl_hardlockup_all_cpu_backtrace;
static inline void lockup_detector_init(void) { }
static inline void lockup_detector_retry_init(void) { }
static inline void lockup_detector_soft_poweroff(void) { }
-static inline void lockup_detector_cleanup(void) { }
#endif /* !CONFIG_LOCKUP_DETECTOR */
#ifdef CONFIG_SOFTLOCKUP_DETECTOR
@@ -104,12 +102,10 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs);
#if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF)
extern void hardlockup_detector_perf_stop(void);
extern void hardlockup_detector_perf_restart(void);
-extern void hardlockup_detector_perf_cleanup(void);
extern void hardlockup_config_perf_event(const char *str);
#else
static inline void hardlockup_detector_perf_stop(void) { }
static inline void hardlockup_detector_perf_restart(void) { }
-static inline void hardlockup_detector_perf_cleanup(void) { }
static inline void hardlockup_config_perf_event(const char *str) { }
#endif
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 9fd7a0ce9c1a..f0ac0633366b 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -94,7 +94,6 @@
#include <linux/bitmap.h>
#include <linux/minmax.h>
#include <linux/nodemask_types.h>
-#include <linux/numa.h>
#include <linux/random.h>
extern nodemask_t _unused_nodemask_arg_;
@@ -191,6 +190,13 @@ static __always_inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *s
bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
}
+#define nodes_copy(dst, src) __nodes_copy(&(dst), &(src), MAX_NUMNODES)
+static __always_inline void __nodes_copy(nodemask_t *dstp,
+ const nodemask_t *srcp, unsigned int nbits)
+{
+ bitmap_copy(dstp->bits, srcp->bits, nbits);
+}
+
#define nodes_complement(dst, src) \
__nodes_complement(&(dst), &(src), MAX_NUMNODES)
static __always_inline void __nodes_complement(nodemask_t *dstp,
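The new nodes_copy() rounds out the bitmap-backed helper set. A minimal usage sketch (illustrative, not from the patch):

static void nodes_copy_demo(void)
{
	nodemask_t src = NODE_MASK_NONE;
	nodemask_t dst = NODE_MASK_ALL;

	node_set(0, src);
	nodes_copy(dst, src);	/* dst is overwritten: only node 0 remains set */
	WARN_ON(!node_isset(0, dst) || nodes_weight(dst) != 1);
}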
diff --git a/include/linux/nodemask_types.h b/include/linux/nodemask_types.h
index 6b28d97ea6ed..f850a48742f1 100644
--- a/include/linux/nodemask_types.h
+++ b/include/linux/nodemask_types.h
@@ -3,7 +3,16 @@
#define __LINUX_NODEMASK_TYPES_H
#include <linux/bitops.h>
-#include <linux/numa.h>
+
+#ifdef CONFIG_NODES_SHIFT
+#define NODES_SHIFT CONFIG_NODES_SHIFT
+#else
+#define NODES_SHIFT 0
+#endif
+
+#define MAX_NUMNODES (1 << NODES_SHIFT)
+
+#define NUMA_NO_NODE (-1)
typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
diff --git a/include/linux/numa.h b/include/linux/numa.h
index 3567e40329eb..e6baaf6051bc 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -3,16 +3,8 @@
#define _LINUX_NUMA_H
#include <linux/init.h>
#include <linux/types.h>
+#include <linux/nodemask.h>
-#ifdef CONFIG_NODES_SHIFT
-#define NODES_SHIFT CONFIG_NODES_SHIFT
-#else
-#define NODES_SHIFT 0
-#endif
-
-#define MAX_NUMNODES (1 << NODES_SHIFT)
-
-#define NUMA_NO_NODE (-1)
#define NUMA_NO_MEMBLK (-1)
static inline bool numa_valid_node(int nid)
@@ -39,6 +31,8 @@ void __init alloc_offline_node_data(int nid);
/* Generic implementation available */
int numa_nearest_node(int node, unsigned int state);
+int nearest_node_nodemask(int node, nodemask_t *mask);
+
#ifndef memory_add_physaddr_to_nid
int memory_add_physaddr_to_nid(u64 start);
#endif
@@ -55,6 +49,11 @@ static inline int numa_nearest_node(int node, unsigned int state)
return NUMA_NO_NODE;
}
+static inline int nearest_node_nodemask(int node, nodemask_t *mask)
+{
+ return NUMA_NO_NODE;
+}
+
static inline int memory_add_physaddr_to_nid(u64 start)
{
return 0;
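Assuming the generic implementation returns the node in @mask nearest to @node (with the !CONFIG_NUMA stub above always returning NUMA_NO_NODE), a caller could look like this sketch:

static int pick_target_node(int from, nodemask_t *allowed)
{
	int nid = nearest_node_nodemask(from, allowed);

	if (nid == NUMA_NO_NODE)
		nid = numa_node_id();	/* e.g. empty mask or !CONFIG_NUMA */
	return nid;
}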
diff --git a/include/linux/objpool.h b/include/linux/objpool.h
index cb1758eaa2d3..b713a1fe7521 100644
--- a/include/linux/objpool.h
+++ b/include/linux/objpool.h
@@ -170,17 +170,16 @@ static inline void *objpool_pop(struct objpool_head *pool)
{
void *obj = NULL;
unsigned long flags;
- int i, cpu;
+ int start, cpu;
/* disable local irq to avoid preemption & interruption */
raw_local_irq_save(flags);
- cpu = raw_smp_processor_id();
- for (i = 0; i < pool->nr_possible_cpus; i++) {
+ start = raw_smp_processor_id();
+ for_each_possible_cpu_wrap(cpu, start) {
obj = __objpool_try_get_slot(pool, cpu);
if (obj)
break;
- cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1);
}
raw_local_irq_restore(flags);
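for_each_possible_cpu_wrap(cpu, start) visits every possible CPU exactly once, beginning at @start and wrapping around, which removes the manual cpumask_next_wrap() stepping. A rough open-coded equivalent of the new loop (a sketch, not the cpumask.h implementation):

cpu = start;
do {
	obj = __objpool_try_get_slot(pool, cpu);
	if (obj)
		break;
	cpu = cpumask_next(cpu, cpu_possible_mask);
	if (cpu >= nr_cpu_ids)
		cpu = cpumask_first(cpu_possible_mask);
} while (cpu != start);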
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 36d283552f80..df9234e5f478 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -925,14 +925,15 @@ FOLIO_FLAG_FALSE(has_hwpoisoned)
enum pagetype {
/* 0x00-0x7f are positive numbers, ie mapcount */
/* Reserve 0x80-0xef for mapcount overflow. */
- PGTY_buddy = 0xf0,
- PGTY_offline = 0xf1,
- PGTY_table = 0xf2,
- PGTY_guard = 0xf3,
- PGTY_hugetlb = 0xf4,
- PGTY_slab = 0xf5,
- PGTY_zsmalloc = 0xf6,
- PGTY_unaccepted = 0xf7,
+ PGTY_buddy = 0xf0,
+ PGTY_offline = 0xf1,
+ PGTY_table = 0xf2,
+ PGTY_guard = 0xf3,
+ PGTY_hugetlb = 0xf4,
+ PGTY_slab = 0xf5,
+ PGTY_zsmalloc = 0xf6,
+ PGTY_unaccepted = 0xf7,
+ PGTY_large_kmalloc = 0xf8,
PGTY_mapcount_underflow = 0xff
};
@@ -1075,6 +1076,7 @@ PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc)
* Serialized with zone lock.
*/
PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted)
+FOLIO_TYPE_OPS(large_kmalloc, large_kmalloc)
/**
* PageHuge - Determine if the page belongs to hugetlbfs
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 47bfc6b1b632..d2432cb02fa0 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1044,21 +1044,23 @@ static inline pgoff_t page_pgoff(const struct folio *folio,
return folio->index + folio_page_idx(folio, page);
}
-/*
- * Return byte-offset into filesystem object for page.
+/**
+ * folio_pos - Returns the byte position of this folio in its file.
+ * @folio: The folio.
*/
-static inline loff_t page_offset(struct page *page)
+static inline loff_t folio_pos(const struct folio *folio)
{
- return ((loff_t)page->index) << PAGE_SHIFT;
+ return ((loff_t)folio->index) * PAGE_SIZE;
}
-/**
- * folio_pos - Returns the byte position of this folio in its file.
- * @folio: The folio.
+/*
+ * Return byte-offset into filesystem object for page.
*/
-static inline loff_t folio_pos(struct folio *folio)
+static inline loff_t page_offset(struct page *page)
{
- return page_offset(&folio->page);
+ struct folio *folio = page_folio(page);
+
+ return folio_pos(folio) + folio_page_idx(folio, page) * PAGE_SIZE;
}
/*
@@ -1603,34 +1605,6 @@ static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio,
}
/**
- * page_mkwrite_check_truncate - check if page was truncated
- * @page: the page to check
- * @inode: the inode to check the page against
- *
- * Returns the number of bytes in the page up to EOF,
- * or -EFAULT if the page was truncated.
- */
-static inline int page_mkwrite_check_truncate(struct page *page,
- struct inode *inode)
-{
- loff_t size = i_size_read(inode);
- pgoff_t index = size >> PAGE_SHIFT;
- int offset = offset_in_page(size);
-
- if (page->mapping != inode->i_mapping)
- return -EFAULT;
-
- /* page is wholly inside EOF */
- if (page->index < index)
- return PAGE_SIZE;
- /* page is wholly past EOF */
- if (page->index > index || !offset)
- return -EFAULT;
- /* page is partially inside EOF */
- return offset;
-}
-
-/**
* i_blocks_per_folio - How many blocks fit in this folio.
* @inode: The inode which contains the blocks.
* @folio: The folio.
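page_offset() is now derived from the folio rather than from page->index directly. A worked example, assuming an order-2 folio whose head page sits at file index 100 and @page is its third page:

/*
 * folio_pos(folio)            == 100 * PAGE_SIZE
 * folio_page_idx(folio, page) == 2
 * page_offset(page)           == (100 + 2) * PAGE_SIZE
 */
loff_t pos = page_offset(page);	/* byte offset of this exact page in the file */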
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index c012df33a9f0..af7d75ede619 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -8,6 +8,7 @@
#include <linux/wait.h>
#include <linux/rcu_sync.h>
#include <linux/lockdep.h>
+#include <linux/cleanup.h>
struct percpu_rw_semaphore {
struct rcu_sync rss;
@@ -125,6 +126,13 @@ extern bool percpu_is_read_locked(struct percpu_rw_semaphore *);
extern void percpu_down_write(struct percpu_rw_semaphore *);
extern void percpu_up_write(struct percpu_rw_semaphore *);
+DEFINE_GUARD(percpu_read, struct percpu_rw_semaphore *,
+ percpu_down_read(_T), percpu_up_read(_T))
+DEFINE_GUARD_COND(percpu_read, _try, percpu_down_read_trylock(_T))
+
+DEFINE_GUARD(percpu_write, struct percpu_rw_semaphore *,
+ percpu_down_write(_T), percpu_up_write(_T))
+
static inline bool percpu_is_write_locked(struct percpu_rw_semaphore *sem)
{
return atomic_read(&sem->block);
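The DEFINE_GUARD()/DEFINE_GUARD_COND() lines hook percpu rwsems into the <linux/cleanup.h> scope-based lock guards. A minimal sketch (do_work() is a hypothetical helper):

static int read_side(struct percpu_rw_semaphore *sem)
{
	guard(percpu_read)(sem);	/* percpu_down_read() here ... */
	return do_work();		/* ... percpu_up_read() on every return path */
}

The _try variant additionally generates a percpu_read_try guard class for use with scoped_cond_guard().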
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8333f132f4a9..63dddb3b54f0 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -343,8 +343,7 @@ struct pmu {
*/
unsigned int scope;
- int __percpu *pmu_disable_count;
- struct perf_cpu_pmu_context __percpu *cpu_pmu_context;
+ struct perf_cpu_pmu_context * __percpu *cpu_pmu_context;
atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */
int task_ctx_nr;
int hrtimer_interval_ms;
@@ -495,7 +494,7 @@ struct pmu {
* context-switches callback
*/
void (*sched_task) (struct perf_event_pmu_context *pmu_ctx,
- bool sched_in);
+ struct task_struct *task, bool sched_in);
/*
* Kmem cache of PMU specific data
@@ -503,16 +502,6 @@ struct pmu {
struct kmem_cache *task_ctx_cache;
/*
- * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data)
- * can be synchronized using this function. See Intel LBR callstack support
- * implementation and Perf core context switch handling callbacks for usage
- * examples.
- */
- void (*swap_task_ctx) (struct perf_event_pmu_context *prev_epc,
- struct perf_event_pmu_context *next_epc);
- /* optional */
-
- /*
* Set up pmu-private data structures for an AUX area
*/
void *(*setup_aux) (struct perf_event *event, void **pages,
@@ -673,13 +662,16 @@ struct swevent_hlist {
struct rcu_head rcu_head;
};
-#define PERF_ATTACH_CONTEXT 0x01
-#define PERF_ATTACH_GROUP 0x02
-#define PERF_ATTACH_TASK 0x04
-#define PERF_ATTACH_TASK_DATA 0x08
-#define PERF_ATTACH_ITRACE 0x10
-#define PERF_ATTACH_SCHED_CB 0x20
-#define PERF_ATTACH_CHILD 0x40
+#define PERF_ATTACH_CONTEXT 0x0001
+#define PERF_ATTACH_GROUP 0x0002
+#define PERF_ATTACH_TASK 0x0004
+#define PERF_ATTACH_TASK_DATA 0x0008
+#define PERF_ATTACH_GLOBAL_DATA 0x0010
+#define PERF_ATTACH_SCHED_CB 0x0020
+#define PERF_ATTACH_CHILD 0x0040
+#define PERF_ATTACH_EXCLUSIVE 0x0080
+#define PERF_ATTACH_CALLCHAIN 0x0100
+#define PERF_ATTACH_ITRACE 0x0200
struct bpf_prog;
struct perf_cgroup;
@@ -921,7 +913,7 @@ struct perf_event_pmu_context {
struct list_head pinned_active;
struct list_head flexible_active;
- /* Used to avoid freeing per-cpu perf_event_pmu_context */
+ /* Used to identify the per-cpu perf_event_pmu_context */
unsigned int embedded : 1;
unsigned int nr_events;
@@ -931,7 +923,6 @@ struct perf_event_pmu_context {
atomic_t refcount; /* event <-> epc */
struct rcu_head rcu_head;
- void *task_ctx_data; /* pmu specific data */
/*
* Set when one or more (plausibly active) event can't be scheduled
* due to pmu overcommit or pmu constraints, except tolerant to
@@ -979,7 +970,6 @@ struct perf_event_context {
int nr_user;
int is_active;
- int nr_task_data;
int nr_stat;
int nr_freq;
int rotate_disable;
@@ -1020,6 +1010,41 @@ struct perf_event_context {
local_t nr_no_switch_fast;
};
+/**
+ * struct perf_ctx_data - PMU specific data for a task
+ * @rcu_head: To avoid races when freeing the PMU specific data
+ * @refcount: To track users
+ * @global: To track system-wide users
+ * @ctx_cache: Kmem cache of PMU specific data
+ * @data: PMU specific data
+ *
+ * Currently, the struct is only used in Intel LBR call stack mode to
+ * save/restore the call stack of a task on context switches.
+ *
+ * The rcu_head is used to prevent races when freeing the data.
+ * The data is only allocated when Intel LBR call stack mode is enabled.
+ * The data will be freed when the mode is disabled.
+ * The content of the data is only accessed during context switches, which
+ * should be protected by rcu_read_lock().
+ *
+ * Because of the alignment requirement of Intel Arch LBR, the Kmem cache
+ * is used to allocate the PMU specific data. The ctx_cache is to track
+ * the Kmem cache.
+ *
+ * Careful: Struct perf_ctx_data is added as a pointer in struct task_struct.
+ * When system-wide Intel LBR call stack mode is enabled, a buffer with
+ * constant size will be allocated for each task.
+ * Also, system memory consumption can further grow when the size of
+ * struct perf_ctx_data enlarges.
+ */
+struct perf_ctx_data {
+ struct rcu_head rcu_head;
+ refcount_t refcount;
+ int global;
+ struct kmem_cache *ctx_cache;
+ void *data;
+};
+
struct perf_cpu_pmu_context {
struct perf_event_pmu_context epc;
struct perf_event_pmu_context *task_epc;
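Given the refcount and RCU lifetime rules described above, a reader of the task_struct::perf_ctx_data pointer added later in this diff would be expected to follow the usual pattern (a sketch; the real acquire/release helpers live in the perf core):

static void use_task_ctx_data(struct task_struct *task)
{
	struct perf_ctx_data *cd;

	rcu_read_lock();
	cd = rcu_dereference(task->perf_ctx_data);
	if (cd && refcount_inc_not_zero(&cd->refcount)) {
		/* cd->data is safe to use until the reference is dropped */
	}
	rcu_read_unlock();
}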
@@ -1029,6 +1054,7 @@ struct perf_cpu_pmu_context {
int active_oncpu;
int exclusive;
+ int pmu_disable_count;
raw_spinlock_t hrtimer_lock;
struct hrtimer hrtimer;
@@ -1062,7 +1088,13 @@ struct perf_output_handle {
struct perf_buffer *rb;
unsigned long wakeup;
unsigned long size;
- u64 aux_flags;
+ union {
+ u64 flags; /* perf_output*() */
+ u64 aux_flags; /* perf_aux_output*() */
+ struct {
+ u64 skip_read : 1;
+ };
+ };
union {
void *addr;
unsigned long head;
@@ -1339,6 +1371,9 @@ static inline void perf_sample_save_brstack(struct perf_sample_data *data,
if (branch_sample_hw_index(event))
size += sizeof(u64);
+
+ brs->nr = min_t(u16, event->attr.sample_max_stack, brs->nr);
+
size += brs->nr * sizeof(struct perf_branch_entry);
/*
@@ -1646,19 +1681,10 @@ static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64
}
extern int sysctl_perf_event_paranoid;
-extern int sysctl_perf_event_mlock;
extern int sysctl_perf_event_sample_rate;
-extern int sysctl_perf_cpu_time_max_percent;
extern void perf_sample_event_took(u64 sample_len_ns);
-int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos);
-int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos);
-int perf_event_max_stack_handler(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos);
-
/* Access to perf_event_open(2) syscall. */
#define PERF_SECURITY_OPEN 0
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 98837a1ff0f3..311ecebd7d56 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -101,9 +101,9 @@ extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
* these helpers must be called with the tasklist_lock write-held.
*/
extern void attach_pid(struct task_struct *task, enum pid_type);
-extern void detach_pid(struct task_struct *task, enum pid_type);
-extern void change_pid(struct task_struct *task, enum pid_type,
- struct pid *pid);
+void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type);
+void change_pid(struct pid **pids, struct task_struct *task, enum pid_type,
+ struct pid *pid);
extern void exchange_tids(struct task_struct *task, struct task_struct *old);
extern void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type);
@@ -129,6 +129,7 @@ extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
size_t set_tid_size);
extern void free_pid(struct pid *pid);
+void free_pids(struct pid **pids);
extern void disable_pid_allocation(struct pid_namespace *ns);
/*
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
index 7c830d0dec9a..05e6f8f4a026 100644
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -6,6 +6,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
void __init pidfs_init(void);
void pidfs_add_pid(struct pid *pid);
void pidfs_remove_pid(struct pid *pid);
+void pidfs_exit(struct task_struct *tsk);
extern const struct dentry_operations pidfs_dentry_operations;
#endif /* _LINUX_PID_FS_H */
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index b698758000f8..9d42d473d201 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -108,7 +108,7 @@ struct pipe_inode_info {
#ifdef CONFIG_WATCH_QUEUE
bool note_loss;
#endif
- struct page *tmp_page;
+ struct page *tmp_page[2];
struct fasync_struct *fasync_readers;
struct fasync_struct *fasync_writers;
struct pipe_buffer *bufs;
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 4c1af9b7e28b..b0af8d4ef6e6 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -516,6 +516,8 @@ static inline bool preempt_model_rt(void)
return IS_ENABLED(CONFIG_PREEMPT_RT);
}
+extern const char *preempt_model_str(void);
+
/*
* Does the preemption model allow non-cooperative preemption?
*
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 4217a9f412b2..5b462029d03c 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -207,6 +207,7 @@ void printk_legacy_allow_panic_sync(void);
extern bool nbcon_device_try_acquire(struct console *con);
extern void nbcon_device_release(struct console *con);
void nbcon_atomic_flush_unsafe(void);
+bool pr_flush(int timeout_ms, bool reset_on_progress);
#else
static inline __printf(1, 0)
int vprintk(const char *s, va_list args)
@@ -315,6 +316,11 @@ static inline void nbcon_atomic_flush_unsafe(void)
{
}
+static inline bool pr_flush(int timeout_ms, bool reset_on_progress)
+{
+ return true;
+}
+
#endif
bool this_cpu_in_panic(void);
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 0b2a89854440..ea62201c74c4 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -20,10 +20,13 @@ enum {
* If in doubt, ignore this flag.
*/
#ifdef MODULE
- PROC_ENTRY_PERMANENT = 0U,
+ PROC_ENTRY_PERMANENT = 0U,
#else
- PROC_ENTRY_PERMANENT = 1U << 0,
+ PROC_ENTRY_PERMANENT = 1U << 0,
#endif
+
+ PROC_ENTRY_proc_read_iter = 1U << 1,
+ PROC_ENTRY_proc_compat_ioctl = 1U << 2,
};
struct proc_ops {
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 48e5c03df1dd..f8159f8a7d73 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -95,9 +95,9 @@ static inline void __rcu_read_lock(void)
static inline void __rcu_read_unlock(void)
{
- preempt_enable();
if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
rcu_read_unlock_strict();
+ preempt_enable();
}
static inline int rcu_preempt_depth(void)
@@ -121,12 +121,6 @@ void rcu_init(void);
extern int rcu_scheduler_active;
void rcu_sched_clock_irq(int user);
-#ifdef CONFIG_TASKS_RCU_GENERIC
-void rcu_init_tasks_generic(void);
-#else
-static inline void rcu_init_tasks_generic(void) { }
-#endif
-
#ifdef CONFIG_RCU_STALL_COMMON
void rcu_sysrq_start(void);
void rcu_sysrq_end(void);
@@ -806,11 +800,9 @@ do { \
* sections, invocation of the corresponding RCU callback is deferred
* until after all the other CPUs exit their critical sections.
*
- * In v5.0 and later kernels, synchronize_rcu() and call_rcu() also
- * wait for regions of code with preemption disabled, including regions of
- * code with interrupts or softirqs disabled. In pre-v5.0 kernels, which
- * define synchronize_sched(), only code enclosed within rcu_read_lock()
- * and rcu_read_unlock() are guaranteed to be waited for.
+ * Both synchronize_rcu() and call_rcu() also wait for regions of code
+ * with preemption disabled, including regions of code with interrupts or
+ * softirqs disabled.
*
* Note, however, that RCU callbacks are permitted to run concurrently
* with new RCU read-side critical sections. One way that this can happen
@@ -865,11 +857,10 @@ static __always_inline void rcu_read_lock(void)
* rcu_read_unlock() - marks the end of an RCU read-side critical section.
*
* In almost all situations, rcu_read_unlock() is immune from deadlock.
- * In recent kernels that have consolidated synchronize_sched() and
- * synchronize_rcu_bh() into synchronize_rcu(), this deadlock immunity
- * also extends to the scheduler's runqueue and priority-inheritance
- * spinlocks, courtesy of the quiescent-state deferral that is carried
- * out when rcu_read_unlock() is invoked with interrupts disabled.
+ * This deadlock immunity also extends to the scheduler's runqueue
+ * and priority-inheritance spinlocks, courtesy of the quiescent-state
+ * deferral that is carried out when rcu_read_unlock() is invoked with
+ * interrupts disabled.
*
* See rcu_read_lock() for more information.
*/
@@ -1025,12 +1016,6 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
#define RCU_POINTER_INITIALIZER(p, v) \
.p = RCU_INITIALIZER(v)
-/*
- * Does the specified offset indicate that the corresponding rcu_head
- * structure can be handled by kvfree_rcu()?
- */
-#define __is_kvfree_rcu_offset(offset) ((offset) < 4096)
-
/**
* kfree_rcu() - kfree an object after a grace period.
* @ptr: pointer to kfree for double-argument invocations.
@@ -1041,11 +1026,11 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
* when they are used in a kernel module, that module must invoke the
* high-latency rcu_barrier() function at module-unload time.
*
- * The kfree_rcu() function handles this issue. Rather than encoding a
- * function address in the embedded rcu_head structure, kfree_rcu() instead
- * encodes the offset of the rcu_head structure within the base structure.
- * Because the functions are not allowed in the low-order 4096 bytes of
- * kernel virtual memory, offsets up to 4095 bytes can be accommodated.
+ * The kfree_rcu() function handles this issue. In order to have a universal
+ * callback function handling different offsets of rcu_head, the callback needs
+ * to determine the starting address of the freed object, which can be a large
+ * kmalloc or vmalloc allocation. To allow simply aligning the pointer down to
+ * page boundary for those, only offsets up to 4095 bytes can be accommodated.
* If the offset is larger than 4095 bytes, a compile-time error will
* be generated in kvfree_rcu_arg_2(). If this error is triggered, you can
* either fall back to use of call_rcu() or rearrange the structure to
@@ -1082,14 +1067,23 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
#define kfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr)
#define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr)
+/*
+ * In mm/slab_common.c, no suitable header to include here.
+ */
+void kvfree_call_rcu(struct rcu_head *head, void *ptr);
+
+/*
+ * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the
+ * comment of kfree_rcu() for details.
+ */
#define kvfree_rcu_arg_2(ptr, rhf) \
do { \
typeof (ptr) ___p = (ptr); \
\
- if (___p) { \
- BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf))); \
- kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \
- } \
+ if (___p) { \
+ BUILD_BUG_ON(offsetof(typeof(*(ptr)), rhf) >= 4096); \
+ kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \
+ } \
} while (0)
#define kvfree_rcu_arg_1(ptr) \
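Concretely, the BUILD_BUG_ON() constrains where the rcu_head may live inside the enclosing object. A minimal, conventional use (sketch):

struct foo {
	int payload;
	struct rcu_head rh;	/* offsetof(struct foo, rh) must stay < 4096 */
};

static void foo_release(struct foo *fp)
{
	kfree_rcu(fp, rh);	/* queued; freed after a full grace period */
}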
diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h
index f9bed3d3f78d..4c92d4291cce 100644
--- a/include/linux/rcupdate_wait.h
+++ b/include/linux/rcupdate_wait.h
@@ -16,6 +16,9 @@
struct rcu_synchronize {
struct rcu_head head;
struct completion completion;
+
+ /* This is for debugging. */
+ struct rcu_gp_oldstate oldstate;
};
void wakeme_after_rcu(struct rcu_head *head);
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index fe42315f667f..f519cd680228 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -90,41 +90,6 @@ static inline void synchronize_rcu_expedited(void)
synchronize_rcu();
}
-/*
- * Add one more declaration of kvfree() here. It is
- * not so straight forward to just include <linux/mm.h>
- * where it is defined due to getting many compile
- * errors caused by that include.
- */
-extern void kvfree(const void *addr);
-
-static inline void __kvfree_call_rcu(struct rcu_head *head, void *ptr)
-{
- if (head) {
- call_rcu(head, (rcu_callback_t) ((void *) head - ptr));
- return;
- }
-
- // kvfree_rcu(one_arg) call.
- might_sleep();
- synchronize_rcu();
- kvfree(ptr);
-}
-
-static inline void kvfree_rcu_barrier(void)
-{
- rcu_barrier();
-}
-
-#ifdef CONFIG_KASAN_GENERIC
-void kvfree_call_rcu(struct rcu_head *head, void *ptr);
-#else
-static inline void kvfree_call_rcu(struct rcu_head *head, void *ptr)
-{
- __kvfree_call_rcu(head, ptr);
-}
-#endif
-
void rcu_qs(void);
static inline void rcu_softirq_qs(void)
@@ -164,7 +129,6 @@ static inline void rcu_end_inkernel_boot(void) { }
static inline bool rcu_inkernel_boot_has_ended(void) { return true; }
static inline bool rcu_is_watching(void) { return true; }
static inline void rcu_momentary_eqs(void) { }
-static inline void kfree_rcu_scheduler_running(void) { }
/* Avoid RCU read-side critical sections leaking across. */
static inline void rcu_all_qs(void) { barrier(); }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 27d86d912781..9d2d7bd251d4 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -34,12 +34,9 @@ static inline void rcu_virt_note_context_switch(void)
}
void synchronize_rcu_expedited(void);
-void kvfree_call_rcu(struct rcu_head *head, void *ptr);
-void kvfree_rcu_barrier(void);
void rcu_barrier(void);
void rcu_momentary_eqs(void);
-void kfree_rcu_scheduler_running(void);
struct rcu_gp_oldstate {
unsigned long rgos_norm;
@@ -103,7 +100,7 @@ extern int rcu_scheduler_active;
void rcu_end_inkernel_boot(void);
bool rcu_inkernel_boot_has_ended(void);
bool rcu_is_watching(void);
-#ifndef CONFIG_PREEMPTION
+#ifndef CONFIG_PREEMPT_RCU
void rcu_all_qs(void);
#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9c15365a30c0..6e5c38718ff5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -65,6 +65,7 @@ struct mempolicy;
struct nameidata;
struct nsproxy;
struct perf_event_context;
+struct perf_ctx_data;
struct pid_namespace;
struct pipe_inode_info;
struct rcu_node;
@@ -382,6 +383,11 @@ enum uclamp_id {
#ifdef CONFIG_SMP
extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex;
+extern void sched_domains_mutex_lock(void);
+extern void sched_domains_mutex_unlock(void);
+#else
+static inline void sched_domains_mutex_lock(void) { }
+static inline void sched_domains_mutex_unlock(void) { }
#endif
struct sched_param {
@@ -1311,6 +1317,7 @@ struct task_struct {
struct perf_event_context *perf_event_ctxp;
struct mutex perf_event_mutex;
struct list_head perf_event_list;
+ struct perf_ctx_data __rcu *perf_ctx_data;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
unsigned long preempt_disable_ip;
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 3a912ab42bb5..f9aabbc9d22e 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -34,7 +34,11 @@ static inline bool dl_time_before(u64 a, u64 b)
struct root_domain;
extern void dl_add_task_root_domain(struct task_struct *p);
extern void dl_clear_root_domain(struct root_domain *rd);
+extern void dl_clear_root_domain_cpu(int cpu);
#endif /* CONFIG_SMP */
+extern u64 dl_cookie;
+extern bool dl_bw_visited(int cpu, u64 cookie);
+
#endif /* _LINUX_SCHED_DEADLINE_H */
diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h
index b5035afa2396..35ed4577a6cc 100644
--- a/include/linux/sched/debug.h
+++ b/include/linux/sched/debug.h
@@ -35,12 +35,10 @@ extern void show_stack(struct task_struct *task, unsigned long *sp,
extern void sched_show_task(struct task_struct *p);
-#ifdef CONFIG_SCHED_DEBUG
struct seq_file;
extern void proc_sched_show_task(struct task_struct *p,
struct pid_namespace *ns, struct seq_file *m);
extern void proc_sched_set_task(struct task_struct *p);
-#endif
/* Attach to any functions which should be ignored in wchan output. */
#define __sched __section(".sched.text")
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 1d70a9867fb1..f7545430a548 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -146,6 +146,7 @@ struct sched_ext_entity {
u32 weight;
s32 sticky_cpu;
s32 holding_cpu;
+ s32 selected_cpu;
u32 kf_mask; /* see scx_kf_mask above */
struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */
atomic_long_t ops_state;
diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h
index e670ac282333..439f6029d3b9 100644
--- a/include/linux/sched/idle.h
+++ b/include/linux/sched/idle.h
@@ -79,6 +79,21 @@ static __always_inline bool __must_check current_clr_polling_and_test(void)
return unlikely(tif_need_resched());
}
+static __always_inline void current_clr_polling(void)
+{
+ __current_clr_polling();
+
+ /*
+ * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
+ * Once the bit is cleared, we'll get IPIs with every new
+ * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
+ * fold.
+ */
+ smp_mb__after_atomic(); /* paired with resched_curr() */
+
+ preempt_fold_need_resched();
+}
+
#else
static inline void __current_set_polling(void) { }
static inline void __current_clr_polling(void) { }
@@ -91,21 +106,15 @@ static inline bool __must_check current_clr_polling_and_test(void)
{
return unlikely(tif_need_resched());
}
-#endif
static __always_inline void current_clr_polling(void)
{
__current_clr_polling();
- /*
- * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
- * Once the bit is cleared, we'll get IPIs with every new
- * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
- * fold.
- */
smp_mb(); /* paired with resched_curr() */
preempt_fold_need_resched();
}
+#endif
#endif /* _LINUX_SCHED_IDLE_H */
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 928a626725e6..b13474825130 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -531,6 +531,13 @@ enum {
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
+ /*
+ * The atomic_read() below prevents CSE. The following should
+ * help the compiler generate more efficient code on architectures
+ * where sync_core_before_usermode() is a no-op.
+ */
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE))
+ return;
if (current->mm != mm)
return;
if (likely(!(atomic_read(&mm->membarrier_state) &
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7f3dbafe1817..7b4301b7235f 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -25,16 +25,12 @@ enum {
};
#undef SD_FLAG
-#ifdef CONFIG_SCHED_DEBUG
-
struct sd_flag_debug {
unsigned int meta_flags;
char *name;
};
extern const struct sd_flag_debug sd_flag_debug[];
-#endif
-
#ifdef CONFIG_SCHED_SMT
static inline int cpu_smt_flags(void)
{
@@ -166,10 +162,6 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
return to_cpumask(sd->span);
}
-extern void partition_sched_domains_locked(int ndoms_new,
- cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new);
-
extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
struct sched_domain_attr *dattr_new);
@@ -211,12 +203,6 @@ extern void __init set_sched_topology(struct sched_domain_topology_level *tl);
struct sched_domain_attr;
static inline void
-partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new)
-{
-}
-
-static inline void
partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
struct sched_domain_attr *dattr_new)
{
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index e45531455d3b..9b959972bf4a 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -22,21 +22,17 @@
#include <linux/atomic.h>
#include <asm/seccomp.h>
+extern int __secure_computing(void);
+
#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
-extern int __secure_computing(const struct seccomp_data *sd);
static inline int secure_computing(void)
{
if (unlikely(test_syscall_work(SECCOMP)))
- return __secure_computing(NULL);
+ return __secure_computing();
return 0;
}
#else
extern void secure_computing_strict(int this_syscall);
-static inline int __secure_computing(const struct seccomp_data *sd)
-{
- secure_computing_strict(sd->nr);
- return 0;
-}
#endif
extern long prctl_get_seccomp(void);
@@ -58,7 +54,7 @@ static inline int secure_computing(void) { return 0; }
#else
static inline void secure_computing_strict(int this_syscall) { return; }
#endif
-static inline int __secure_computing(const struct seccomp_data *sd) { return 0; }
+static inline int __secure_computing(void) { return 0; }
static inline long prctl_get_seccomp(void)
{
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 09eedaecf120..98e07e9e9e58 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -16,6 +16,7 @@
#include <linux/gfp.h>
#include <linux/overflow.h>
#include <linux/types.h>
+#include <linux/rcupdate.h>
#include <linux/workqueue.h>
#include <linux/percpu-refcount.h>
#include <linux/cleanup.h>
@@ -941,8 +942,6 @@ static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t siz
if (unlikely(check_mul_overflow(n, size, &bytes)))
return NULL;
- if (__builtin_constant_p(n) && __builtin_constant_p(size))
- return kmalloc_noprof(bytes, flags);
return kmalloc_noprof(bytes, flags);
}
#define kmalloc_array(...) alloc_hooks(kmalloc_array_noprof(__VA_ARGS__))
@@ -1082,6 +1081,19 @@ extern void kvfree_sensitive(const void *addr, size_t len);
unsigned int kmem_cache_size(struct kmem_cache *s);
+#ifndef CONFIG_KVFREE_RCU_BATCHED
+static inline void kvfree_rcu_barrier(void)
+{
+ rcu_barrier();
+}
+
+static inline void kfree_rcu_scheduler_running(void) { }
+#else
+void kvfree_rcu_barrier(void);
+
+void kfree_rcu_scheduler_running(void);
+#endif
+
/**
* kmalloc_size_roundup - Report allocation bucket size for the given size
*
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index d7ba46e74f58..900b0d5c05f5 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -47,7 +47,13 @@ int init_srcu_struct(struct srcu_struct *ssp);
#define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock().
#define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe().
#define SRCU_READ_FLAVOR_LITE 0x4 // srcu_read_lock_lite().
-#define SRCU_READ_FLAVOR_ALL 0x7 // All of the above.
+#define SRCU_READ_FLAVOR_FAST 0x8 // srcu_read_lock_fast().
+#define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \
+ SRCU_READ_FLAVOR_LITE | SRCU_READ_FLAVOR_FAST) // All of the above.
+#define SRCU_READ_FLAVOR_SLOWGP (SRCU_READ_FLAVOR_LITE | SRCU_READ_FLAVOR_FAST)
+ // Flavors requiring synchronize_rcu()
+ // instead of smp_mb().
+void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp);
#ifdef CONFIG_TINY_SRCU
#include <linux/srcutiny.h>
@@ -60,15 +66,6 @@ int init_srcu_struct(struct srcu_struct *ssp);
void call_srcu(struct srcu_struct *ssp, struct rcu_head *head,
void (*func)(struct rcu_head *head));
void cleanup_srcu_struct(struct srcu_struct *ssp);
-int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp);
-void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp);
-#ifdef CONFIG_TINY_SRCU
-#define __srcu_read_lock_lite __srcu_read_lock
-#define __srcu_read_unlock_lite __srcu_read_unlock
-#else // #ifdef CONFIG_TINY_SRCU
-int __srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp);
-void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx) __releases(ssp);
-#endif // #else // #ifdef CONFIG_TINY_SRCU
void synchronize_srcu(struct srcu_struct *ssp);
#define SRCU_GET_STATE_COMPLETED 0x1
@@ -258,6 +255,51 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp)
}
/**
+ * srcu_read_lock_fast - register a new reader for an SRCU-protected structure.
+ * @ssp: srcu_struct in which to register the new reader.
+ *
+ * Enter an SRCU read-side critical section, but for a light-weight
+ * smp_mb()-free reader. See srcu_read_lock() for more information.
+ *
+ * If srcu_read_lock_fast() is ever used on an srcu_struct structure,
+ * then none of the other flavors may be used, whether before, during,
+ * or after. Note that grace-period auto-expediting is disabled for _fast
+ * srcu_struct structures because auto-expedited grace periods invoke
+ * synchronize_rcu_expedited(), IPIs and all.
+ *
+ * Note that srcu_read_lock_fast() can be invoked only from those contexts
+ * where RCU is watching, that is, from contexts where it would be legal
+ * to invoke rcu_read_lock(). Otherwise, lockdep will complain.
+ */
+static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct *ssp) __acquires(ssp)
+{
+ struct srcu_ctr __percpu *retval;
+
+ srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST);
+ retval = __srcu_read_lock_fast(ssp);
+ rcu_try_lock_acquire(&ssp->dep_map);
+ return retval;
+}
+
+/**
+ * srcu_down_read_fast - register a new reader for an SRCU-protected structure.
+ * @ssp: srcu_struct in which to register the new reader.
+ *
+ * Enter a semaphore-like SRCU read-side critical section, but for
+ * a light-weight smp_mb()-free reader. See srcu_read_lock_fast() and
+ * srcu_down_read() for more information.
+ *
+ * The same srcu_struct may be used concurrently by srcu_down_read_fast()
+ * and srcu_read_lock_fast().
+ */
+static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct *ssp) __acquires(ssp)
+{
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi());
+ srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST);
+ return __srcu_read_lock_fast(ssp);
+}
+
+/**
* srcu_read_lock_lite - register a new reader for an SRCU-protected structure.
* @ssp: srcu_struct in which to register the new reader.
*
@@ -278,7 +320,7 @@ static inline int srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp)
{
int retval;
- srcu_check_read_flavor_lite(ssp);
+ srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_LITE);
retval = __srcu_read_lock_lite(ssp);
rcu_try_lock_acquire(&ssp->dep_map);
return retval;
@@ -335,7 +377,8 @@ srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp)
* srcu_down_read() nor srcu_up_read() may be invoked from an NMI handler.
*
* Calls to srcu_down_read() may be nested, similar to the manner in
- * which calls to down_read() may be nested.
+ * which calls to down_read() may be nested. The same srcu_struct may be
+ * used concurrently by srcu_down_read() and srcu_read_lock().
*/
static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp)
{
@@ -361,9 +404,40 @@ static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx)
}
/**
+ * srcu_read_unlock_fast - unregister an old reader from an SRCU-protected structure.
+ * @ssp: srcu_struct in which to unregister the old reader.
+ * @scp: return value from corresponding srcu_read_lock_fast().
+ *
+ * Exit a light-weight SRCU read-side critical section.
+ */
+static inline void srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
+ __releases(ssp)
+{
+ srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST);
+ srcu_lock_release(&ssp->dep_map);
+ __srcu_read_unlock_fast(ssp, scp);
+}
+
+/**
+ * srcu_up_read_fast - unregister an old reader from an SRCU-protected structure.
+ * @ssp: srcu_struct in which to unregister the old reader.
+ * @scp: return value from corresponding srcu_read_lock_fast().
+ *
+ * Exit an SRCU read-side critical section, but not necessarily from
+ * the same context as the matching srcu_down_read_fast().
+ */
+static inline void srcu_up_read_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
+ __releases(ssp)
+{
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi());
+ srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST);
+ __srcu_read_unlock_fast(ssp, scp);
+}
+
+/**
* srcu_read_unlock_lite - unregister an old reader from an SRCU-protected structure.
* @ssp: srcu_struct in which to unregister the old reader.
- * @idx: return value from corresponding srcu_read_lock().
+ * @idx: return value from corresponding srcu_read_lock_lite().
*
* Exit a light-weight SRCU read-side critical section.
*/
@@ -379,7 +453,7 @@ static inline void srcu_read_unlock_lite(struct srcu_struct *ssp, int idx)
/**
* srcu_read_unlock_nmisafe - unregister an old reader from an SRCU-protected structure.
* @ssp: srcu_struct in which to unregister the old reader.
- * @idx: return value from corresponding srcu_read_lock().
+ * @idx: return value from corresponding srcu_read_lock_nmisafe().
*
* Exit an SRCU read-side critical section, but in an NMI-safe manner.
*/
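Unlike srcu_read_lock(), which returns an int index, the _fast flavor hands back a per-CPU srcu_ctr pointer that must be passed to the matching unlock; and once an srcu_struct has been used with _fast, no other reader flavor may be mixed in. A minimal reader (sketch):

DEFINE_SRCU(demo_srcu);

static void demo_reader(void)
{
	struct srcu_ctr __percpu *scp;

	scp = srcu_read_lock_fast(&demo_srcu);
	/* read-side critical section; RCU must be watching here */
	srcu_read_unlock_fast(&demo_srcu, scp);
}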
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 1321da803274..380260317d98 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -64,13 +64,38 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp)
{
int idx;
- preempt_disable(); // Needed for PREEMPT_AUTO
+ preempt_disable(); // Needed for PREEMPT_LAZY
idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1;
WRITE_ONCE(ssp->srcu_lock_nesting[idx], READ_ONCE(ssp->srcu_lock_nesting[idx]) + 1);
preempt_enable();
return idx;
}
+struct srcu_ctr;
+
+static inline bool __srcu_ptr_to_ctr(struct srcu_struct *ssp, struct srcu_ctr __percpu *scpp)
+{
+ return (int)(intptr_t)(struct srcu_ctr __force __kernel *)scpp;
+}
+
+static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ssp, int idx)
+{
+ return (struct srcu_ctr __percpu *)(intptr_t)idx;
+}
+
+static inline struct srcu_ctr __percpu *__srcu_read_lock_fast(struct srcu_struct *ssp)
+{
+ return __srcu_ctr_to_ptr(ssp, __srcu_read_lock(ssp));
+}
+
+static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
+{
+ __srcu_read_unlock(ssp, __srcu_ptr_to_ctr(ssp, scp));
+}
+
+#define __srcu_read_lock_lite __srcu_read_lock
+#define __srcu_read_unlock_lite __srcu_read_unlock
+
static inline void synchronize_srcu_expedited(struct srcu_struct *ssp)
{
synchronize_srcu(ssp);
@@ -82,7 +107,7 @@ static inline void srcu_barrier(struct srcu_struct *ssp)
}
#define srcu_check_read_flavor(ssp, read_flavor) do { } while (0)
-#define srcu_check_read_flavor_lite(ssp) do { } while (0)
+#define srcu_check_read_flavor_force(ssp, read_flavor) do { } while (0)
/* Defined here to avoid size increase for non-torture kernels. */
static inline void srcu_torture_stats_print(struct srcu_struct *ssp,
diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index b17814c9d1c7..8bed7e6cc4c1 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -17,14 +17,19 @@
struct srcu_node;
struct srcu_struct;
+/* One element of the srcu_data srcu_ctrs array. */
+struct srcu_ctr {
+ atomic_long_t srcu_locks; /* Locks per CPU. */
+ atomic_long_t srcu_unlocks; /* Unlocks per CPU. */
+};
+
/*
* Per-CPU structure feeding into leaf srcu_node, similar in function
* to rcu_node.
*/
struct srcu_data {
/* Read-side state. */
- atomic_long_t srcu_lock_count[2]; /* Locks per CPU. */
- atomic_long_t srcu_unlock_count[2]; /* Unlocks per CPU. */
+ struct srcu_ctr srcu_ctrs[2]; /* Locks and unlocks per CPU. */
int srcu_reader_flavor; /* Reader flavor for srcu_struct structure? */
/* Values: SRCU_READ_FLAVOR_.* */
@@ -95,7 +100,7 @@ struct srcu_usage {
* Per-SRCU-domain structure, similar in function to rcu_state.
*/
struct srcu_struct {
- unsigned int srcu_idx; /* Current rdr array element. */
+ struct srcu_ctr __percpu *srcu_ctrp;
struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */
struct lockdep_map dep_map;
struct srcu_usage *srcu_sup; /* Update-side data. */
@@ -162,6 +167,7 @@ struct srcu_struct {
#define __SRCU_STRUCT_INIT(name, usage_name, pcpu_name) \
{ \
.sda = &pcpu_name, \
+ .srcu_ctrp = &pcpu_name.srcu_ctrs[0], \
__SRCU_STRUCT_INIT_COMMON(name, usage_name) \
}
@@ -201,10 +207,77 @@ struct srcu_struct {
#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */)
#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static)
+int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp);
void synchronize_srcu_expedited(struct srcu_struct *ssp);
void srcu_barrier(struct srcu_struct *ssp);
void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf);
+// Converts a per-CPU pointer to an ->srcu_ctrs[] array element to that
+// element's index.
+static inline bool __srcu_ptr_to_ctr(struct srcu_struct *ssp, struct srcu_ctr __percpu *scpp)
+{
+ return scpp - &ssp->sda->srcu_ctrs[0];
+}
+
+// Converts an integer to a per-CPU pointer to the corresponding
+// ->srcu_ctrs[] array element.
+static inline struct srcu_ctr __percpu *__srcu_ctr_to_ptr(struct srcu_struct *ssp, int idx)
+{
+ return &ssp->sda->srcu_ctrs[idx];
+}
+
+/*
+ * Counts the new reader in the appropriate per-CPU element of the
+ * srcu_struct. Returns a pointer that must be passed to the matching
+ * srcu_read_unlock_fast().
+ *
+ * Note that both this_cpu_inc() and atomic_long_inc() are RCU read-side
+ * critical sections either because they disable interrupts, because they
+ * are a single instruction, or because they are a read-modify-write atomic
+ * operation, depending on the whims of the architecture.
+ *
+ * This means that __srcu_read_lock_fast() is not all that fast
+ * on architectures that support NMIs but do not supply NMI-safe
+ * implementations of this_cpu_inc().
+ */
+static inline struct srcu_ctr __percpu *__srcu_read_lock_fast(struct srcu_struct *ssp)
+{
+ struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp);
+
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast().");
+ if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE))
+ this_cpu_inc(scp->srcu_locks.counter); /* Y */
+ else
+ atomic_long_inc(raw_cpu_ptr(&scp->srcu_locks)); /* Z */
+ barrier(); /* Avoid leaking the critical section. */
+ return scp;
+}
+
+/*
+ * Removes the count for the old reader from the appropriate
+ * per-CPU element of the srcu_struct. Note that this may well be a
+ * different CPU than that which was incremented by the corresponding
+ * srcu_read_lock_fast(), but it must be within the same task.
+ *
+ * Note that both this_cpu_inc() and atomic_long_inc() are RCU read-side
+ * critical sections either because they disable interrupts, because they
+ * are a single instruction, or because they are a read-modify-write atomic
+ * operation, depending on the whims of the architecture.
+ *
+ * This means that __srcu_read_unlock_fast() is not all that fast
+ * on architectures that support NMIs but do not supply NMI-safe
+ * implementations of this_cpu_inc().
+ */
+static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
+{
+ barrier(); /* Avoid leaking the critical section. */
+ if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE))
+ this_cpu_inc(scp->srcu_unlocks.counter); /* Z */
+ else
+ atomic_long_inc(raw_cpu_ptr(&scp->srcu_unlocks)); /* Z */
+ RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast().");
+}
+
/*
* Counts the new reader in the appropriate per-CPU element of the
* srcu_struct. Returns an index that must be passed to the matching
@@ -217,13 +290,12 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf);
*/
static inline int __srcu_read_lock_lite(struct srcu_struct *ssp)
{
- int idx;
+ struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp);
RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_lite().");
- idx = READ_ONCE(ssp->srcu_idx) & 0x1;
- this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter); /* Y */
+ this_cpu_inc(scp->srcu_locks.counter); /* Y */
barrier(); /* Avoid leaking the critical section. */
- return idx;
+ return __srcu_ptr_to_ctr(ssp, scp);
}
/*
@@ -240,22 +312,24 @@ static inline int __srcu_read_lock_lite(struct srcu_struct *ssp)
static inline void __srcu_read_unlock_lite(struct srcu_struct *ssp, int idx)
{
barrier(); /* Avoid leaking the critical section. */
- this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter); /* Z */
+ this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter); /* Z */
RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_lite().");
}
void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor);
-// Record _lite() usage even for CONFIG_PROVE_RCU=n kernels.
-static inline void srcu_check_read_flavor_lite(struct srcu_struct *ssp)
+// Record reader usage even for CONFIG_PROVE_RCU=n kernels. This is
+// needed only for flavors that require grace-period smp_mb() calls to be
+// promoted to synchronize_rcu().
+static inline void srcu_check_read_flavor_force(struct srcu_struct *ssp, int read_flavor)
{
struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
- if (likely(READ_ONCE(sdp->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE))
+ if (likely(READ_ONCE(sdp->srcu_reader_flavor) & read_flavor))
return;
// Note that the cmpxchg() in __srcu_check_read_flavor() is fully ordered.
- __srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE);
+ __srcu_check_read_flavor(ssp, read_flavor);
}
// Record non-_lite() usage only for CONFIG_PROVE_RCU=y kernels.
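The two conversion helpers let the _lite entry points keep their int-based interface on top of the new srcu_ctrs[] array. The round-trip they preserve, as a sketch:

static void srcu_ctr_roundtrip(struct srcu_struct *ssp)
{
	struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp);
	int idx = __srcu_ptr_to_ctr(ssp, scp);	/* 0 or 1 */

	WARN_ON_ONCE(__srcu_ctr_to_ptr(ssp, idx) != scp);
}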
diff --git a/include/linux/string.h b/include/linux/string.h
index f8e21e80942f..0403a4ca4c11 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -415,8 +415,10 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count,
*/
#define strtomem_pad(dest, src, pad) do { \
const size_t _dest_len = __must_be_byte_array(dest) + \
+ __must_be_noncstr(dest) + \
ARRAY_SIZE(dest); \
- const size_t _src_len = __builtin_object_size(src, 1); \
+ const size_t _src_len = __must_be_cstr(src) + \
+ __builtin_object_size(src, 1); \
\
BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \
_dest_len == (size_t)-1); \
@@ -439,8 +441,10 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count,
*/
#define strtomem(dest, src) do { \
const size_t _dest_len = __must_be_byte_array(dest) + \
+ __must_be_noncstr(dest) + \
ARRAY_SIZE(dest); \
- const size_t _src_len = __builtin_object_size(src, 1); \
+ const size_t _src_len = __must_be_cstr(src) + \
+ __builtin_object_size(src, 1); \
\
BUILD_BUG_ON(!__builtin_constant_p(_dest_len) || \
_dest_len == (size_t)-1); \
@@ -459,8 +463,10 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count,
*/
#define memtostr(dest, src) do { \
const size_t _dest_len = __must_be_byte_array(dest) + \
+ __must_be_cstr(dest) + \
ARRAY_SIZE(dest); \
- const size_t _src_len = __builtin_object_size(src, 1); \
+ const size_t _src_len = __must_be_noncstr(src) + \
+ __builtin_object_size(src, 1); \
const size_t _src_chars = strnlen(src, _src_len); \
const size_t _copy_len = min(_dest_len - 1, _src_chars); \
\
@@ -485,8 +491,10 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count,
*/
#define memtostr_pad(dest, src) do { \
const size_t _dest_len = __must_be_byte_array(dest) + \
+ __must_be_cstr(dest) + \
ARRAY_SIZE(dest); \
- const size_t _src_len = __builtin_object_size(src, 1); \
+ const size_t _src_len = __must_be_noncstr(src) + \
+ __builtin_object_size(src, 1); \
const size_t _src_chars = strnlen(src, _src_len); \
const size_t _copy_len = min(_dest_len - 1, _src_chars); \
\
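The added __must_be_cstr()/__must_be_noncstr() terms are zero-valued compile-time checks: strtomem()/strtomem_pad() now insist that @src be an ordinary NUL-terminated string and @dest a byte array annotated __nonstring, while memtostr()/memtostr_pad() check the reverse. A usage sketch (struct and field names are illustrative):

struct wire_hdr {
	char tag[8] __nonstring;	/* fixed-width field, no trailing NUL */
};

static void set_tag(struct wire_hdr *h, const char *name)
{
	strtomem_pad(h->tag, name, 0);	/* copies at most 8 bytes, zero-pads */
}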
diff --git a/include/linux/string_choices.h b/include/linux/string_choices.h
index 120ca0f28e95..f3ba4f52ff26 100644
--- a/include/linux/string_choices.h
+++ b/include/linux/string_choices.h
@@ -41,23 +41,23 @@ static inline const char *str_high_low(bool v)
}
#define str_low_high(v) str_high_low(!(v))
-static inline const char *str_read_write(bool v)
-{
- return v ? "read" : "write";
-}
-#define str_write_read(v) str_read_write(!(v))
-
static inline const char *str_on_off(bool v)
{
return v ? "on" : "off";
}
#define str_off_on(v) str_on_off(!(v))
-static inline const char *str_yes_no(bool v)
+static inline const char *str_read_write(bool v)
{
- return v ? "yes" : "no";
+ return v ? "read" : "write";
}
-#define str_no_yes(v) str_yes_no(!(v))
+#define str_write_read(v) str_read_write(!(v))
+
+static inline const char *str_true_false(bool v)
+{
+ return v ? "true" : "false";
+}
+#define str_false_true(v) str_true_false(!(v))
static inline const char *str_up_down(bool v)
{
@@ -65,11 +65,11 @@ static inline const char *str_up_down(bool v)
}
#define str_down_up(v) str_up_down(!(v))
-static inline const char *str_true_false(bool v)
+static inline const char *str_yes_no(bool v)
{
- return v ? "true" : "false";
+ return v ? "yes" : "no";
}
-#define str_false_true(v) str_true_false(!(v))
+#define str_no_yes(v) str_yes_no(!(v))
/**
* str_plural - Return the simple pluralization based on English counts
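The reordering above appears to sort the helpers alphabetically; the pairing convention is unchanged, with each str_a_b() accompanied by an inverted str_b_a() macro. Typical use (the flag is illustrative):

bool ok = READ_ONCE(dev_ready);	/* hypothetical status flag */

pr_info("device ready: %s\n", str_yes_no(ok));
pr_debug("as a boolean literal: %s\n", str_true_false(ok));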
diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h
index b5ec038069da..91cdf12190a0 100644
--- a/include/linux/swap_cgroup.h
+++ b/include/linux/swap_cgroup.h
@@ -6,7 +6,7 @@
#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
-extern void swap_cgroup_record(struct folio *folio, swp_entry_t ent);
+extern void swap_cgroup_record(struct folio *folio, unsigned short id, swp_entry_t ent);
extern unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents);
extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
extern int swap_cgroup_swapon(int type, unsigned long max_pages);
@@ -15,7 +15,7 @@ extern void swap_cgroup_swapoff(int type);
#else
static inline
-void swap_cgroup_record(struct folio *folio, swp_entry_t ent)
+void swap_cgroup_record(struct folio *folio, unsigned short id, swp_entry_t ent)
{
}
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c6333204d451..e5603cc91963 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -951,6 +951,10 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
int flags, uint32_t sig);
asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags);
+asmlinkage long sys_open_tree_attr(int dfd, const char __user *path,
+ unsigned flags,
+ struct mount_attr __user *uattr,
+ size_t usize);
asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path,
int to_dfd, const char __user *to_path,
unsigned int ms_flags);
@@ -1266,14 +1270,14 @@ static inline long ksys_lchown(const char __user *filename, uid_t user,
AT_SYMLINK_NOFOLLOW);
}
-extern long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
+int do_sys_ftruncate(unsigned int fd, loff_t length, int small);
static inline long ksys_ftruncate(unsigned int fd, loff_t length)
{
return do_sys_ftruncate(fd, length, 1);
}
-extern long do_sys_truncate(const char __user *pathname, loff_t length);
+int do_sys_truncate(const char __user *pathname, loff_t length);
static inline long ksys_truncate(const char __user *pathname, loff_t length)
{
diff --git a/include/linux/sysv_fs.h b/include/linux/sysv_fs.h
deleted file mode 100644
index 5cf77dbb8d86..000000000000
--- a/include/linux/sysv_fs.h
+++ /dev/null
@@ -1,214 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_SYSV_FS_H
-#define _LINUX_SYSV_FS_H
-
-#define __packed2__ __attribute__((packed, aligned(2)))
-
-
-#ifndef __KERNEL__
-typedef u16 __fs16;
-typedef u32 __fs16;
-#endif
-
-/* inode numbers are 16 bit */
-typedef __fs16 sysv_ino_t;
-
-/* Block numbers are 24 bit, sometimes stored in 32 bit.
- On Coherent FS, they are always stored in PDP-11 manner: the least
- significant 16 bits come last. */
-typedef __fs32 sysv_zone_t;
-
-/* 0 is non-existent */
-#define SYSV_BADBL_INO 1 /* inode of bad blocks file */
-#define SYSV_ROOT_INO 2 /* inode of root directory */
-
-
-/* Xenix super-block data on disk */
-#define XENIX_NICINOD 100 /* number of inode cache entries */
-#define XENIX_NICFREE 100 /* number of free block list chunk entries */
-struct xenix_super_block {
- __fs16 s_isize; /* index of first data zone */
- __fs32 s_fsize __packed2__; /* total number of zones of this fs */
- /* the start of the free block list: */
- __fs16 s_nfree; /* number of free blocks in s_free, <= XENIX_NICFREE */
- sysv_zone_t s_free[XENIX_NICFREE]; /* first free block list chunk */
- /* the cache of free inodes: */
- __fs16 s_ninode; /* number of free inodes in s_inode, <= XENIX_NICINOD */
- sysv_ino_t s_inode[XENIX_NICINOD]; /* some free inodes */
- /* locks, not used by Linux: */
- char s_flock; /* lock during free block list manipulation */
- char s_ilock; /* lock during inode cache manipulation */
- char s_fmod; /* super-block modified flag */
- char s_ronly; /* flag whether fs is mounted read-only */
- __fs32 s_time __packed2__; /* time of last super block update */
- __fs32 s_tfree __packed2__; /* total number of free zones */
- __fs16 s_tinode; /* total number of free inodes */
- __fs16 s_dinfo[4]; /* device information ?? */
- char s_fname[6]; /* file system volume name */
- char s_fpack[6]; /* file system pack name */
- char s_clean; /* set to 0x46 when filesystem is properly unmounted */
- char s_fill[371];
- s32 s_magic; /* version of file system */
- __fs32 s_type; /* type of file system: 1 for 512 byte blocks
- 2 for 1024 byte blocks
- 3 for 2048 byte blocks */
-
-};
-
-/*
- * SystemV FS comes in two variants:
- * sysv2: System V Release 2 (e.g. Microport), structure elements aligned(2).
- * sysv4: System V Release 4 (e.g. Consensys), structure elements aligned(4).
- */
-#define SYSV_NICINOD 100 /* number of inode cache entries */
-#define SYSV_NICFREE 50 /* number of free block list chunk entries */
-
-/* SystemV4 super-block data on disk */
-struct sysv4_super_block {
- __fs16 s_isize; /* index of first data zone */
- u16 s_pad0;
- __fs32 s_fsize; /* total number of zones of this fs */
- /* the start of the free block list: */
- __fs16 s_nfree; /* number of free blocks in s_free, <= SYSV_NICFREE */
- u16 s_pad1;
- sysv_zone_t s_free[SYSV_NICFREE]; /* first free block list chunk */
- /* the cache of free inodes: */
- __fs16 s_ninode; /* number of free inodes in s_inode, <= SYSV_NICINOD */
- u16 s_pad2;
- sysv_ino_t s_inode[SYSV_NICINOD]; /* some free inodes */
- /* locks, not used by Linux: */
- char s_flock; /* lock during free block list manipulation */
- char s_ilock; /* lock during inode cache manipulation */
- char s_fmod; /* super-block modified flag */
- char s_ronly; /* flag whether fs is mounted read-only */
- __fs32 s_time; /* time of last super block update */
- __fs16 s_dinfo[4]; /* device information ?? */
- __fs32 s_tfree; /* total number of free zones */
- __fs16 s_tinode; /* total number of free inodes */
- u16 s_pad3;
- char s_fname[6]; /* file system volume name */
- char s_fpack[6]; /* file system pack name */
- s32 s_fill[12];
- __fs32 s_state; /* file system state: 0x7c269d38-s_time means clean */
- s32 s_magic; /* version of file system */
- __fs32 s_type; /* type of file system: 1 for 512 byte blocks
- 2 for 1024 byte blocks */
-};
-
-/* SystemV2 super-block data on disk */
-struct sysv2_super_block {
- __fs16 s_isize; /* index of first data zone */
- __fs32 s_fsize __packed2__; /* total number of zones of this fs */
- /* the start of the free block list: */
- __fs16 s_nfree; /* number of free blocks in s_free, <= SYSV_NICFREE */
- sysv_zone_t s_free[SYSV_NICFREE]; /* first free block list chunk */
- /* the cache of free inodes: */
- __fs16 s_ninode; /* number of free inodes in s_inode, <= SYSV_NICINOD */
- sysv_ino_t s_inode[SYSV_NICINOD]; /* some free inodes */
- /* locks, not used by Linux: */
- char s_flock; /* lock during free block list manipulation */
- char s_ilock; /* lock during inode cache manipulation */
- char s_fmod; /* super-block modified flag */
- char s_ronly; /* flag whether fs is mounted read-only */
- __fs32 s_time __packed2__; /* time of last super block update */
- __fs16 s_dinfo[4]; /* device information ?? */
- __fs32 s_tfree __packed2__; /* total number of free zones */
- __fs16 s_tinode; /* total number of free inodes */
- char s_fname[6]; /* file system volume name */
- char s_fpack[6]; /* file system pack name */
- s32 s_fill[14];
- __fs32 s_state; /* file system state: 0xcb096f43 means clean */
- s32 s_magic; /* version of file system */
- __fs32 s_type; /* type of file system: 1 for 512 byte blocks
- 2 for 1024 byte blocks */
-};
-
-/* V7 super-block data on disk */
-#define V7_NICINOD 100 /* number of inode cache entries */
-#define V7_NICFREE 50 /* number of free block list chunk entries */
-struct v7_super_block {
- __fs16 s_isize; /* index of first data zone */
- __fs32 s_fsize __packed2__; /* total number of zones of this fs */
- /* the start of the free block list: */
- __fs16 s_nfree; /* number of free blocks in s_free, <= V7_NICFREE */
- sysv_zone_t s_free[V7_NICFREE]; /* first free block list chunk */
- /* the cache of free inodes: */
- __fs16 s_ninode; /* number of free inodes in s_inode, <= V7_NICINOD */
- sysv_ino_t s_inode[V7_NICINOD]; /* some free inodes */
- /* locks, not used by Linux or V7: */
- char s_flock; /* lock during free block list manipulation */
- char s_ilock; /* lock during inode cache manipulation */
- char s_fmod; /* super-block modified flag */
- char s_ronly; /* flag whether fs is mounted read-only */
- __fs32 s_time __packed2__; /* time of last super block update */
- /* the following fields are not maintained by V7: */
- __fs32 s_tfree __packed2__; /* total number of free zones */
- __fs16 s_tinode; /* total number of free inodes */
- __fs16 s_m; /* interleave factor */
- __fs16 s_n; /* interleave factor */
- char s_fname[6]; /* file system name */
- char s_fpack[6]; /* file system pack name */
-};
-/* Constants to aid sanity checking */
-/* This is not a hard limit, nor enforced by the v7 kernel. It's actually just
- * the limit used by Seventh Edition's ls, though it is high enough to assume
- * that no reasonable file system would have that many entries in its root
- * directory. Thus, if we see anything higher, we probably just got the
- * endianness wrong. */
-#define V7_NFILES 1024
-/* The disk addresses are three-byte (despite direct block addresses being
- * aligned word-wise in inode). If the most significant byte is non-zero,
- * something is most likely wrong (not a filesystem, bad bytesex). */
-#define V7_MAXSIZE 0x00ffffff
-
-/* Coherent super-block data on disk */
-#define COH_NICINOD 100 /* number of inode cache entries */
-#define COH_NICFREE 64 /* number of free block list chunk entries */
-struct coh_super_block {
- __fs16 s_isize; /* index of first data zone */
- __fs32 s_fsize __packed2__; /* total number of zones of this fs */
- /* the start of the free block list: */
- __fs16 s_nfree; /* number of free blocks in s_free, <= COH_NICFREE */
- sysv_zone_t s_free[COH_NICFREE] __packed2__; /* first free block list chunk */
- /* the cache of free inodes: */
- __fs16 s_ninode; /* number of free inodes in s_inode, <= COH_NICINOD */
- sysv_ino_t s_inode[COH_NICINOD]; /* some free inodes */
- /* locks, not used by Linux: */
- char s_flock; /* lock during free block list manipulation */
- char s_ilock; /* lock during inode cache manipulation */
- char s_fmod; /* super-block modified flag */
- char s_ronly; /* flag whether fs is mounted read-only */
- __fs32 s_time __packed2__; /* time of last super block update */
- __fs32 s_tfree __packed2__; /* total number of free zones */
- __fs16 s_tinode; /* total number of free inodes */
- __fs16 s_interleave_m; /* interleave factor */
- __fs16 s_interleave_n;
- char s_fname[6]; /* file system volume name */
- char s_fpack[6]; /* file system pack name */
- __fs32 s_unique; /* zero, not used */
-};
-
-/* SystemV/Coherent inode data on disk */
-struct sysv_inode {
- __fs16 i_mode;
- __fs16 i_nlink;
- __fs16 i_uid;
- __fs16 i_gid;
- __fs32 i_size;
- u8 i_data[3*(10+1+1+1)];
- u8 i_gen;
- __fs32 i_atime; /* time of last access */
- __fs32 i_mtime; /* time of last modification */
- __fs32 i_ctime; /* time of creation */
-};
-
-/* SystemV/Coherent directory entry on disk */
-#define SYSV_NAMELEN 14 /* max size of name in struct sysv_dir_entry */
-struct sysv_dir_entry {
- sysv_ino_t inode;
- char name[SYSV_NAMELEN]; /* up to 14 characters, the rest are zeroes */
-};
-
-#define SYSV_DIRSIZE sizeof(struct sysv_dir_entry) /* size of every directory entry */
-
-#endif /* _LINUX_SYSV_FS_H */
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index cf2446c9c30d..dd925d84fa46 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -217,54 +217,6 @@ static inline int arch_within_stack_frames(const void * const stack,
}
#endif
-#ifdef CONFIG_HARDENED_USERCOPY
-extern void __check_object_size(const void *ptr, unsigned long n,
- bool to_user);
-
-static __always_inline void check_object_size(const void *ptr, unsigned long n,
- bool to_user)
-{
- if (!__builtin_constant_p(n))
- __check_object_size(ptr, n, to_user);
-}
-#else
-static inline void check_object_size(const void *ptr, unsigned long n,
- bool to_user)
-{ }
-#endif /* CONFIG_HARDENED_USERCOPY */
-
-extern void __compiletime_error("copy source size is too small")
-__bad_copy_from(void);
-extern void __compiletime_error("copy destination size is too small")
-__bad_copy_to(void);
-
-void __copy_overflow(int size, unsigned long count);
-
-static inline void copy_overflow(int size, unsigned long count)
-{
- if (IS_ENABLED(CONFIG_BUG))
- __copy_overflow(size, count);
-}
-
-static __always_inline __must_check bool
-check_copy_size(const void *addr, size_t bytes, bool is_source)
-{
- int sz = __builtin_object_size(addr, 0);
- if (unlikely(sz >= 0 && sz < bytes)) {
- if (!__builtin_constant_p(bytes))
- copy_overflow(sz, bytes);
- else if (is_source)
- __bad_copy_from();
- else
- __bad_copy_to();
- return false;
- }
- if (WARN_ON_ONCE(bytes > INT_MAX))
- return false;
- check_object_size(addr, bytes, is_source);
- return true;
-}
-
#ifndef arch_setup_new_exec
static inline void arch_setup_new_exec(void) { }
#endif
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 52f5850730b3..a1815f4395ab 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -262,6 +262,36 @@ sched_numa_hop_mask(unsigned int node, unsigned int hops)
#endif /* CONFIG_NUMA */
/**
+ * for_each_node_numadist() - iterate over nodes in increasing distance
+ * order, starting from a given node
+ * @node: the iteration variable and the starting node.
+ * @unvisited: a nodemask to keep track of the unvisited nodes.
+ *
+ * This macro iterates over NUMA node IDs in increasing distance from the
+ * starting @node and yields MAX_NUMNODES when all the nodes have been
+ * visited.
+ *
+ * Note that by the time the loop completes, the @unvisited nodemask will
+ * be fully cleared, unless the loop exits early.
+ *
+ * The difference between for_each_node() and for_each_node_numadist() is
+ * that the former iterates over nodes in numerical order, whereas the
+ * latter iterates over nodes in increasing order of distance.
+ *
+ * The complexity of this iterator is O(N^2), where N represents the
+ * number of nodes, as each iteration involves scanning all nodes to
+ * find the one with the shortest distance.
+ *
+ * Requires rcu_read_lock() to be held.
+ */
+#define for_each_node_numadist(node, unvisited) \
+ for (int __start = (node), \
+ (node) = nearest_node_nodemask((__start), &(unvisited)); \
+ (node) < MAX_NUMNODES; \
+ node_clear((node), (unvisited)), \
+ (node) = nearest_node_nodemask((__start), &(unvisited)))
+
+/**
* for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance
* from a given node.
* @mask: the iteration variable.
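A hedged usage sketch of for_each_node_numadist(): start from the local
node and search outward by distance for a node that can satisfy an
allocation (the helper and its allocation policy are made up for
illustration):

	static void *alloc_near(size_t size)
	{
		nodemask_t unvisited;
		int node = numa_node_id();
		void *p = NULL;

		nodes_copy(unvisited, node_states[N_MEMORY]);

		rcu_read_lock();	/* the iterator requires RCU */
		for_each_node_numadist(node, unvisited) {
			p = kzalloc_node(size, GFP_NOWAIT, node);
			if (p)
				break;	/* nearest node with memory wins */
		}
		rcu_read_unlock();

		return p;
	}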
diff --git a/include/linux/torture.h b/include/linux/torture.h
index 0134e7221cae..1b59056c3b18 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -104,6 +104,7 @@ int torture_stutter_init(int s, int sgap);
/* Initialization and cleanup. */
bool torture_init_begin(char *ttype, int v);
void torture_init_end(void);
+unsigned long get_torture_init_jiffies(void);
bool torture_cleanup_begin(void);
void torture_cleanup_end(void);
bool torture_must_stop(void);
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index e9c702c1908d..7c06f4795670 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -7,7 +7,7 @@
#include <linux/minmax.h>
#include <linux/nospec.h>
#include <linux/sched.h>
-#include <linux/thread_info.h>
+#include <linux/ucopysize.h>
#include <asm/uaccess.h>
diff --git a/include/linux/ucopysize.h b/include/linux/ucopysize.h
new file mode 100644
index 000000000000..41c2d9720466
--- /dev/null
+++ b/include/linux/ucopysize.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Perform sanity checking for object sizes for uaccess.h and uio.h. */
+#ifndef __LINUX_UCOPYSIZE_H__
+#define __LINUX_UCOPYSIZE_H__
+
+#include <linux/bug.h>
+
+#ifdef CONFIG_HARDENED_USERCOPY
+#include <linux/jump_label.h>
+extern void __check_object_size(const void *ptr, unsigned long n,
+ bool to_user);
+
+DECLARE_STATIC_KEY_MAYBE(CONFIG_HARDENED_USERCOPY_DEFAULT_ON,
+ validate_usercopy_range);
+
+static __always_inline void check_object_size(const void *ptr, unsigned long n,
+ bool to_user)
+{
+ if (!__builtin_constant_p(n) &&
+ static_branch_maybe(CONFIG_HARDENED_USERCOPY_DEFAULT_ON,
+ &validate_usercopy_range)) {
+ __check_object_size(ptr, n, to_user);
+ }
+}
+#else
+static inline void check_object_size(const void *ptr, unsigned long n,
+ bool to_user)
+{ }
+#endif /* CONFIG_HARDENED_USERCOPY */
+
+extern void __compiletime_error("copy source size is too small")
+__bad_copy_from(void);
+extern void __compiletime_error("copy destination size is too small")
+__bad_copy_to(void);
+
+void __copy_overflow(int size, unsigned long count);
+
+static inline void copy_overflow(int size, unsigned long count)
+{
+ if (IS_ENABLED(CONFIG_BUG))
+ __copy_overflow(size, count);
+}
+
+static __always_inline __must_check bool
+check_copy_size(const void *addr, size_t bytes, bool is_source)
+{
+ int sz = __builtin_object_size(addr, 0);
+ if (unlikely(sz >= 0 && sz < bytes)) {
+ if (!__builtin_constant_p(bytes))
+ copy_overflow(sz, bytes);
+ else if (is_source)
+ __bad_copy_from();
+ else
+ __bad_copy_to();
+ return false;
+ }
+ if (WARN_ON_ONCE(bytes > INT_MAX))
+ return false;
+ check_object_size(addr, bytes, is_source);
+ return true;
+}
+
+#endif /* __LINUX_UCOPYSIZE_H__ */
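For context, this helper is consumed by the copy_{to,from}_user() wrappers
in uaccess.h; the pattern looks roughly like the following sketch (cut down
from the existing wrapper, not introduced by this patch):

	static __always_inline unsigned long __must_check
	copy_to_user(void __user *to, const void *from, unsigned long n)
	{
		/* skip the copy when the source object is provably too small */
		if (check_copy_size(from, n, true))
			n = _copy_to_user(to, from, n);
		return n;
	}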
diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h
index f85ec5613721..2dc767e08f54 100644
--- a/include/linux/uidgid.h
+++ b/include/linux/uidgid.h
@@ -132,6 +132,7 @@ static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
u32 map_id_down(struct uid_gid_map *map, u32 id);
u32 map_id_up(struct uid_gid_map *map, u32 id);
+u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count);
#else
@@ -186,6 +187,11 @@ static inline u32 map_id_down(struct uid_gid_map *map, u32 id)
return id;
}
+static inline u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count)
+{
+ return id;
+}
+
static inline u32 map_id_up(struct uid_gid_map *map, u32 id)
{
return id;
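A hedged sketch of the intended use: translate a contiguous [id, id + count)
range through the map in one call rather than count map_id_up() calls
(assuming the same (u32)-1 "no mapping" convention as the single-id helpers;
the surrounding check is illustrative):

	u32 first = map_id_range_up(&ns->gid_map, id, count);

	if (first == (u32)-1)
		return -EOVERFLOW;	/* range not (fully) mapped */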
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 8ada84e85447..49ece9e1888f 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -6,8 +6,8 @@
#define __LINUX_UIO_H
#include <linux/kernel.h>
-#include <linux/thread_info.h>
#include <linux/mm_types.h>
+#include <linux/ucopysize.h>
#include <uapi/linux/uio.h>
struct page;
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index b1df7d792fa1..2e46b69ff0a6 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -39,6 +39,8 @@ struct page;
#define MAX_URETPROBE_DEPTH 64
+#define UPROBE_NO_TRAMPOLINE_VADDR (~0UL)
+
struct uprobe_consumer {
/*
* handler() can return UPROBE_HANDLER_REMOVE to signal the need to
@@ -143,6 +145,7 @@ struct uprobe_task {
struct uprobe *active_uprobe;
unsigned long xol_vaddr;
+ bool signal_denied;
struct arch_uprobe *auprobe;
};
diff --git a/include/linux/vfsdebug.h b/include/linux/vfsdebug.h
new file mode 100644
index 000000000000..9cf22d3eb9dd
--- /dev/null
+++ b/include/linux/vfsdebug.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_VFS_DEBUG_H
+#define LINUX_VFS_DEBUG_H 1
+
+#include <linux/bug.h>
+
+struct inode;
+
+#ifdef CONFIG_DEBUG_VFS
+void dump_inode(struct inode *inode, const char *reason);
+
+#define VFS_BUG_ON(cond) BUG_ON(cond)
+#define VFS_WARN_ON(cond) (void)WARN_ON(cond)
+#define VFS_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond)
+#define VFS_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format)
+#define VFS_WARN(cond, format...) (void)WARN(cond, format)
+
+#define VFS_BUG_ON_INODE(cond, inode) ({ \
+ if (unlikely(!!(cond))) { \
+ dump_inode(inode, "VFS_BUG_ON_INODE(" #cond")");\
+ BUG_ON(1); \
+ } \
+})
+
+#define VFS_WARN_ON_INODE(cond, inode) ({ \
+ int __ret_warn = !!(cond); \
+ \
+ if (unlikely(__ret_warn)) { \
+ dump_inode(inode, "VFS_WARN_ON_INODE(" #cond")");\
+ WARN_ON(1); \
+ } \
+ unlikely(__ret_warn); \
+})
+#else
+#define VFS_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
+#define VFS_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
+#define VFS_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
+#define VFS_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
+#define VFS_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
+
+#define VFS_BUG_ON_INODE(cond, inode) VFS_BUG_ON(cond)
+#define VFS_WARN_ON_INODE(cond, inode) BUILD_BUG_ON_INVALID(cond)
+#endif /* CONFIG_DEBUG_VFS */
+
+#endif
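A minimal usage sketch (the invariant being checked is made up): with
CONFIG_DEBUG_VFS off these compile down to BUILD_BUG_ON_INVALID(), so the
condition is type-checked but never evaluated and must stay side-effect
free:

	#include <linux/vfsdebug.h>

	static void sanity_check(struct inode *inode)
	{
		/* dumps the inode before the WARN when debugging is on */
		VFS_WARN_ON_INODE(inode->i_nlink == 0, inode);
	}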
diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h
index e1dec1a6a749..37e003ae5262 100644
--- a/include/linux/vmcore_info.h
+++ b/include/linux/vmcore_info.h
@@ -6,9 +6,8 @@
#include <linux/elfcore.h>
#include <linux/elf.h>
-#define CRASH_CORE_NOTE_NAME "CORE"
#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4)
-#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4)
+#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(NN_PRSTATUS), 4)
#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4)
/*
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 6d90ad974408..3503fe822e38 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -316,6 +316,9 @@ extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags);
} \
\
cmd; \
+ \
+ if (condition) \
+ break; \
} \
finish_wait(&wq_head, &__wq_entry); \
__out: __ret; \
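The added lines make the wait loop re-test the condition right after cmd
(typically schedule()) instead of first going through another
prepare_to_wait_event() pass; stripped of the macro plumbing and the signal
handling, the loop now behaves roughly like this sketch:

	for (;;) {
		prepare_to_wait_event(&wq_head, &__wq_entry, state);
		if (condition)
			break;
		cmd;			/* usually schedule() */
		if (condition)		/* new: saves a redundant */
			break;		/* prepare/finish round trip */
	}
	finish_wait(&wq_head, &__wq_entry);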