summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup/cgroup-v1.c2
-rw-r--r--kernel/cgroup/cgroup.c30
-rw-r--r--kernel/dma/Kconfig13
-rw-r--r--kernel/dma/Makefile1
-rw-r--r--kernel/dma/contiguous.c2
-rw-r--r--kernel/dma/map_benchmark.c361
-rw-r--r--kernel/dma/mapping.c12
-rw-r--r--kernel/dma/pool.c3
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/fork.c4
-rw-r--r--kernel/irq/irqdesc.c45
-rw-r--r--kernel/irq/manage.c37
-rw-r--r--kernel/irq/proc.c5
-rw-r--r--kernel/jump_label.c8
-rw-r--r--kernel/rcu/tasks.h25
-rw-r--r--kernel/sched/core.c40
-rw-r--r--kernel/sched/cpufreq_schedutil.c106
-rw-r--r--kernel/sched/sched.h13
-rw-r--r--kernel/sched/wait.c17
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/sys_ni.c2
-rw-r--r--kernel/time/Kconfig2
-rw-r--r--kernel/time/tick-common.c10
-rw-r--r--kernel/time/tick-sched.c7
-rw-r--r--kernel/workqueue.c13
25 files changed, 627 insertions, 135 deletions
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 191c329e482a..32596fdbcd5b 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -908,6 +908,8 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
if (opt == -ENOPARAM) {
if (strcmp(param->key, "source") == 0) {
+ if (fc->source)
+ return invalf(fc, "Multiple sources not supported");
fc->source = param->string;
param->string = NULL;
return 0;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index fefa21981027..613845769103 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -244,7 +244,7 @@ bool cgroup_ssid_enabled(int ssid)
*
* The default hierarchy is the v2 interface of cgroup and this function
* can be used to test whether a cgroup is on the default hierarchy for
- * cases where a subsystem should behave differnetly depending on the
+ * cases where a subsystem should behave differently depending on the
* interface version.
*
* List of changed behaviors:
@@ -262,7 +262,7 @@ bool cgroup_ssid_enabled(int ssid)
* "cgroup.procs" instead.
*
* - "cgroup.procs" is not sorted. pids will be unique unless they got
- * recycled inbetween reads.
+ * recycled in-between reads.
*
* - "release_agent" and "notify_on_release" are removed. Replacement
* notification mechanism will be implemented.
@@ -342,7 +342,7 @@ static bool cgroup_is_mixable(struct cgroup *cgrp)
return !cgroup_parent(cgrp);
}
-/* can @cgrp become a thread root? should always be true for a thread root */
+/* can @cgrp become a thread root? Should always be true for a thread root */
static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
{
/* mixables don't care */
@@ -527,7 +527,7 @@ static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
* the root css is returned, so this function always returns a valid css.
*
* The returned css is not guaranteed to be online, and therefore it is the
- * callers responsiblity to tryget a reference for it.
+ * caller's responsibility to try to get a reference for it.
*/
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
@@ -699,7 +699,7 @@ EXPORT_SYMBOL_GPL(of_css);
; \
else
-/* walk live descendants in preorder */
+/* walk live descendants in pre-order */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \
css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \
if (({ lockdep_assert_held(&cgroup_mutex); \
@@ -933,7 +933,7 @@ void put_css_set_locked(struct css_set *cset)
WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
- /* This css_set is dead. unlink it and release cgroup and css refs */
+ /* This css_set is dead. Unlink it and release cgroup and css refs */
for_each_subsys(ss, ssid) {
list_del(&cset->e_cset_node[ssid]);
css_put(cset->subsys[ssid]);
@@ -1058,7 +1058,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
/*
* Build the set of subsystem state objects that we want to see in the
- * new css_set. while subsystems can change globally, the entries here
+ * new css_set. While subsystems can change globally, the entries here
* won't change, so no need for locking.
*/
for_each_subsys(ss, i) {
@@ -1148,7 +1148,7 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
/*
* Always add links to the tail of the lists so that the lists are
- * in choronological order.
+ * in chronological order.
*/
list_move_tail(&link->cset_link, &cgrp->cset_links);
list_add_tail(&link->cgrp_link, &cset->cgrp_links);
@@ -3654,7 +3654,7 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
static int cgroup_file_open(struct kernfs_open_file *of)
{
- struct cftype *cft = of->kn->priv;
+ struct cftype *cft = of_cft(of);
if (cft->open)
return cft->open(of);
@@ -3663,7 +3663,7 @@ static int cgroup_file_open(struct kernfs_open_file *of)
static void cgroup_file_release(struct kernfs_open_file *of)
{
- struct cftype *cft = of->kn->priv;
+ struct cftype *cft = of_cft(of);
if (cft->release)
cft->release(of);
@@ -3674,7 +3674,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
{
struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct cgroup *cgrp = of->kn->parent->priv;
- struct cftype *cft = of->kn->priv;
+ struct cftype *cft = of_cft(of);
struct cgroup_subsys_state *css;
int ret;
@@ -3724,7 +3724,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
{
- struct cftype *cft = of->kn->priv;
+ struct cftype *cft = of_cft(of);
if (cft->poll)
return cft->poll(of, pt);
@@ -4134,7 +4134,7 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
* implies that if we observe !CSS_RELEASED on @pos in this RCU
* critical section, the one pointed to by its next pointer is
* guaranteed to not have finished its RCU grace period even if we
- * have dropped rcu_read_lock() inbetween iterations.
+ * have dropped rcu_read_lock() in-between iterations.
*
* If @pos has CSS_RELEASED set, its next pointer can't be
* dereferenced; however, as each css is given a monotonically
@@ -4382,7 +4382,7 @@ static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
}
/**
- * css_task_iter_advance_css_set - advance a task itererator to the next css_set
+ * css_task_iter_advance_css_set - advance a task iterator to the next css_set
* @it: the iterator to advance
*
* Advance @it to the next css_set to walk.
@@ -6308,7 +6308,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
*
* Find the cgroup at @path on the default hierarchy, increment its
* reference count and return it. Returns pointer to the found cgroup on
- * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR)
+ * success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR)
* if @path points to a non-directory.
*/
struct cgroup *cgroup_get_from_path(const char *path)
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index fd2db2665fc6..479fc145acfc 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -20,6 +20,10 @@ config DMA_OPS
config DMA_OPS_BYPASS
bool
+# Lets platform IOMMU driver choose between bypass and IOMMU
+config ARCH_HAS_DMA_MAP_DIRECT
+ bool
+
config NEED_SG_DMA_LENGTH
bool
@@ -220,3 +224,12 @@ config DMA_API_DEBUG_SG
is technically out-of-spec.
If unsure, say N.
+
+config DMA_MAP_BENCHMARK
+ bool "Enable benchmarking of streaming DMA mapping"
+ depends on DEBUG_FS
+ help
+ Provides /sys/kernel/debug/dma_map_benchmark that helps with testing
+ performance of dma_(un)map_page.
+
+ See tools/testing/selftests/dma/dma_map_benchmark.c
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index cd1d86358a7a..0dd65ec1d234 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -9,3 +9,4 @@ obj-$(CONFIG_DMA_API_DEBUG) += debug.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o
obj-$(CONFIG_DMA_REMAP) += remap.o
+obj-$(CONFIG_DMA_MAP_BENCHMARK) += map_benchmark.o
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 16b95ff12e4d..3d63d91cba5c 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -20,7 +20,7 @@
* coders, etc.
*
* Such devices often require big memory buffers (a full HD frame
- * is, for instance, more then 2 mega pixels large, i.e. more than 6
+ * is, for instance, more than 2 mega pixels large, i.e. more than 6
* MB of memory), which makes mechanisms such as kmalloc() or
* alloc_page() ineffective.
*
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
new file mode 100644
index 000000000000..b1496e744c68
--- /dev/null
+++ b/kernel/dma/map_benchmark.c
@@ -0,0 +1,361 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Hisilicon Limited.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/timekeeping.h>
+
+/* ioctl command: takes a struct map_benchmark in and writes results back */
+#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark)
+/* upper bounds accepted by the ioctl for bparam.threads / bparam.seconds */
+#define DMA_MAP_MAX_THREADS 1024
+#define DMA_MAP_MAX_SECONDS 300
+
+/* values accepted in map_benchmark.dma_dir; translated to DMA_* by the ioctl */
+#define DMA_MAP_BIDIRECTIONAL 0
+#define DMA_MAP_TO_DEVICE 1
+#define DMA_MAP_FROM_DEVICE 2
+
+/*
+ * Request/result record exchanged with user space by the DMA_MAP_BENCHMARK
+ * ioctl: the caller fills in the parameters (threads .. dma_dir) and the
+ * kernel fills in the four latency statistics.
+ *
+ * NOTE(review): five 32-bit members precede the __u64 expansion[] array, so
+ * on ABIs that align __u64 to 8 bytes there are 4 bytes of implicit padding
+ * after dma_dir - confirm this is intended for the user-space layout.
+ */
+struct map_benchmark {
+ __u64 avg_map_100ns; /* average map latency in 100ns */
+ __u64 map_stddev; /* standard deviation of map latency */
+ __u64 avg_unmap_100ns; /* average unmap latency in 100ns */
+ __u64 unmap_stddev; /* standard deviation of unmap latency */
+ __u32 threads; /* how many threads will do map/unmap in parallel */
+ __u32 seconds; /* how long the test will last */
+ __s32 node; /* which numa node this benchmark will run on */
+ __u32 dma_bits; /* DMA addressing capability */
+ __u32 dma_dir; /* DMA data direction */
+ __u64 expansion[10]; /* For future use */
+};
+
+/*
+ * Per-device benchmark state, allocated in __map_benchmark_probe() and
+ * handed to the debugfs file as its private data.
+ */
+struct map_benchmark_data {
+ struct map_benchmark bparam; /* last request copied in from user space */
+ struct device *dev; /* device the mappings are created for */
+ struct dentry *debugfs; /* the "dma_map_benchmark" debugfs file */
+ enum dma_data_direction dir; /* DMA_* direction derived from bparam.dma_dir */
+ /* totals accumulated by all worker threads, in units of 100ns */
+ atomic64_t sum_map_100ns;
+ atomic64_t sum_unmap_100ns;
+ /* sums of squared latencies, used to derive the standard deviation */
+ atomic64_t sum_sq_map;
+ atomic64_t sum_sq_unmap;
+ /* number of map/unmap iterations completed by all threads */
+ atomic64_t loops;
+};
+
+/*
+ * map_benchmark_thread() - worker that times map/unmap of a single page
+ * @data: the struct map_benchmark_data shared by all worker threads
+ *
+ * Until kthread_should_stop() becomes true, map and unmap one page with
+ * dma_map_single()/dma_unmap_single(), and add each latency (in units of
+ * 100ns) and its square to the shared atomic counters.
+ *
+ * Return: 0 when stopped normally, -ENOMEM if the page could not be
+ * allocated or a mapping failed.
+ */
+static int map_benchmark_thread(void *data)
+{
+	void *buf;
+	dma_addr_t dma_addr;
+	struct map_benchmark_data *map = data;
+	int ret = 0;
+
+	buf = (void *)__get_free_page(GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	while (!kthread_should_stop()) {
+		u64 map_100ns, unmap_100ns, map_sq, unmap_sq;
+		ktime_t map_stime, map_etime, unmap_stime, unmap_etime;
+		ktime_t map_delta, unmap_delta;
+
+		/*
+		 * For a non-coherent device, if we don't dirty the buffer in
+		 * the cache first, the result underestimates the real-world
+		 * overhead of BIDIRECTIONAL or TO_DEVICE mappings. The fill
+		 * value 0x66 is arbitrary.
+		 */
+		if (map->dir != DMA_FROM_DEVICE)
+			memset(buf, 0x66, PAGE_SIZE);
+
+		map_stime = ktime_get();
+		dma_addr = dma_map_single(map->dev, buf, PAGE_SIZE, map->dir);
+		if (unlikely(dma_mapping_error(map->dev, dma_addr))) {
+			pr_err("dma_map_single failed on %s\n",
+				dev_name(map->dev));
+			ret = -ENOMEM;
+			goto out;
+		}
+		map_etime = ktime_get();
+		map_delta = ktime_sub(map_etime, map_stime);
+
+		unmap_stime = ktime_get();
+		dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir);
+		unmap_etime = ktime_get();
+		unmap_delta = ktime_sub(unmap_etime, unmap_stime);
+
+		/* calculate sum and sum of squares */
+
+		map_100ns = div64_ul(map_delta, 100);
+		unmap_100ns = div64_ul(unmap_delta, 100);
+		map_sq = map_100ns * map_100ns;
+		unmap_sq = unmap_100ns * unmap_100ns;
+
+		atomic64_add(map_100ns, &map->sum_map_100ns);
+		atomic64_add(unmap_100ns, &map->sum_unmap_100ns);
+		atomic64_add(map_sq, &map->sum_sq_map);
+		atomic64_add(unmap_sq, &map->sum_sq_unmap);
+		atomic64_inc(&map->loops);
+
+		/*
+		 * The test can run for minutes with more threads than CPUs.
+		 * Give the scheduler a chance on every iteration so that a
+		 * non-preemptible kernel is not starved by this busy loop.
+		 */
+		cond_resched();
+	}
+
+out:
+	free_page((unsigned long)buf);
+	return ret;
+}
+
+/*
+ * do_map_benchmark() - run one benchmark described by map->bparam
+ * @map: benchmark state; bparam.threads/seconds/node are read, and the
+ *       avg/stddev members of bparam are written on success
+ *
+ * Creates bparam.threads worker kthreads (bound to the CPUs of bparam.node
+ * unless it is NUMA_NO_NODE), lets them run for bparam.seconds, stops them
+ * and derives the average and standard deviation of the map and unmap
+ * latencies from the shared counters.
+ *
+ * Return: 0 on success, -ENOMEM if the thread array cannot be allocated,
+ * the kthread creation error, or the non-zero exit code of a worker.
+ */
+static int do_map_benchmark(struct map_benchmark_data *map)
+{
+	struct task_struct **tsk;
+	int threads = map->bparam.threads;
+	int node = map->bparam.node;
+	u64 loops;
+	int ret = 0;
+	int i;
+
+	tsk = kmalloc_array(threads, sizeof(*tsk), GFP_KERNEL);
+	if (!tsk)
+		return -ENOMEM;
+
+	get_device(map->dev);
+
+	for (i = 0; i < threads; i++) {
+		tsk[i] = kthread_create_on_node(map_benchmark_thread, map,
+				map->bparam.node, "dma-map-benchmark/%d", i);
+		if (IS_ERR(tsk[i])) {
+			pr_err("create dma_map thread failed\n");
+			ret = PTR_ERR(tsk[i]);
+			/* don't leak the threads that were already created */
+			while (--i >= 0)
+				kthread_stop(tsk[i]);
+			goto out;
+		}
+
+		/*
+		 * Only look up the node's cpumask for a real node id;
+		 * NUMA_NO_NODE is not a valid index for cpumask_of_node().
+		 */
+		if (node != NUMA_NO_NODE)
+			kthread_bind_mask(tsk[i], cpumask_of_node(node));
+	}
+
+	/* clear the old value in the previous benchmark */
+	atomic64_set(&map->sum_map_100ns, 0);
+	atomic64_set(&map->sum_unmap_100ns, 0);
+	atomic64_set(&map->sum_sq_map, 0);
+	atomic64_set(&map->sum_sq_unmap, 0);
+	atomic64_set(&map->loops, 0);
+
+	for (i = 0; i < threads; i++) {
+		/*
+		 * A worker may exit on its own when a mapping fails. Hold a
+		 * reference so the task is still valid for kthread_stop().
+		 */
+		get_task_struct(tsk[i]);
+		wake_up_process(tsk[i]);
+	}
+
+	msleep_interruptible(map->bparam.seconds * 1000);
+
+	/*
+	 * Wait for the completion of all benchmark threads. Every thread is
+	 * stopped even if an earlier one reported an error, so that none
+	 * keeps using @map after we return.
+	 */
+	for (i = 0; i < threads; i++) {
+		int kthread_ret = kthread_stop(tsk[i]);
+
+		put_task_struct(tsk[i]);
+		if (kthread_ret)
+			ret = kthread_ret;
+	}
+
+	if (ret)
+		goto out;
+
+	loops = atomic64_read(&map->loops);
+	if (likely(loops > 0)) {
+		u64 map_variance, unmap_variance;
+		u64 sum_map = atomic64_read(&map->sum_map_100ns);
+		u64 sum_unmap = atomic64_read(&map->sum_unmap_100ns);
+		u64 sum_sq_map = atomic64_read(&map->sum_sq_map);
+		u64 sum_sq_unmap = atomic64_read(&map->sum_sq_unmap);
+
+		/* average latency */
+		map->bparam.avg_map_100ns = div64_u64(sum_map, loops);
+		map->bparam.avg_unmap_100ns = div64_u64(sum_unmap, loops);
+
+		/* standard deviation of latency: sqrt(E[x^2] - E[x]^2) */
+		map_variance = div64_u64(sum_sq_map, loops) -
+				map->bparam.avg_map_100ns *
+				map->bparam.avg_map_100ns;
+		unmap_variance = div64_u64(sum_sq_unmap, loops) -
+				map->bparam.avg_unmap_100ns *
+				map->bparam.avg_unmap_100ns;
+		map->bparam.map_stddev = int_sqrt64(map_variance);
+		map->bparam.unmap_stddev = int_sqrt64(unmap_variance);
+	}
+
+out:
+	put_device(map->dev);
+	kfree(tsk);
+	return ret;
+}
+
+/*
+ * map_benchmark_ioctl() - handle the DMA_MAP_BENCHMARK ioctl
+ * @file: debugfs file; private_data is the struct map_benchmark_data
+ * @cmd:  ioctl command, only DMA_MAP_BENCHMARK is accepted
+ * @arg:  user pointer to a struct map_benchmark
+ *
+ * Copies the request in, validates every parameter, temporarily switches
+ * the device's DMA mask to the requested width, runs the benchmark and
+ * copies the results back to user space.
+ *
+ * Return: 0 on success, -EFAULT if the user buffer cannot be accessed,
+ * -EINVAL for an unknown command or invalid parameter, or the error
+ * returned by do_map_benchmark().
+ *
+ * NOTE(review): map->bparam is shared by every opener of the file and is
+ * not serialised here - confirm concurrent ioctls are not expected.
+ */
+static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	struct map_benchmark_data *map = file->private_data;
+	void __user *argp = (void __user *)arg;
+	u64 old_dma_mask;
+	int ret;
+
+	if (copy_from_user(&map->bparam, argp, sizeof(map->bparam)))
+		return -EFAULT;
+
+	switch (cmd) {
+	case DMA_MAP_BENCHMARK:
+		if (map->bparam.threads == 0 ||
+		    map->bparam.threads > DMA_MAP_MAX_THREADS) {
+			pr_err("invalid thread number\n");
+			return -EINVAL;
+		}
+
+		if (map->bparam.seconds == 0 ||
+		    map->bparam.seconds > DMA_MAP_MAX_SECONDS) {
+			pr_err("invalid duration seconds\n");
+			return -EINVAL;
+		}
+
+		/*
+		 * Range-check the node id before node_possible(), which
+		 * indexes a MAX_NUMNODES-sized bitmap.
+		 */
+		if (map->bparam.node != NUMA_NO_NODE &&
+		    (map->bparam.node < 0 ||
+		     map->bparam.node >= MAX_NUMNODES ||
+		     !node_possible(map->bparam.node))) {
+			pr_err("invalid numa node\n");
+			return -EINVAL;
+		}
+
+		/* DMA_BIT_MASK() is only defined for widths up to 64 bits */
+		if (map->bparam.dma_bits == 0 ||
+		    map->bparam.dma_bits > 64) {
+			pr_err("invalid dma_bits\n");
+			return -EINVAL;
+		}
+
+		switch (map->bparam.dma_dir) {
+		case DMA_MAP_BIDIRECTIONAL:
+			map->dir = DMA_BIDIRECTIONAL;
+			break;
+		case DMA_MAP_FROM_DEVICE:
+			map->dir = DMA_FROM_DEVICE;
+			break;
+		case DMA_MAP_TO_DEVICE:
+			map->dir = DMA_TO_DEVICE;
+			break;
+		default:
+			pr_err("invalid DMA direction\n");
+			return -EINVAL;
+		}
+
+		old_dma_mask = dma_get_mask(map->dev);
+
+		ret = dma_set_mask(map->dev,
+				   DMA_BIT_MASK(map->bparam.dma_bits));
+		if (ret) {
+			pr_err("failed to set dma_mask on device %s\n",
+				dev_name(map->dev));
+			return -EINVAL;
+		}
+
+		ret = do_map_benchmark(map);
+
+		/*
+		 * restore the original dma_mask as many devices' dma_mask are
+		 * set by architectures, acpi, busses. When we bind them back
+		 * to their original drivers, those drivers shouldn't see
+		 * dma_mask changed by benchmark
+		 */
+		dma_set_mask(map->dev, old_dma_mask);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* a failed run has no meaningful statistics to report */
+	if (ret)
+		return ret;
+
+	if (copy_to_user(argp, &map->bparam, sizeof(map->bparam)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * File operations of the "dma_map_benchmark" debugfs file; the benchmark is
+ * driven entirely through the ioctl handler.
+ * NOTE(review): simple_open presumably makes the data pointer passed to
+ * debugfs_create_file() available as file->private_data - confirm.
+ */
+static const struct file_operations map_benchmark_fops = {
+ .open = simple_open,
+ .unlocked_ioctl = map_benchmark_ioctl,
+};
+
+/*
+ * devm action registered by __map_benchmark_probe(): removes the debugfs
+ * file recorded in the benchmark state passed as @data.
+ */
+static void map_benchmark_remove_debugfs(void *data)
+{
+	struct map_benchmark_data *bench = data;
+
+	debugfs_remove(bench->debugfs);
+}
+
+/*
+ * Common probe path for the PCI and platform front ends: allocate the
+ * device-managed benchmark state for @dev, register the debugfs cleanup
+ * action and create the "dma_map_benchmark" debugfs file.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int __map_benchmark_probe(struct device *dev)
+{
+	struct map_benchmark_data *bench;
+	struct dentry *file;
+	int err;
+
+	bench = devm_kzalloc(dev, sizeof(*bench), GFP_KERNEL);
+	if (!bench)
+		return -ENOMEM;
+	bench->dev = dev;
+
+	err = devm_add_action(dev, map_benchmark_remove_debugfs, bench);
+	if (err) {
+		pr_err("Can't add debugfs remove action\n");
+		return err;
+	}
+
+	/*
+	 * The file name is fixed, so only one device can be bound to this
+	 * driver at a time; a second probe fails here.
+	 */
+	file = debugfs_create_file("dma_map_benchmark", 0600, NULL, bench,
+				   &map_benchmark_fops);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	bench->debugfs = file;
+	return 0;
+}
+
+/* Platform bus entry point: forward to the bus-independent probe. */
+static int map_benchmark_platform_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+
+	return __map_benchmark_probe(dev);
+}
+
+/* Platform driver front end; registered from map_benchmark_init(). */
+static struct platform_driver map_benchmark_platform_driver = {
+ .driver = {
+ .name = "dma_map_benchmark",
+ },
+ .probe = map_benchmark_platform_probe,
+};
+
+/* PCI bus entry point: @id is unused, forward to the bus-independent probe. */
+static int
+map_benchmark_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct device *dev = &pdev->dev;
+
+	return __map_benchmark_probe(dev);
+}
+
+/*
+ * PCI driver front end; registered from map_benchmark_init().
+ * NOTE(review): no id_table is set, so devices presumably have to be bound
+ * explicitly (e.g. via driver_override) - confirm.
+ */
+static struct pci_driver map_benchmark_pci_driver = {
+ .name = "dma_map_benchmark",
+ .probe = map_benchmark_pci_probe,
+};
+
+/*
+ * Register the PCI driver first, then the platform driver. If the platform
+ * registration fails the PCI driver is unregistered again so that nothing
+ * is left behind. Returns 0 or the failing registration's error code.
+ */
+static int __init map_benchmark_init(void)
+{
+	int err;
+
+	err = pci_register_driver(&map_benchmark_pci_driver);
+	if (err)
+		return err;
+
+	err = platform_driver_register(&map_benchmark_platform_driver);
+	if (err)
+		pci_unregister_driver(&map_benchmark_pci_driver);
+
+	return err;
+}
+
+/* Unregister both drivers in the reverse order of map_benchmark_init(). */
+static void __exit map_benchmark_cleanup(void)
+{
+ platform_driver_unregister(&map_benchmark_platform_driver);
+ pci_unregister_driver(&map_benchmark_pci_driver);
+}
+
+/* Hook the init/cleanup routines above into module load and unload. */
+module_init(map_benchmark_init);
+module_exit(map_benchmark_cleanup);
+
+MODULE_AUTHOR("Barry Song <song.bao.hua@hisilicon.com>");
+MODULE_DESCRIPTION("dma_map benchmark driver");
+MODULE_LICENSE("GPL");
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 51bb8fa8eb89..f87a89d08654 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -149,7 +149,8 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
if (WARN_ON_ONCE(!dev->dma_mask))
return DMA_MAPPING_ERROR;
- if (dma_map_direct(dev, ops))
+ if (dma_map_direct(dev, ops) ||
+ arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size))
addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
else
addr = ops->map_page(dev, page, offset, size, dir, attrs);
@@ -165,7 +166,8 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
const struct dma_map_ops *ops = get_dma_ops(dev);
BUG_ON(!valid_dma_direction(dir));
- if (dma_map_direct(dev, ops))
+ if (dma_map_direct(dev, ops) ||
+ arch_dma_unmap_page_direct(dev, addr + size))
dma_direct_unmap_page(dev, addr, size, dir, attrs);
else if (ops->unmap_page)
ops->unmap_page(dev, addr, size, dir, attrs);
@@ -188,7 +190,8 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
if (WARN_ON_ONCE(!dev->dma_mask))
return 0;
- if (dma_map_direct(dev, ops))
+ if (dma_map_direct(dev, ops) ||
+ arch_dma_map_sg_direct(dev, sg, nents))
ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
else
ents = ops->map_sg(dev, sg, nents, dir, attrs);
@@ -207,7 +210,8 @@ void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
BUG_ON(!valid_dma_direction(dir));
debug_dma_unmap_sg(dev, sg, nents, dir);
- if (dma_map_direct(dev, ops))
+ if (dma_map_direct(dev, ops) ||
+ arch_dma_unmap_sg_direct(dev, sg, nents))
dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
else if (ops->unmap_sg)
ops->unmap_sg(dev, sg, nents, dir, attrs);
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index d4637f72239b..5f84e6cdb78e 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -38,9 +38,6 @@ static void __init dma_atomic_pool_debugfs_init(void)
struct dentry *root;
root = debugfs_create_dir("dma_pools", NULL);
- if (IS_ERR_OR_NULL(root))
- return;
-
debugfs_create_ulong("pool_size_dma", 0400, root, &pool_size_dma);
debugfs_create_ulong("pool_size_dma32", 0400, root, &pool_size_dma32);
debugfs_create_ulong("pool_size_kernel", 0400, root, &pool_size_kernel);
diff --git a/kernel/exit.c b/kernel/exit.c
index 3594291a8542..04029e35e69a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -63,6 +63,7 @@
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
+#include <linux/io_uring.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -776,6 +777,7 @@ void __noreturn do_exit(long code)
schedule();
}
+ io_uring_files_cancel(tsk->files);
exit_signals(tsk); /* sets PF_EXITING */
/* sync mm's RSS info before statistics gathering */
diff --git a/kernel/fork.c b/kernel/fork.c
index 41906a52a764..37720a6d04ea 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -225,8 +225,8 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
if (!s)
continue;
- /* Clear the KASAN shadow of the stack. */
- kasan_unpoison_shadow(s->addr, THREAD_SIZE);
+ /* Mark stack accessible for KASAN. */
+ kasan_unpoison_range(s->addr, THREAD_SIZE);
/* Clear stale pointers from reused stack. */
memset(s->addr, 0, THREAD_SIZE);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index e810eb9906ea..cc1a09406c6e 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -147,12 +147,12 @@ static ssize_t per_cpu_count_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
- int cpu, irq = desc->irq_data.irq;
ssize_t ret = 0;
char *p = "";
+ int cpu;
for_each_possible_cpu(cpu) {
- unsigned int c = kstat_irqs_cpu(irq, cpu);
+ unsigned int c = irq_desc_kstat_cpu(desc, cpu);
ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c);
p = ",";
@@ -352,7 +352,9 @@ struct irq_desc *irq_to_desc(unsigned int irq)
{
return radix_tree_lookup(&irq_desc_tree, irq);
}
-EXPORT_SYMBOL(irq_to_desc);
+#ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE
+EXPORT_SYMBOL_GPL(irq_to_desc);
+#endif
static void delete_irq_desc(unsigned int irq)
{
@@ -924,15 +926,7 @@ static bool irq_is_nmi(struct irq_desc *desc)
return desc->istate & IRQS_NMI;
}
-/**
- * kstat_irqs - Get the statistics for an interrupt
- * @irq: The interrupt number
- *
- * Returns the sum of interrupt counts on all cpus since boot for
- * @irq. The caller must ensure that the interrupt is not removed
- * concurrently.
- */
-unsigned int kstat_irqs(unsigned int irq)
+static unsigned int kstat_irqs(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned int sum = 0;
@@ -943,21 +937,22 @@ unsigned int kstat_irqs(unsigned int irq)
if (!irq_settings_is_per_cpu_devid(desc) &&
!irq_settings_is_per_cpu(desc) &&
!irq_is_nmi(desc))
- return desc->tot_count;
+ return data_race(desc->tot_count);
for_each_possible_cpu(cpu)
- sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
+ sum += data_race(*per_cpu_ptr(desc->kstat_irqs, cpu));
return sum;
}
/**
- * kstat_irqs_usr - Get the statistics for an interrupt
+ * kstat_irqs_usr - Get the statistics for an interrupt from thread context
* @irq: The interrupt number
*
* Returns the sum of interrupt counts on all cpus since boot for @irq.
- * Contrary to kstat_irqs() this can be called from any context.
- * It uses rcu since a concurrent removal of an interrupt descriptor is
- * observing an rcu grace period before delayed_free_desc()/irq_kobj_release().
+ *
+ * It uses rcu to protect the access since a concurrent removal of an
+ * interrupt descriptor is observing an rcu grace period before
+ * delayed_free_desc()/irq_kobj_release().
*/
unsigned int kstat_irqs_usr(unsigned int irq)
{
@@ -968,3 +963,17 @@ unsigned int kstat_irqs_usr(unsigned int irq)
rcu_read_unlock();
return sum;
}
+
+#ifdef CONFIG_LOCKDEP
+/*
+ * Assign lockdep classes to the descriptor of @irq: @lock_class for
+ * desc->lock and @request_class for desc->request_mutex. Does nothing when
+ * no descriptor exists for @irq.
+ */
+void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class,
+			     struct lock_class_key *request_class)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	if (!desc)
+		return;
+
+	lockdep_set_class(&desc->lock, lock_class);
+	lockdep_set_class(&desc->request_mutex, request_class);
+}
+EXPORT_SYMBOL_GPL(__irq_set_lockdep_class);
+#endif
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c826ba4141fe..ab8567f32501 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -2822,3 +2822,40 @@ out_unlock:
return err;
}
EXPORT_SYMBOL_GPL(irq_set_irqchip_state);
+
+/**
+ * irq_has_action - Check whether an interrupt is requested
+ * @irq:	The linux irq number
+ *
+ * The descriptor lookup and the check are done under rcu_read_lock().
+ *
+ * Returns: A snapshot of the current state
+ */
+bool irq_has_action(unsigned int irq)
+{
+	bool requested;
+
+	rcu_read_lock();
+	requested = irq_desc_has_action(irq_to_desc(irq));
+	rcu_read_unlock();
+
+	return requested;
+}
+EXPORT_SYMBOL_GPL(irq_has_action);
+
+/**
+ * irq_check_status_bit - Check whether bits in the irq descriptor status are set
+ * @irq:	The linux irq number
+ * @bitmask:	The bitmask to evaluate
+ *
+ * The descriptor lookup and the status read are done under rcu_read_lock().
+ *
+ * Returns: True if one of the bits in @bitmask is set, false if none is set
+ * or if no descriptor exists for @irq
+ */
+bool irq_check_status_bit(unsigned int irq, unsigned int bitmask)
+{
+	struct irq_desc *desc;
+	bool res = false;
+
+	rcu_read_lock();
+	desc = irq_to_desc(irq);
+	if (desc)
+		res = !!(desc->status_use_accessors & bitmask);
+	rcu_read_unlock();
+	return res;
+}
+/* Exported like irq_has_action() so that modular callers can link to it. */
+EXPORT_SYMBOL_GPL(irq_check_status_bit);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 72513ed2a5fc..98138788cb04 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -488,9 +488,10 @@ int show_interrupts(struct seq_file *p, void *v)
if (!desc || irq_settings_is_hidden(desc))
goto outsparse;
- if (desc->kstat_irqs)
+ if (desc->kstat_irqs) {
for_each_online_cpu(j)
- any_count |= *per_cpu_ptr(desc->kstat_irqs, j);
+ any_count |= data_race(*per_cpu_ptr(desc->kstat_irqs, j));
+ }
if ((!desc->action || irq_desc_is_chained(desc)) && !any_count)
goto outsparse;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 015ef903ce8c..c6a39d662935 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -793,6 +793,7 @@ int jump_label_text_reserved(void *start, void *end)
static void jump_label_update(struct static_key *key)
{
struct jump_entry *stop = __stop___jump_table;
+ bool init = system_state < SYSTEM_RUNNING;
struct jump_entry *entry;
#ifdef CONFIG_MODULES
struct module *mod;
@@ -804,15 +805,16 @@ static void jump_label_update(struct static_key *key)
preempt_disable();
mod = __module_address((unsigned long)key);
- if (mod)
+ if (mod) {
stop = mod->jump_entries + mod->num_jump_entries;
+ init = mod->state == MODULE_STATE_COMING;
+ }
preempt_enable();
#endif
entry = static_key_entries(key);
/* if there are no users, entry can be NULL */
if (entry)
- __jump_label_update(key, entry, stop,
- system_state < SYSTEM_RUNNING);
+ __jump_label_update(key, entry, stop, init);
}
#ifdef CONFIG_STATIC_KEYS_SELFTEST
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 35bdcfd84d42..36607551f966 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -241,7 +241,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
}
}
-/* Spawn RCU-tasks grace-period kthread, e.g., at core_initcall() time. */
+/* Spawn RCU-tasks grace-period kthread. */
static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp)
{
struct task_struct *t;
@@ -564,7 +564,6 @@ static int __init rcu_spawn_tasks_kthread(void)
rcu_spawn_tasks_kthread_generic(&rcu_tasks);
return 0;
}
-core_initcall(rcu_spawn_tasks_kthread);
#if !defined(CONFIG_TINY_RCU)
void show_rcu_tasks_classic_gp_kthread(void)
@@ -692,7 +691,6 @@ static int __init rcu_spawn_tasks_rude_kthread(void)
rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude);
return 0;
}
-core_initcall(rcu_spawn_tasks_rude_kthread);
#if !defined(CONFIG_TINY_RCU)
void show_rcu_tasks_rude_gp_kthread(void)
@@ -968,6 +966,11 @@ static void rcu_tasks_trace_pregp_step(void)
static void rcu_tasks_trace_pertask(struct task_struct *t,
struct list_head *hop)
{
+ // During early boot when there is only the one boot CPU, there
+ // is no idle task for the other CPUs. Just return.
+ if (unlikely(t == NULL))
+ return;
+
WRITE_ONCE(t->trc_reader_special.b.need_qs, false);
WRITE_ONCE(t->trc_reader_checked, false);
t->trc_ipi_to_cpu = -1;
@@ -1193,7 +1196,6 @@ static int __init rcu_spawn_tasks_trace_kthread(void)
rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace);
return 0;
}
-core_initcall(rcu_spawn_tasks_trace_kthread);
#if !defined(CONFIG_TINY_RCU)
void show_rcu_tasks_trace_gp_kthread(void)
@@ -1222,6 +1224,21 @@ void show_rcu_tasks_gp_kthreads(void)
}
#endif /* #ifndef CONFIG_TINY_RCU */
+/*
+ * Spawn the grace-period kthread of every RCU-tasks flavor that is enabled
+ * in this configuration. Each call is guarded by its own config option, so
+ * a flavor that is not built in is simply skipped.
+ */
+void __init rcu_init_tasks_generic(void)
+{
+#ifdef CONFIG_TASKS_RCU
+ rcu_spawn_tasks_kthread();
+#endif
+
+#ifdef CONFIG_TASKS_RUDE_RCU
+ rcu_spawn_tasks_rude_kthread();
+#endif
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+ rcu_spawn_tasks_trace_kthread();
+#endif
+}
+
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
static inline void rcu_tasks_bootup_oddness(void) {}
void show_rcu_tasks_gp_kthreads(void) {}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 21b548b69455..15d2562118d1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3985,15 +3985,20 @@ static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
}
}
+/* Defined further down; declared here so the callback below can refer to it. */
+static void balance_push(struct rq *rq);
+
+/*
+ * Callback head that balance_push_set() and balance_push() store in
+ * rq->balance_callback. balance_push() takes a struct rq *, hence the cast
+ * to the callback_head function type.
+ * NOTE(review): the code that invokes ->func with the rq as argument is not
+ * visible in this hunk - confirm the calling convention there.
+ */
+struct callback_head balance_push_callback = {
+ .next = NULL,
+ .func = (void (*)(struct callback_head *))balance_push,
+};
+
static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
{
struct callback_head *head = rq->balance_callback;
lockdep_assert_held(&rq->lock);
- if (head) {
+ if (head)
rq->balance_callback = NULL;
- rq->balance_flags &= ~BALANCE_WORK;
- }
return head;
}
@@ -4014,21 +4019,6 @@ static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
}
}
-static void balance_push(struct rq *rq);
-
-static inline void balance_switch(struct rq *rq)
-{
- if (likely(!rq->balance_flags))
- return;
-
- if (rq->balance_flags & BALANCE_PUSH) {
- balance_push(rq);
- return;
- }
-
- __balance_callbacks(rq);
-}
-
#else
static inline void __balance_callbacks(struct rq *rq)
@@ -4044,10 +4034,6 @@ static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
{
}
-static inline void balance_switch(struct rq *rq)
-{
-}
-
#endif
static inline void
@@ -4075,7 +4061,7 @@ static inline void finish_lock_switch(struct rq *rq)
* prev into current:
*/
spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
- balance_switch(rq);
+ __balance_callbacks(rq);
raw_spin_unlock_irq(&rq->lock);
}
@@ -7282,6 +7268,10 @@ static void balance_push(struct rq *rq)
lockdep_assert_held(&rq->lock);
SCHED_WARN_ON(rq->cpu != smp_processor_id());
+ /*
+ * Ensure the thing is persistent until balance_push_set(.on = false);
+ */
+ rq->balance_callback = &balance_push_callback;
/*
* Both the cpu-hotplug and stop task are in this case and are
@@ -7331,9 +7321,9 @@ static void balance_push_set(int cpu, bool on)
rq_lock_irqsave(rq, &rf);
if (on)
- rq->balance_flags |= BALANCE_PUSH;
+ rq->balance_callback = &balance_push_callback;
else
- rq->balance_flags &= ~BALANCE_PUSH;
+ rq->balance_callback = NULL;
rq_unlock_irqrestore(rq, &rf);
}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index b0ad37bf95ee..6931f0cdeb80 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -53,6 +53,7 @@ struct sugov_cpu {
unsigned int iowait_boost;
u64 last_update;
+ unsigned long util;
unsigned long bw_dl;
unsigned long max;
@@ -276,16 +277,15 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
return min(max, util);
}
-static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
+static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
- unsigned long util = cpu_util_cfs(rq);
unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
sg_cpu->max = max;
sg_cpu->bw_dl = cpu_bw_dl(rq);
-
- return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL);
+ sg_cpu->util = schedutil_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max,
+ FREQUENCY_UTIL, NULL);
}
/**
@@ -362,8 +362,6 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* sugov_iowait_apply() - Apply the IO boost to a CPU.
* @sg_cpu: the sugov data for the cpu to boost
* @time: the update time from the caller
- * @util: the utilization to (eventually) boost
- * @max: the maximum value the utilization can be boosted to
*
* A CPU running a task which was woken up after an IO operation can have its
* utilization boosted to speed up the completion of those IO operations.
@@ -377,18 +375,17 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* This mechanism is designed to boost tasks that frequently wait on IO, while
* being more conservative with tasks that do sporadic IO operations.
*/
-static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
- unsigned long util, unsigned long max)
+static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
{
unsigned long boost;
/* No boost currently required */
if (!sg_cpu->iowait_boost)
- return util;
+ return;
/* Reset boost if the CPU appears to have been idle enough */
if (sugov_iowait_reset(sg_cpu, time, false))
- return util;
+ return;
if (!sg_cpu->iowait_boost_pending) {
/*
@@ -397,18 +394,19 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
sg_cpu->iowait_boost >>= 1;
if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
sg_cpu->iowait_boost = 0;
- return util;
+ return;
}
}
sg_cpu->iowait_boost_pending = false;
/*
- * @util is already in capacity scale; convert iowait_boost
+ * sg_cpu->util is already in capacity scale; convert iowait_boost
* into the same scale so we can compare.
*/
- boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
- return max(boost, util);
+ boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT;
+ if (sg_cpu->util < boost)
+ sg_cpu->util = boost;
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -434,14 +432,10 @@ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_p
sg_policy->limits_changed = true;
}
-static void sugov_update_single(struct update_util_data *hook, u64 time,
- unsigned int flags)
+static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
+ u64 time, unsigned int flags)
{
- struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
- unsigned long util, max;
- unsigned int next_f;
- unsigned int cached_freq = sg_policy->cached_raw_freq;
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
@@ -449,12 +443,26 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
ignore_dl_rate_limit(sg_cpu, sg_policy);
if (!sugov_should_update_freq(sg_policy, time))
+ return false;
+
+ sugov_get_util(sg_cpu);
+ sugov_iowait_apply(sg_cpu, time);
+
+ return true;
+}
+
+static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
+ unsigned int flags)
+{
+ struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ unsigned int cached_freq = sg_policy->cached_raw_freq;
+ unsigned int next_f;
+
+ if (!sugov_update_single_common(sg_cpu, time, flags))
return;
- util = sugov_get_util(sg_cpu);
- max = sg_cpu->max;
- util = sugov_iowait_apply(sg_cpu, time, util, max);
- next_f = get_next_freq(sg_policy, util, max);
+ next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max);
/*
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
@@ -480,6 +488,38 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
}
}
+static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
+ unsigned int flags)
+{
+ struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+ unsigned long prev_util = sg_cpu->util;
+
+ /*
+ * Fall back to the "frequency" path if frequency invariance is not
+ * supported, because the direct mapping between the utilization and
+ * the performance levels depends on the frequency invariance.
+ */
+ if (!arch_scale_freq_invariant()) {
+ sugov_update_single_freq(hook, time, flags);
+ return;
+ }
+
+ if (!sugov_update_single_common(sg_cpu, time, flags))
+ return;
+
+ /*
+ * Do not reduce the target performance level if the CPU has not been
+ * idle recently, as the reduction is likely to be premature then.
+ */
+ if (sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
+ sg_cpu->util = prev_util;
+
+ cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
+ map_util_perf(sg_cpu->util), sg_cpu->max);
+
+ sg_cpu->sg_policy->last_freq_update_time = time;
+}
+
static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
@@ -491,9 +531,10 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
unsigned long j_util, j_max;
- j_util = sugov_get_util(j_sg_cpu);
+ sugov_get_util(j_sg_cpu);
+ sugov_iowait_apply(j_sg_cpu, time);
+ j_util = j_sg_cpu->util;
j_max = j_sg_cpu->max;
- j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max);
if (j_util * max > j_max * util) {
util = j_util;
@@ -817,6 +858,7 @@ static void sugov_exit(struct cpufreq_policy *policy)
static int sugov_start(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
+ void (*uu)(struct update_util_data *data, u64 time, unsigned int flags);
unsigned int cpu;
sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
@@ -836,13 +878,17 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_cpu->sg_policy = sg_policy;
}
+ if (policy_is_shared(policy))
+ uu = sugov_update_shared;
+ else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf())
+ uu = sugov_update_single_perf;
+ else
+ uu = sugov_update_single_freq;
+
for_each_cpu(cpu, policy->cpus) {
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
- cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
- policy_is_shared(policy) ?
- sugov_update_shared :
- sugov_update_single);
+ cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu);
}
return 0;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f5acb6c5ce49..12ada79d40f3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -975,7 +975,6 @@ struct rq {
unsigned long cpu_capacity_orig;
struct callback_head *balance_callback;
- unsigned char balance_flags;
unsigned char nohz_idle_balance;
unsigned char idle_balance;
@@ -1226,6 +1225,8 @@ struct rq_flags {
#endif
};
+extern struct callback_head balance_push_callback;
+
/*
* Lockdep annotation that avoids accidental unlocks; it's like a
* sticky/continuous lockdep_assert_held().
@@ -1243,9 +1244,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
#ifdef CONFIG_SCHED_DEBUG
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
rf->clock_update_flags = 0;
-#endif
#ifdef CONFIG_SMP
- SCHED_WARN_ON(rq->balance_callback);
+ SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback);
+#endif
#endif
}
@@ -1408,9 +1409,6 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
#ifdef CONFIG_SMP
-#define BALANCE_WORK 0x01
-#define BALANCE_PUSH 0x02
-
static inline void
queue_balance_callback(struct rq *rq,
struct callback_head *head,
@@ -1418,13 +1416,12 @@ queue_balance_callback(struct rq *rq,
{
lockdep_assert_held(&rq->lock);
- if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH)))
+ if (unlikely(head->next || rq->balance_callback == &balance_push_callback))
return;
head->func = (void (*)(struct callback_head *))func;
head->next = rq->balance_callback;
rq->balance_callback = head;
- rq->balance_flags |= BALANCE_WORK;
}
#define rcu_dereference_check_sched_domain(p) \
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 01f5d3020589..183cc6ae68a6 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -37,6 +37,17 @@ void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue
}
EXPORT_SYMBOL(add_wait_queue_exclusive);
+void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
+{
+ unsigned long flags;
+
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+ spin_lock_irqsave(&wq_head->lock, flags);
+ __add_wait_queue(wq_head, wq_entry);
+ spin_unlock_irqrestore(&wq_head->lock, flags);
+}
+EXPORT_SYMBOL_GPL(add_wait_queue_priority);
+
void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
@@ -57,7 +68,11 @@ EXPORT_SYMBOL(remove_wait_queue);
/*
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
- * number) then we wake all the non-exclusive tasks and one exclusive task.
+ * number) then we wake that number of exclusive tasks, and potentially all
+ * the non-exclusive tasks. Normally, exclusive tasks will be at the end of
+ * the list and any non-exclusive tasks will be woken first. A priority task
+ * may be at the head of the list, and can consume the event without any other
+ * tasks being woken.
*
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d5bfd5e661fc..9d71046ea247 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -186,7 +186,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
* Keep preemption disabled until we are done with
* softirq processing:
*/
- preempt_count_sub(cnt - 1);
+ __preempt_count_sub(cnt - 1);
if (unlikely(!in_interrupt() && local_softirq_pending())) {
/*
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index f27ac94d5fa7..19aa806890d5 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -68,6 +68,8 @@ COND_SYSCALL(epoll_create1);
COND_SYSCALL(epoll_ctl);
COND_SYSCALL(epoll_pwait);
COND_SYSCALL_COMPAT(epoll_pwait);
+COND_SYSCALL(epoll_pwait2);
+COND_SYSCALL_COMPAT(epoll_pwait2);
/* fs/fcntl.c */
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 9a41848b6ebb..83e158d016ba 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -141,7 +141,7 @@ config CONTEXT_TRACKING_FORCE
dynticks working.
This option stands for testing when an arch implements the
- context tracking backend but doesn't yet fullfill all the
+ context tracking backend but doesn't yet fulfill all the
requirements to make the full dynticks feature working.
Without the full dynticks, there is no way to test the support
for context tracking and the subsystems that rely on it: RCU
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index a03764df5366..9d3a22510bab 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -407,17 +407,13 @@ EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
/*
* Transfer the do_timer job away from a dying cpu.
*
- * Called with interrupts disabled. Not locking required. If
+ * Called with interrupts disabled. No locking required. If
* tick_do_timer_cpu is owned by this cpu, nothing can change it.
*/
void tick_handover_do_timer(void)
{
- if (tick_do_timer_cpu == smp_processor_id()) {
- int cpu = cpumask_first(cpu_online_mask);
-
- tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
- TICK_DO_TIMER_NONE;
- }
+ if (tick_do_timer_cpu == smp_processor_id())
+ tick_do_timer_cpu = cpumask_first(cpu_online_mask);
}
/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 030282994b3e..e10a4af88737 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -989,13 +989,6 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
*/
if (tick_do_timer_cpu == cpu)
return false;
- /*
- * Boot safety: make sure the timekeeping duty has been
- * assigned before entering dyntick-idle mode,
- * tick_do_timer_cpu is TICK_DO_TIMER_BOOT
- */
- if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT))
- return false;
/* Should not happen for nohz-full */
if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b5295a0b0536..9880b6c0e272 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3731,17 +3731,24 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
* is updated and visible.
*/
if (!freezable || !workqueue_freezing) {
+ bool kick = false;
+
pwq->max_active = wq->saved_max_active;
while (!list_empty(&pwq->delayed_works) &&
- pwq->nr_active < pwq->max_active)
+ pwq->nr_active < pwq->max_active) {
pwq_activate_first_delayed(pwq);
+ kick = true;
+ }
/*
* Need to kick a worker after thawing or when an unbound wq's
- * max_active is bumped. It's a slow path. Do it always.
+ * max_active is bumped. In realtime scenarios, always kicking a
+ * worker will cause interference on the isolated cpu cores, so
+ * let's kick iff work items were activated.
*/
- wake_up_worker(pwq->pool);
+ if (kick)
+ wake_up_worker(pwq->pool);
} else {
pwq->max_active = 0;
}