diff options
Diffstat (limited to 'init')
-rw-r--r-- | init/Kconfig | 350 | ||||
-rw-r--r-- | init/main.c | 27 |
2 files changed, 24 insertions, 353 deletions
diff --git a/init/Kconfig b/init/Kconfig index 1d3475fc9496..ee0f03b69d11 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -472,354 +472,7 @@ config TASK_IO_ACCOUNTING endmenu # "CPU/Task time and stats accounting" -menu "RCU Subsystem" - -config TREE_RCU - bool - default y if !PREEMPT && SMP - help - This option selects the RCU implementation that is - designed for very large SMP system with hundreds or - thousands of CPUs. It also scales down nicely to - smaller systems. - -config PREEMPT_RCU - bool - default y if PREEMPT - help - This option selects the RCU implementation that is - designed for very large SMP systems with hundreds or - thousands of CPUs, but for which real-time response - is also required. It also scales down nicely to - smaller systems. - - Select this option if you are unsure. - -config TINY_RCU - bool - default y if !PREEMPT && !SMP - help - This option selects the RCU implementation that is - designed for UP systems from which real-time response - is not required. This option greatly reduces the - memory footprint of RCU. - -config RCU_EXPERT - bool "Make expert-level adjustments to RCU configuration" - default n - help - This option needs to be enabled if you wish to make - expert-level adjustments to RCU configuration. By default, - no such adjustments can be made, which has the often-beneficial - side-effect of preventing "make oldconfig" from asking you all - sorts of detailed questions about how you would like numerous - obscure RCU options to be set up. - - Say Y if you need to make expert-level adjustments to RCU. - - Say N if you are unsure. - -config SRCU - bool - default y - help - This option selects the sleepable version of RCU. This version - permits arbitrary sleeping or blocking within RCU read-side critical - sections. - -config CLASSIC_SRCU - bool "Use v4.11 classic SRCU implementation" - default n - depends on RCU_EXPERT && SRCU - help - This option selects the traditional well-tested classic SRCU - implementation from v4.11, as might be desired for enterprise - Linux distributions. Without this option, the shiny new - Tiny SRCU and Tree SRCU implementations are used instead. - At some point, it is hoped that Tiny SRCU and Tree SRCU - will accumulate enough test time and confidence to allow - Classic SRCU to be dropped entirely. - - Say Y if you need a rock-solid SRCU. - - Say N if you would like help test Tree SRCU. - -config TINY_SRCU - bool - default y if SRCU && TINY_RCU && !CLASSIC_SRCU - help - This option selects the single-CPU non-preemptible version of SRCU. - -config TREE_SRCU - bool - default y if SRCU && !TINY_RCU && !CLASSIC_SRCU - help - This option selects the full-fledged version of SRCU. - -config TASKS_RCU - bool - default n - select SRCU - help - This option enables a task-based RCU implementation that uses - only voluntary context switch (not preemption!), idle, and - user-mode execution as quiescent states. - -config RCU_STALL_COMMON - def_bool ( TREE_RCU || PREEMPT_RCU || RCU_TRACE ) - help - This option enables RCU CPU stall code that is common between - the TINY and TREE variants of RCU. The purpose is to allow - the tiny variants to disable RCU CPU stall warnings, while - making these warnings mandatory for the tree variants. - -config RCU_NEED_SEGCBLIST - def_bool ( TREE_RCU || PREEMPT_RCU || TINY_SRCU || TREE_SRCU ) - -config CONTEXT_TRACKING - bool - -config CONTEXT_TRACKING_FORCE - bool "Force context tracking" - depends on CONTEXT_TRACKING - default y if !NO_HZ_FULL - help - The major pre-requirement for full dynticks to work is to - support the context tracking subsystem. But there are also - other dependencies to provide in order to make the full - dynticks working. - - This option stands for testing when an arch implements the - context tracking backend but doesn't yet fullfill all the - requirements to make the full dynticks feature working. - Without the full dynticks, there is no way to test the support - for context tracking and the subsystems that rely on it: RCU - userspace extended quiescent state and tickless cputime - accounting. This option copes with the absence of the full - dynticks subsystem by forcing the context tracking on all - CPUs in the system. - - Say Y only if you're working on the development of an - architecture backend for the context tracking. - - Say N otherwise, this option brings an overhead that you - don't want in production. - - -config RCU_FANOUT - int "Tree-based hierarchical RCU fanout value" - range 2 64 if 64BIT - range 2 32 if !64BIT - depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT - default 64 if 64BIT - default 32 if !64BIT - help - This option controls the fanout of hierarchical implementations - of RCU, allowing RCU to work efficiently on machines with - large numbers of CPUs. This value must be at least the fourth - root of NR_CPUS, which allows NR_CPUS to be insanely large. - The default value of RCU_FANOUT should be used for production - systems, but if you are stress-testing the RCU implementation - itself, small RCU_FANOUT values allow you to test large-system - code paths on small(er) systems. - - Select a specific number if testing RCU itself. - Take the default if unsure. - -config RCU_FANOUT_LEAF - int "Tree-based hierarchical RCU leaf-level fanout value" - range 2 64 if 64BIT - range 2 32 if !64BIT - depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT - default 16 - help - This option controls the leaf-level fanout of hierarchical - implementations of RCU, and allows trading off cache misses - against lock contention. Systems that synchronize their - scheduling-clock interrupts for energy-efficiency reasons will - want the default because the smaller leaf-level fanout keeps - lock contention levels acceptably low. Very large systems - (hundreds or thousands of CPUs) will instead want to set this - value to the maximum value possible in order to reduce the - number of cache misses incurred during RCU's grace-period - initialization. These systems tend to run CPU-bound, and thus - are not helped by synchronized interrupts, and thus tend to - skew them, which reduces lock contention enough that large - leaf-level fanouts work well. That said, setting leaf-level - fanout to a large number will likely cause problematic - lock contention on the leaf-level rcu_node structures unless - you boot with the skew_tick kernel parameter. - - Select a specific number if testing RCU itself. - - Select the maximum permissible value for large systems, but - please understand that you may also need to set the skew_tick - kernel boot parameter to avoid contention on the rcu_node - structure's locks. - - Take the default if unsure. - -config RCU_FAST_NO_HZ - bool "Accelerate last non-dyntick-idle CPU's grace periods" - depends on NO_HZ_COMMON && SMP && RCU_EXPERT - default n - help - This option permits CPUs to enter dynticks-idle state even if - they have RCU callbacks queued, and prevents RCU from waking - these CPUs up more than roughly once every four jiffies (by - default, you can adjust this using the rcutree.rcu_idle_gp_delay - parameter), thus improving energy efficiency. On the other - hand, this option increases the duration of RCU grace periods, - for example, slowing down synchronize_rcu(). - - Say Y if energy efficiency is critically important, and you - don't care about increased grace-period durations. - - Say N if you are unsure. - -config TREE_RCU_TRACE - def_bool RCU_TRACE && ( TREE_RCU || PREEMPT_RCU ) - select DEBUG_FS - help - This option provides tracing for the TREE_RCU and - PREEMPT_RCU implementations, permitting Makefile to - trivially select kernel/rcutree_trace.c. - -config RCU_BOOST - bool "Enable RCU priority boosting" - depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT - default n - help - This option boosts the priority of preempted RCU readers that - block the current preemptible RCU grace period for too long. - This option also prevents heavy loads from blocking RCU - callback invocation for all flavors of RCU. - - Say Y here if you are working with real-time apps or heavy loads - Say N here if you are unsure. - -config RCU_KTHREAD_PRIO - int "Real-time priority to use for RCU worker threads" - range 1 99 if RCU_BOOST - range 0 99 if !RCU_BOOST - default 1 if RCU_BOOST - default 0 if !RCU_BOOST - depends on RCU_EXPERT - help - This option specifies the SCHED_FIFO priority value that will be - assigned to the rcuc/n and rcub/n threads and is also the value - used for RCU_BOOST (if enabled). If you are working with a - real-time application that has one or more CPU-bound threads - running at a real-time priority level, you should set - RCU_KTHREAD_PRIO to a priority higher than the highest-priority - real-time CPU-bound application thread. The default RCU_KTHREAD_PRIO - value of 1 is appropriate in the common case, which is real-time - applications that do not have any CPU-bound threads. - - Some real-time applications might not have a single real-time - thread that saturates a given CPU, but instead might have - multiple real-time threads that, taken together, fully utilize - that CPU. In this case, you should set RCU_KTHREAD_PRIO to - a priority higher than the lowest-priority thread that is - conspiring to prevent the CPU from running any non-real-time - tasks. For example, if one thread at priority 10 and another - thread at priority 5 are between themselves fully consuming - the CPU time on a given CPU, then RCU_KTHREAD_PRIO should be - set to priority 6 or higher. - - Specify the real-time priority, or take the default if unsure. - -config RCU_BOOST_DELAY - int "Milliseconds to delay boosting after RCU grace-period start" - range 0 3000 - depends on RCU_BOOST - default 500 - help - This option specifies the time to wait after the beginning of - a given grace period before priority-boosting preempted RCU - readers blocking that grace period. Note that any RCU reader - blocking an expedited RCU grace period is boosted immediately. - - Accept the default if unsure. - -config RCU_NOCB_CPU - bool "Offload RCU callback processing from boot-selected CPUs" - depends on TREE_RCU || PREEMPT_RCU - depends on RCU_EXPERT || NO_HZ_FULL - default n - help - Use this option to reduce OS jitter for aggressive HPC or - real-time workloads. It can also be used to offload RCU - callback invocation to energy-efficient CPUs in battery-powered - asymmetric multiprocessors. - - This option offloads callback invocation from the set of - CPUs specified at boot time by the rcu_nocbs parameter. - For each such CPU, a kthread ("rcuox/N") will be created to - invoke callbacks, where the "N" is the CPU being offloaded, - and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and - "s" for RCU-sched. Nothing prevents this kthread from running - on the specified CPUs, but (1) the kthreads may be preempted - between each callback, and (2) affinity or cgroups can be used - to force the kthreads to run on whatever set of CPUs is desired. - - Say Y here if you want to help to debug reduced OS jitter. - Say N here if you are unsure. - -choice - prompt "Build-forced no-CBs CPUs" - default RCU_NOCB_CPU_NONE - depends on RCU_NOCB_CPU - help - This option allows no-CBs CPUs (whose RCU callbacks are invoked - from kthreads rather than from softirq context) to be specified - at build time. Additional no-CBs CPUs may be specified by - the rcu_nocbs= boot parameter. - -config RCU_NOCB_CPU_NONE - bool "No build_forced no-CBs CPUs" - help - This option does not force any of the CPUs to be no-CBs CPUs. - Only CPUs designated by the rcu_nocbs= boot parameter will be - no-CBs CPUs, whose RCU callbacks will be invoked by per-CPU - kthreads whose names begin with "rcuo". All other CPUs will - invoke their own RCU callbacks in softirq context. - - Select this option if you want to choose no-CBs CPUs at - boot time, for example, to allow testing of different no-CBs - configurations without having to rebuild the kernel each time. - -config RCU_NOCB_CPU_ZERO - bool "CPU 0 is a build_forced no-CBs CPU" - help - This option forces CPU 0 to be a no-CBs CPU, so that its RCU - callbacks are invoked by a per-CPU kthread whose name begins - with "rcuo". Additional CPUs may be designated as no-CBs - CPUs using the rcu_nocbs= boot parameter will be no-CBs CPUs. - All other CPUs will invoke their own RCU callbacks in softirq - context. - - Select this if CPU 0 needs to be a no-CBs CPU for real-time - or energy-efficiency reasons, but the real reason it exists - is to ensure that randconfig testing covers mixed systems. - -config RCU_NOCB_CPU_ALL - bool "All CPUs are build_forced no-CBs CPUs" - help - This option forces all CPUs to be no-CBs CPUs. The rcu_nocbs= - boot parameter will be ignored. All CPUs' RCU callbacks will - be executed in the context of per-CPU rcuo kthreads created for - this purpose. Assuming that the kthreads whose names start with - "rcuo" are bound to "housekeeping" CPUs, this reduces OS jitter - on the remaining CPUs, but might decrease memory locality during - RCU-callback invocation, thus potentially degrading throughput. - - Select this if all CPUs need to be no-CBs CPUs for real-time - or energy-efficiency reasons. - -endchoice - -endmenu # "RCU Subsystem" +source "kernel/rcu/Kconfig" config BUILD_BIN2C bool @@ -1156,6 +809,7 @@ config CGROUP_HUGETLB config CPUSETS bool "Cpuset controller" + depends on SMP help This option will let you create and manage CPUSETs which allow dynamically partitioning a system into sets of CPUs and diff --git a/init/main.c b/init/main.c index f866510472d7..df58a416dd1d 100644 --- a/init/main.c +++ b/init/main.c @@ -389,6 +389,7 @@ static __initdata DECLARE_COMPLETION(kthreadd_done); static noinline void __ref rest_init(void) { + struct task_struct *tsk; int pid; rcu_scheduler_starting(); @@ -397,12 +398,32 @@ static noinline void __ref rest_init(void) * the init task will end up wanting to create kthreads, which, if * we schedule it before we create kthreadd, will OOPS. */ - kernel_thread(kernel_init, NULL, CLONE_FS); + pid = kernel_thread(kernel_init, NULL, CLONE_FS); + /* + * Pin init on the boot CPU. Task migration is not properly working + * until sched_init_smp() has been run. It will set the allowed + * CPUs for init to the non isolated CPUs. + */ + rcu_read_lock(); + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id())); + rcu_read_unlock(); + numa_default_policy(); pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); rcu_read_lock(); kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns); rcu_read_unlock(); + + /* + * Enable might_sleep() and smp_processor_id() checks. + * They cannot be enabled earlier because with CONFIG_PRREMPT=y + * kernel_thread() would trigger might_sleep() splats. With + * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled + * already, but it's stuck on the kthreadd_done completion. + */ + system_state = SYSTEM_SCHEDULING; + complete(&kthreadd_done); /* @@ -1015,10 +1036,6 @@ static noinline void __init kernel_init_freeable(void) * init can allocate pages on any node */ set_mems_allowed(node_states[N_MEMORY]); - /* - * init can run on any cpu. - */ - set_cpus_allowed_ptr(current, cpu_all_mask); cad_pid = task_pid(current); |