From 773e3eb7b81e5ba13b5155dfb3bb75b8ce37f8f9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:33 -0800 Subject: init: Move radix_tree_init() early Prepare for using radix trees in early_irq_init(). Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-30-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- init/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'init') diff --git a/init/main.c b/init/main.c index 4cb47a159f02..845187822e5c 100644 --- a/init/main.c +++ b/init/main.c @@ -584,6 +584,7 @@ asmlinkage void __init start_kernel(void) local_irq_disable(); } rcu_init(); + radix_tree_init(); /* init some links before init_ISA_irqs() */ early_irq_init(); init_IRQ(); @@ -657,7 +658,6 @@ asmlinkage void __init start_kernel(void) proc_caches_init(); buffer_init(); key_init(); - radix_tree_init(); security_init(); vfs_caches_init(totalram_pages); signals_init(); -- cgit v1.2.3 From 2b633e3fac5efada088b57d31e65401f22bcc18f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:37 -0800 Subject: smp: Use nr_cpus= to set nr_cpu_ids early On x86, before prefill_possible_map(), nr_cpu_ids will be NR_CPUS aka CONFIG_NR_CPUS. Add nr_cpus= to set nr_cpu_ids. so we can simulate cpus <=8 are installed on normal config. -v2: accordging to Christoph, acpi_numa_init should use nr_cpu_ids in stead of NR_CPUS. -v3: add doc in kernel-parameters.txt according to Andrew. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-34-git-send-email-yinghai@kernel.org> Acked-by: Linus Torvalds Signed-off-by: H. Peter Anvin Cc: Tony Luck --- Documentation/kernel-parameters.txt | 6 ++++++ arch/ia64/kernel/acpi.c | 4 ++-- arch/x86/kernel/smpboot.c | 7 ++++--- drivers/acpi/numa.c | 4 ++-- init/main.c | 14 ++++++++++++++ 5 files changed, 28 insertions(+), 7 deletions(-) (limited to 'init') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 736d45602886..51bceb0fb277 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1772,6 +1772,12 @@ and is between 256 and 4096 characters. It is defined in the file purges which is reported from either PAL_VM_SUMMARY or SAL PALO. + nr_cpus= [SMP] Maximum number of processors that an SMP kernel + could support. nr_cpus=n : n >= 1 limits the kernel to + supporting 'n' processors. Later in runtime you can not + use hotplug cpu feature to put more cpu back to online. + just like you compile the kernel NR_CPUS=n + nr_uarts= [SERIAL] maximum number of UARTs to be registered. numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 40574ae11401..605a08b29721 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -881,8 +881,8 @@ __init void prefill_possible_map(void) possible = available_cpus + additional_cpus; - if (possible > NR_CPUS) - possible = NR_CPUS; + if (possible > nr_cpu_ids) + possible = nr_cpu_ids; printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", possible, max((possible - available_cpus), 0)); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 678d0b8c26f3..eff2fe175422 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1213,11 +1213,12 @@ __init void prefill_possible_map(void) total_cpus = max_t(int, possible, num_processors + disabled_cpus); - if (possible > CONFIG_NR_CPUS) { + /* nr_cpu_ids could be reduced via nr_cpus= */ + if (possible > nr_cpu_ids) { printk(KERN_WARNING "%d Processors exceeds NR_CPUS limit of %d\n", - possible, CONFIG_NR_CPUS); - possible = CONFIG_NR_CPUS; + possible, nr_cpu_ids); + possible = nr_cpu_ids; } printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 7ad48dfc12db..b8725461d887 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -279,9 +279,9 @@ int __init acpi_numa_init(void) /* SRAT: Static Resource Affinity Table */ if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY, - acpi_parse_x2apic_affinity, NR_CPUS); + acpi_parse_x2apic_affinity, nr_cpu_ids); acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, - acpi_parse_processor_affinity, NR_CPUS); + acpi_parse_processor_affinity, nr_cpu_ids); ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, acpi_parse_memory_affinity, NR_NODE_MEMBLKS); diff --git a/init/main.c b/init/main.c index 845187822e5c..05b5283e98fa 100644 --- a/init/main.c +++ b/init/main.c @@ -149,6 +149,20 @@ static int __init nosmp(char *str) early_param("nosmp", nosmp); +/* this is hard limit */ +static int __init nrcpus(char *str) +{ + int nr_cpus; + + get_option(&str, &nr_cpus); + if (nr_cpus > 0 && nr_cpus < nr_cpu_ids) + nr_cpu_ids = nr_cpus; + + return 0; +} + +early_param("nr_cpus", nrcpus); + static int __init maxcpus(char *str) { get_option(&str, &setup_max_cpus); -- cgit v1.2.3 From 2bd3a997befc226ab4b504f05c5cbba305f3e0e6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 2 Mar 2010 23:53:19 -0800 Subject: init: Open /dev/console from rootfs To avoid potential problems with an empty /dev open /dev/console from rootfs instead of waiting to mount our root filesystem and mounting it there. This effectively guarantees that there will be a device node, and it won't be on a filesystem that we will ever unmount, so there are no issues with leaving /dev/console open and pinning the filesystem. This is actually more effective than automatically mounting devtmpfs on /dev because it removes removes the occasionally problematic assumption that /dev/console exists from the boot code. With this patch I was able to throw busybox on my /boot partition (which has no /dev directory) and boot into userspace without problems. The only possible negative consequence I can think of is that someone out there deliberately used did not use a character device that is major 5 minor 2 for /dev/console. Does anyone know of a situation in which that could make sense? Signed-off-by: Eric W. Biederman Signed-off-by: Al Viro --- init/do_mounts_initrd.c | 4 ---- init/main.c | 11 ++++++----- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'init') diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index 614241b5200c..2b108538d0d9 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -30,11 +30,7 @@ static int __init do_linuxrc(void * shell) extern char * envp_init[]; sys_close(old_fd);sys_close(root_fd); - sys_close(0);sys_close(1);sys_close(2); sys_setsid(); - (void) sys_open("/dev/console",O_RDWR,0); - (void) sys_dup(0); - (void) sys_dup(0); return kernel_execve(shell, argv, envp_init); } diff --git a/init/main.c b/init/main.c index 4cb47a159f02..106e02d7ffa5 100644 --- a/init/main.c +++ b/init/main.c @@ -806,11 +806,6 @@ static noinline int init_post(void) system_state = SYSTEM_RUNNING; numa_default_policy(); - if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) - printk(KERN_WARNING "Warning: unable to open an initial console.\n"); - - (void) sys_dup(0); - (void) sys_dup(0); current->signal->flags |= SIGNAL_UNKILLABLE; @@ -873,6 +868,12 @@ static int __init kernel_init(void * unused) do_basic_setup(); + /* Open the /dev/console on the rootfs, this should never fail */ + if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) + printk(KERN_WARNING "Warning: unable to open an initial console.\n"); + + (void) sys_dup(0); + (void) sys_dup(0); /* * check if there is an early userspace init. If yes, let it do all * the work -- cgit v1.2.3 From 452aa6999e6703ffbddd7f6ea124d3968915f3e3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 5 Mar 2010 13:42:13 -0800 Subject: mm/pm: force GFP_NOIO during suspend/hibernation and resume There are quite a few GFP_KERNEL memory allocations made during suspend/hibernation and resume that may cause the system to hang, because the I/O operations they depend on cannot be completed due to the underlying devices being suspended. Avoid this problem by clearing the __GFP_IO and __GFP_FS bits in gfp_allowed_mask before suspend/hibernation and restoring the original values of these bits in gfp_allowed_mask durig the subsequent resume. [akpm@linux-foundation.org: fix CONFIG_PM=n linkage] Signed-off-by: Rafael J. Wysocki Reported-by: Maxim Levitsky Cc: Sebastian Ott Cc: Benjamin Herrenschmidt Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 7 +++---- init/main.c | 2 +- kernel/power/hibernate.c | 9 +++++++++ kernel/power/suspend.c | 3 +++ mm/page_alloc.c | 25 +++++++++++++++++++++++++ 5 files changed, 41 insertions(+), 5 deletions(-) (limited to 'init') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index e5567e6762f3..2e1b32c0484d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -83,6 +83,7 @@ struct vm_area_struct; #define GFP_HIGHUSER_MOVABLE (__GFP_WAIT | __GFP_IO | __GFP_FS | \ __GFP_HARDWALL | __GFP_HIGHMEM | \ __GFP_MOVABLE) +#define GFP_IOFS (__GFP_IO | __GFP_FS) #ifdef CONFIG_NUMA #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) @@ -337,9 +338,7 @@ void drain_local_pages(void *dummy); extern gfp_t gfp_allowed_mask; -static inline void set_gfp_allowed_mask(gfp_t mask) -{ - gfp_allowed_mask = mask; -} +extern void set_gfp_allowed_mask(gfp_t mask); +extern gfp_t clear_gfp_allowed_mask(gfp_t mask); #endif /* __LINUX_GFP_H */ diff --git a/init/main.c b/init/main.c index 40aaa020cd68..41d0f10dbbc7 100644 --- a/init/main.c +++ b/init/main.c @@ -618,7 +618,7 @@ asmlinkage void __init start_kernel(void) local_irq_enable(); /* Interrupts are enabled now so all GFP allocations are safe. */ - set_gfp_allowed_mask(__GFP_BITS_MASK); + gfp_allowed_mask = __GFP_BITS_MASK; kmem_cache_init_late(); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index bbfe472d7524..da5288ec2392 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -323,6 +323,7 @@ static int create_image(int platform_mode) int hibernation_snapshot(int platform_mode) { int error; + gfp_t saved_mask; error = platform_begin(platform_mode); if (error) @@ -334,6 +335,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); + saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Recover_platform; @@ -351,6 +353,7 @@ int hibernation_snapshot(int platform_mode) dpm_resume_end(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); + set_gfp_allowed_mask(saved_mask); resume_console(); Close: platform_end(platform_mode); @@ -445,14 +448,17 @@ static int resume_target_kernel(bool platform_mode) int hibernation_restore(int platform_mode) { int error; + gfp_t saved_mask; pm_prepare_console(); suspend_console(); + saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); dpm_resume_end(PMSG_RECOVER); } + set_gfp_allowed_mask(saved_mask); resume_console(); pm_restore_console(); return error; @@ -466,6 +472,7 @@ int hibernation_restore(int platform_mode) int hibernation_platform_enter(void) { int error; + gfp_t saved_mask; if (!hibernation_ops) return -ENOSYS; @@ -481,6 +488,7 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); + saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -518,6 +526,7 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); + set_gfp_allowed_mask(saved_mask); resume_console(); Close: diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6f10dfc2d3e9..44cce10b582d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -189,6 +189,7 @@ static int suspend_enter(suspend_state_t state) int suspend_devices_and_enter(suspend_state_t state) { int error; + gfp_t saved_mask; if (!suspend_ops) return -ENOSYS; @@ -199,6 +200,7 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); + saved_mask = clear_gfp_allowed_mask(GFP_IOFS); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -215,6 +217,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); + set_gfp_allowed_mask(saved_mask); resume_console(); Close: if (suspend_ops->end) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0734bedabd9c..298f307c63a1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -76,6 +76,31 @@ unsigned long totalreserve_pages __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; +#ifdef CONFIG_PM_SLEEP +/* + * The following functions are used by the suspend/hibernate code to temporarily + * change gfp_allowed_mask in order to avoid using I/O during memory allocations + * while devices are suspended. To avoid races with the suspend/hibernate code, + * they should always be called with pm_mutex held (gfp_allowed_mask also should + * only be modified with pm_mutex held, unless the suspend/hibernate code is + * guaranteed not to run in parallel with that modification). + */ +void set_gfp_allowed_mask(gfp_t mask) +{ + WARN_ON(!mutex_is_locked(&pm_mutex)); + gfp_allowed_mask = mask; +} + +gfp_t clear_gfp_allowed_mask(gfp_t mask) +{ + gfp_t ret = gfp_allowed_mask; + + WARN_ON(!mutex_is_locked(&pm_mutex)); + gfp_allowed_mask &= ~mask; + return ret; +} +#endif /* CONFIG_PM_SLEEP */ + #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE int pageblock_order __read_mostly; #endif -- cgit v1.2.3 From 9a85b8d6049cbb0e7961df2069322fbc4192026a Mon Sep 17 00:00:00 2001 From: Andreas Mohr Date: Fri, 5 Mar 2010 13:42:39 -0800 Subject: init/main.c: improve usability in case of init binary failure - new Documentation/init.txt file describing various forms of failure trying to load the init binary after kernel bootup - extend the init/main.c init failure message to direct to Documentation/init.txt Signed-off-by: Andreas Mohr Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/init.txt | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ init/main.c | 3 ++- 2 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 Documentation/init.txt (limited to 'init') diff --git a/Documentation/init.txt b/Documentation/init.txt new file mode 100644 index 000000000000..535ad5e82b98 --- /dev/null +++ b/Documentation/init.txt @@ -0,0 +1,49 @@ +Explaining the dreaded "No init found." boot hang message +========================================================= + +OK, so you've got this pretty unintuitive message (currently located +in init/main.c) and are wondering what the H*** went wrong. +Some high-level reasons for failure (listed roughly in order of execution) +to load the init binary are: +A) Unable to mount root FS +B) init binary doesn't exist on rootfs +C) broken console device +D) binary exists but dependencies not available +E) binary cannot be loaded + +Detailed explanations: +0) Set "debug" kernel parameter (in bootloader config file or CONFIG_CMDLINE) + to get more detailed kernel messages. +A) make sure you have the correct root FS type + (and root= kernel parameter points to the correct partition), + required drivers such as storage hardware (such as SCSI or USB!) + and filesystem (ext3, jffs2 etc.) are builtin (alternatively as modules, + to be pre-loaded by an initrd) +C) Possibly a conflict in console= setup --> initial console unavailable. + E.g. some serial consoles are unreliable due to serial IRQ issues (e.g. + missing interrupt-based configuration). + Try using a different console= device or e.g. netconsole= . +D) e.g. required library dependencies of the init binary such as + /lib/ld-linux.so.2 missing or broken. Use readelf -d |grep NEEDED + to find out which libraries are required. +E) make sure the binary's architecture matches your hardware. + E.g. i386 vs. x86_64 mismatch, or trying to load x86 on ARM hardware. + In case you tried loading a non-binary file here (shell script?), + you should make sure that the script specifies an interpreter in its shebang + header line (#!/...) that is fully working (including its library + dependencies). And before tackling scripts, better first test a simple + non-script binary such as /bin/sh and confirm its successful execution. + To find out more, add code to init/main.c to display kernel_execve()s + return values. + +Please extend this explanation whenever you find new failure causes +(after all loading the init binary is a CRITICAL and hard transition step +which needs to be made as painless as possible), then submit patch to LKML. +Further TODOs: +- Implement the various run_init_process() invocations via a struct array + which can then store the kernel_execve() result value and on failure + log it all by iterating over _all_ results (very important usability fix). +- try to make the implementation itself more helpful in general, + e.g. by providing additional error messages at affected places. + +Andreas Mohr diff --git a/init/main.c b/init/main.c index 41d0f10dbbc7..b09a828f38b5 100644 --- a/init/main.c +++ b/init/main.c @@ -847,7 +847,8 @@ static noinline int init_post(void) run_init_process("/bin/init"); run_init_process("/bin/sh"); - panic("No init found. Try passing init= option to kernel."); + panic("No init found. Try passing init= option to kernel. " + "See Linux Documentation/init.txt for guidance."); } static int __init kernel_init(void * unused) -- cgit v1.2.3 From 8aaed5bec2b9177eab1796c8c4f7a4c90804eef6 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 5 Mar 2010 13:42:39 -0800 Subject: init/initramfs.c: fix "symbol shadows an earlier one" noise The symbol 'count' is a local global variable in this file. The function clean_rootfs() should use a different symbol name to prevent "symbol shadows an earlier one" noise. Signed-off-by: H Hartley Sweeten Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/initramfs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'init') diff --git a/init/initramfs.c b/init/initramfs.c index b37d34beb90b..37d3859b1b32 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -525,7 +525,7 @@ static void __init clean_rootfs(void) int fd; void *buf; struct linux_dirent64 *dirp; - int count; + int num; fd = sys_open("/", O_RDONLY, 0); WARN_ON(fd < 0); @@ -539,9 +539,9 @@ static void __init clean_rootfs(void) } dirp = buf; - count = sys_getdents64(fd, dirp, BUF_SIZE); - while (count > 0) { - while (count > 0) { + num = sys_getdents64(fd, dirp, BUF_SIZE); + while (num > 0) { + while (num > 0) { struct stat st; int ret; @@ -554,12 +554,12 @@ static void __init clean_rootfs(void) sys_unlink(dirp->d_name); } - count -= dirp->d_reclen; + num -= dirp->d_reclen; dirp = (void *)dirp + dirp->d_reclen; } dirp = buf; memset(buf, 0, BUF_SIZE); - count = sys_getdents64(fd, dirp, BUF_SIZE); + num = sys_getdents64(fd, dirp, BUF_SIZE); } sys_close(fd); -- cgit v1.2.3 From 7463e633c5f94792dcff1afefb0d2961318a9d09 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Fri, 5 Mar 2010 13:42:46 -0800 Subject: init/main.c: make setup_max_cpus static for !SMP The only in tree external users of the symbol setup_max_cpus are in arch/x86/. The files ./kernel/alternative.c, ./kernel/visws_quirks.c, and ./mm/kmemcheck/kmemcheck.c are all guarded by CONFIG_SMP being defined. For this case the symbol is an unsigned int and declared as an extern in include/linux/smp.h. When CONFIG_SMP is not defined the symbol setup_max_cpus is a constant value that is only used in init/main.c. Make the symbol static for this case. Signed-off-by: H Hartley Sweeten Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'init') diff --git a/init/main.c b/init/main.c index b09a828f38b5..a1ab78ceb4b6 100644 --- a/init/main.c +++ b/init/main.c @@ -174,7 +174,7 @@ static int __init maxcpus(char *str) early_param("maxcpus", maxcpus); #else -const unsigned int setup_max_cpus = NR_CPUS; +static const unsigned int setup_max_cpus = NR_CPUS; #endif /* -- cgit v1.2.3 From 0dea116876eefc9c7ca9c5d74fe665481e499fa3 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 10 Mar 2010 15:22:20 -0800 Subject: cgroup: implement eventfd-based generic API for notifications This patchset introduces eventfd-based API for notifications in cgroups and implements memory notifications on top of it. It uses statistics in memory controler to track memory usage. Output of time(1) on building kernel on tmpfs: Root cgroup before changes: make -j2 506.37 user 60.93s system 193% cpu 4:52.77 total Non-root cgroup before changes: make -j2 507.14 user 62.66s system 193% cpu 4:54.74 total Root cgroup after changes (0 thresholds): make -j2 507.13 user 62.20s system 193% cpu 4:53.55 total Non-root cgroup after changes (0 thresholds): make -j2 507.70 user 64.20s system 193% cpu 4:55.70 total Root cgroup after changes (1 thresholds, never crossed): make -j2 506.97 user 62.20s system 193% cpu 4:53.90 total Non-root cgroup after changes (1 thresholds, never crossed): make -j2 507.55 user 64.08s system 193% cpu 4:55.63 total This patch: Introduce the write-only file "cgroup.event_control" in every cgroup. To register new notification handler you need: - create an eventfd; - open a control file to be monitored. Callbacks register_event() and unregister_event() must be defined for the control file; - write " " to cgroup.event_control. Interpretation of args is defined by control file implementation; eventfd will be woken up by control file implementation or when the cgroup is removed. To unregister notification handler just close eventfd. If you need notification functionality for a control file you have to implement callbacks register_event() and unregister_event() in the struct cftype. [kamezawa.hiroyu@jp.fujitsu.com: Kconfig fix] Signed-off-by: Kirill A. Shutemov Reviewed-by: KAMEZAWA Hiroyuki Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Pavel Emelyanov Cc: Dan Malek Cc: Vladislav Buzov Cc: Daisuke Nishimura Cc: Alexander Shishkin Cc: Davide Libenzi Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 20 ++++ include/linux/cgroup.h | 24 ++++ init/Kconfig | 1 + kernel/cgroup.c | 228 +++++++++++++++++++++++++++++++++++++- 4 files changed, 272 insertions(+), 1 deletion(-) (limited to 'init') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index c0358c30c64f..fd588ff0e296 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -23,6 +23,7 @@ CONTENTS: 2.1 Basic Usage 2.2 Attaching processes 2.3 Mounting hierarchies by name + 2.4 Notification API 3. Kernel API 3.1 Overview 3.2 Synchronization @@ -435,6 +436,25 @@ you give a subsystem a name. The name of the subsystem appears as part of the hierarchy description in /proc/mounts and /proc//cgroups. +2.4 Notification API +-------------------- + +There is mechanism which allows to get notifications about changing +status of a cgroup. + +To register new notification handler you need: + - create a file descriptor for event notification using eventfd(2); + - open a control file to be monitored (e.g. memory.usage_in_bytes); + - write " " to cgroup.event_control. + Interpretation of args is defined by control file implementation; + +eventfd will be woken up by control file implementation or when the +cgroup is removed. + +To unregister notification handler just close eventfd. + +NOTE: Support of notifications should be implemented for the control +file. See documentation for the subsystem. 3. Kernel API ============= diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 2a59d3101e5d..b4f2201321cd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -235,6 +235,10 @@ struct cgroup { /* For RCU-protected deletion */ struct rcu_head rcu_head; + + /* List of events which userspace want to recieve */ + struct list_head event_list; + spinlock_t event_list_lock; }; /* @@ -378,6 +382,26 @@ struct cftype { int (*trigger)(struct cgroup *cgrp, unsigned int event); int (*release)(struct inode *inode, struct file *file); + + /* + * register_event() callback will be used to add new userspace + * waiter for changes related to the cftype. Implement it if + * you want to provide this functionality. Use eventfd_signal() + * on eventfd to send notification to userspace. + */ + int (*register_event)(struct cgroup *cgrp, struct cftype *cft, + struct eventfd_ctx *eventfd, const char *args); + /* + * unregister_event() callback will be called when userspace + * closes the eventfd or on cgroup removing. + * This callback must be implemented, if you want provide + * notification functionality. + * + * Be careful. It can be called after destroy(), so you have + * to keep all nesessary data, until all events are removed. + */ + int (*unregister_event)(struct cgroup *cgrp, struct cftype *cft, + struct eventfd_ctx *eventfd); }; struct cgroup_scanner { diff --git a/init/Kconfig b/init/Kconfig index 089a230e5652..eb77e8ccde1c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -463,6 +463,7 @@ config HAVE_UNSTABLE_SCHED_CLOCK menuconfig CGROUPS boolean "Control Group support" + depends on EVENTFD help This option adds support for grouping sets of processes together, for use with process control subsystems such as Cpusets, CFS, memory diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1bf4d6db54ab..ea94984a3895 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4,6 +4,10 @@ * Based originally on the cpuset system, extracted by Paul Menage * Copyright (C) 2006 Google, Inc * + * Notifications support + * Copyright (C) 2009 Nokia Corporation + * Author: Kirill A. Shutemov + * * Copyright notices from the original cpuset code: * -------------------------------------------------- * Copyright (C) 2003 BULL SA. @@ -53,6 +57,8 @@ #include #include #include /* TODO: replace with more sophisticated array */ +#include +#include #include @@ -152,6 +158,35 @@ struct css_id { unsigned short stack[0]; /* Array of Length (depth+1) */ }; +/* + * cgroup_event represents events which userspace want to recieve. + */ +struct cgroup_event { + /* + * Cgroup which the event belongs to. + */ + struct cgroup *cgrp; + /* + * Control file which the event associated. + */ + struct cftype *cft; + /* + * eventfd to signal userspace about the event. + */ + struct eventfd_ctx *eventfd; + /* + * Each of these stored in a list by the cgroup. + */ + struct list_head list; + /* + * All fields below needed to unregister event when + * userspace closes eventfd. + */ + poll_table pt; + wait_queue_head_t *wqh; + wait_queue_t wait; + struct work_struct remove; +}; /* The list of hierarchy roots */ @@ -760,14 +795,28 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) static int cgroup_call_pre_destroy(struct cgroup *cgrp) { struct cgroup_subsys *ss; + struct cgroup_event *event, *tmp; int ret = 0; for_each_subsys(cgrp->root, ss) if (ss->pre_destroy) { ret = ss->pre_destroy(ss, cgrp); if (ret) - break; + goto out; } + + /* + * Unregister events and notify userspace. + */ + spin_lock(&cgrp->event_list_lock); + list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { + list_del(&event->list); + eventfd_signal(event->eventfd, 1); + schedule_work(&event->remove); + } + spin_unlock(&cgrp->event_list_lock); + +out: return ret; } @@ -1239,6 +1288,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); + INIT_LIST_HEAD(&cgrp->event_list); + spin_lock_init(&cgrp->event_list_lock); } static void init_cgroup_root(struct cgroupfs_root *root) @@ -2077,6 +2128,16 @@ static const struct inode_operations cgroup_dir_inode_operations = { .rename = cgroup_rename, }; +/* + * Check if a file is a control file + */ +static inline struct cftype *__file_cft(struct file *file) +{ + if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) + return ERR_PTR(-EINVAL); + return __d_cft(file->f_dentry); +} + static int cgroup_create_file(struct dentry *dentry, mode_t mode, struct super_block *sb) { @@ -2930,6 +2991,166 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp, return 0; } +/* + * Unregister event and free resources. + * + * Gets called from workqueue. + */ +static void cgroup_event_remove(struct work_struct *work) +{ + struct cgroup_event *event = container_of(work, struct cgroup_event, + remove); + struct cgroup *cgrp = event->cgrp; + + /* TODO: check return code */ + event->cft->unregister_event(cgrp, event->cft, event->eventfd); + + eventfd_ctx_put(event->eventfd); + remove_wait_queue(event->wqh, &event->wait); + kfree(event); +} + +/* + * Gets called on POLLHUP on eventfd when user closes it. + * + * Called with wqh->lock held and interrupts disabled. + */ +static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + struct cgroup_event *event = container_of(wait, + struct cgroup_event, wait); + struct cgroup *cgrp = event->cgrp; + unsigned long flags = (unsigned long)key; + + if (flags & POLLHUP) { + spin_lock(&cgrp->event_list_lock); + list_del(&event->list); + spin_unlock(&cgrp->event_list_lock); + /* + * We are in atomic context, but cgroup_event_remove() may + * sleep, so we have to call it in workqueue. + */ + schedule_work(&event->remove); + } + + return 0; +} + +static void cgroup_event_ptable_queue_proc(struct file *file, + wait_queue_head_t *wqh, poll_table *pt) +{ + struct cgroup_event *event = container_of(pt, + struct cgroup_event, pt); + + event->wqh = wqh; + add_wait_queue(wqh, &event->wait); +} + +/* + * Parse input and register new cgroup event handler. + * + * Input must be in format ' '. + * Interpretation of args is defined by control file implementation. + */ +static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + struct cgroup_event *event = NULL; + unsigned int efd, cfd; + struct file *efile = NULL; + struct file *cfile = NULL; + char *endp; + int ret; + + efd = simple_strtoul(buffer, &endp, 10); + if (*endp != ' ') + return -EINVAL; + buffer = endp + 1; + + cfd = simple_strtoul(buffer, &endp, 10); + if ((*endp != ' ') && (*endp != '\0')) + return -EINVAL; + buffer = endp + 1; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + event->cgrp = cgrp; + INIT_LIST_HEAD(&event->list); + init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); + init_waitqueue_func_entry(&event->wait, cgroup_event_wake); + INIT_WORK(&event->remove, cgroup_event_remove); + + efile = eventfd_fget(efd); + if (IS_ERR(efile)) { + ret = PTR_ERR(efile); + goto fail; + } + + event->eventfd = eventfd_ctx_fileget(efile); + if (IS_ERR(event->eventfd)) { + ret = PTR_ERR(event->eventfd); + goto fail; + } + + cfile = fget(cfd); + if (!cfile) { + ret = -EBADF; + goto fail; + } + + /* the process need read permission on control file */ + ret = file_permission(cfile, MAY_READ); + if (ret < 0) + goto fail; + + event->cft = __file_cft(cfile); + if (IS_ERR(event->cft)) { + ret = PTR_ERR(event->cft); + goto fail; + } + + if (!event->cft->register_event || !event->cft->unregister_event) { + ret = -EINVAL; + goto fail; + } + + ret = event->cft->register_event(cgrp, event->cft, + event->eventfd, buffer); + if (ret) + goto fail; + + if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { + event->cft->unregister_event(cgrp, event->cft, event->eventfd); + ret = 0; + goto fail; + } + + spin_lock(&cgrp->event_list_lock); + list_add(&event->list, &cgrp->event_list); + spin_unlock(&cgrp->event_list_lock); + + fput(cfile); + fput(efile); + + return 0; + +fail: + if (cfile) + fput(cfile); + + if (event && event->eventfd && !IS_ERR(event->eventfd)) + eventfd_ctx_put(event->eventfd); + + if (!IS_ERR_OR_NULL(efile)) + fput(efile); + + kfree(event); + + return ret; +} + /* * for the common functions, 'private' gives the type of file */ @@ -2955,6 +3176,11 @@ static struct cftype files[] = { .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, }, + { + .name = CGROUP_FILE_GENERIC_PREFIX "event_control", + .write_string = cgroup_write_event_control, + .mode = S_IWUGO, + }, }; static struct cftype cft_release_agent = { -- cgit v1.2.3 From 5ab116c9349ef52d6fbd2e2917a53f13194b048e Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 23 Mar 2010 13:35:34 -0700 Subject: cpuset: fix the problem that cpuset_mem_spread_node() returns an offline node cpuset_mem_spread_node() returns an offline node, and causes an oops. This patch fixes it by initializing task->mems_allowed to node_states[N_HIGH_MEMORY], and updating task->mems_allowed when doing memory hotplug. Signed-off-by: Miao Xie Acked-by: David Rientjes Reported-by: Nick Piggin Tested-by: Nick Piggin Cc: Paul Menage Cc: Li Zefan Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/main.c | 2 +- kernel/cpuset.c | 20 ++++++++++++-------- kernel/kthread.c | 2 +- 3 files changed, 14 insertions(+), 10 deletions(-) (limited to 'init') diff --git a/init/main.c b/init/main.c index a1ab78ceb4b6..cbead27caefc 100644 --- a/init/main.c +++ b/init/main.c @@ -858,7 +858,7 @@ static int __init kernel_init(void * unused) /* * init can allocate pages on any node */ - set_mems_allowed(node_possible_map); + set_mems_allowed(node_states[N_HIGH_MEMORY]); /* * init can run on any cpu. */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ba401fab459f..5d38bd74483c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -920,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * call to guarantee_online_mems(), as we know no one is changing * our task's cpuset. * - * Hold callback_mutex around the two modifications of our tasks - * mems_allowed to synchronize with cpuset_mems_allowed(). - * * While the mm_struct we are migrating is typically from some * other task, the task_struct mems_allowed that we are hacking * is for our current task, which must allocate new pages for that @@ -1391,11 +1388,10 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); - to = node_possible_map; } else { guarantee_online_cpus(cs, cpus_attach); - guarantee_online_mems(cs, &to); } + guarantee_online_mems(cs, &to); /* do per-task migration stuff possibly for each in the threadgroup */ cpuset_attach_task(tsk, &to, cs); @@ -2090,15 +2086,23 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { + nodemask_t oldmems; + cgroup_lock(); switch (action) { case MEM_ONLINE: - case MEM_OFFLINE: + oldmems = top_cpuset.mems_allowed; mutex_lock(&callback_mutex); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; mutex_unlock(&callback_mutex); - if (action == MEM_OFFLINE) - scan_for_empty_cpusets(&top_cpuset); + update_tasks_nodemask(&top_cpuset, &oldmems, NULL); + break; + case MEM_OFFLINE: + /* + * needn't update top_cpuset.mems_allowed explicitly because + * scan_for_empty_cpusets() will update it. + */ + scan_for_empty_cpusets(&top_cpuset); break; default: break; diff --git a/kernel/kthread.c b/kernel/kthread.c index 82ed0ea15194..83911c780175 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -219,7 +219,7 @@ int kthreadd(void *unused) set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, cpu_all_mask); - set_mems_allowed(node_possible_map); + set_mems_allowed(node_states[N_HIGH_MEMORY]); current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; -- cgit v1.2.3