summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLi Shaohua <shaohua.li@intel.com>2005-06-26 01:54:56 +0400
committerLinus Torvalds <torvalds@ppc970.osdl.org>2005-06-26 03:24:30 +0400
commite1367daf3eed5cd619ee88c9907e1e6ddaa58406 (patch)
treedce60efefba356e0a914669587586a6174e41b94
parent0bb3184df537002a742bafddf3f4fb482b7fe610 (diff)
downloadlinux-e1367daf3eed5cd619ee88c9907e1e6ddaa58406.tar.xz
[PATCH] cpu state clean after hot remove
Clean CPU states in order to reuse smp boot code for CPU hotplug. Signed-off-by: Li Shaohua<shaohua.li@intel.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--arch/i386/kernel/cpu/common.c12
-rw-r--r--arch/i386/kernel/irq.c5
-rw-r--r--arch/i386/kernel/process.c20
-rw-r--r--arch/i386/kernel/smpboot.c175
-rw-r--r--drivers/base/cpu.c8
-rw-r--r--include/asm-i386/irq.h2
-rw-r--r--include/asm-i386/smp.h8
7 files changed, 187 insertions, 43 deletions
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index aac74758caf4..2203a9d20212 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -651,3 +651,15 @@ void __devinit cpu_init(void)
clear_used_math();
mxcsr_feature_mask_init();
}
+
+#ifdef CONFIG_HOTPLUG_CPU
+void __devinit cpu_uninit(void)
+{
+ int cpu = raw_smp_processor_id();
+ cpu_clear(cpu, cpu_initialized);
+
+ /* lazy TLB state */
+ per_cpu(cpu_tlbstate, cpu).state = 0;
+ per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
+}
+#endif
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index af115004aec5..ce66dcc26d90 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -156,6 +156,11 @@ void irq_ctx_init(int cpu)
cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
}
+void irq_ctx_exit(int cpu)
+{
+ hardirq_ctx[cpu] = NULL;
+}
+
extern asmlinkage void __do_softirq(void);
asmlinkage void do_softirq(void)
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index e06f2dc7123d..5f8cfa6b7940 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -152,21 +152,19 @@ static void poll_idle (void)
/* We don't actually take CPU down, just spin without interrupts. */
static inline void play_dead(void)
{
+ /* This must be done before dead CPU ack */
+ cpu_exit_clear();
+ wbinvd();
+ mb();
/* Ack it */
__get_cpu_var(cpu_state) = CPU_DEAD;
- /* We shouldn't have to disable interrupts while dead, but
- * some interrupts just don't seem to go away, and this makes
- * it "work" for testing purposes. */
- /* Death loop */
- while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)
- cpu_relax();
-
+ /*
+ * With physical CPU hotplug, we should halt the cpu
+ */
local_irq_disable();
- __flush_tlb_all();
- cpu_set(smp_processor_id(), cpu_online_map);
- enable_APIC_timer();
- local_irq_enable();
+ while (1)
+ __asm__ __volatile__("hlt":::"memory");
}
#else
static inline void play_dead(void)
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index fb0b200d1d85..d66bf489a2e9 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -90,6 +90,12 @@ cpumask_t cpu_callout_map;
EXPORT_SYMBOL(cpu_callout_map);
static cpumask_t smp_commenced_mask;
+/* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there
+ * is no way to resync one AP against BP. TBD: for prescott and above, we
+ * should use IA64's algorithm
+ */
+static int __devinitdata tsc_sync_disabled;
+
/* Per CPU bogomips and other parameters */
struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
EXPORT_SYMBOL(cpu_data);
@@ -427,7 +433,7 @@ static void __devinit smp_callin(void)
/*
* Synchronize the TSC with the BP
*/
- if (cpu_has_tsc && cpu_khz)
+ if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled)
synchronize_tsc_ap();
}
@@ -507,6 +513,7 @@ static void __devinit start_secondary(void *unused)
lock_ipi_call_lock();
cpu_set(smp_processor_id(), cpu_online_map);
unlock_ipi_call_lock();
+ per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
/* We can take interrupts now: we're officially "up". */
local_irq_enable();
@@ -816,8 +823,43 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
#endif /* WAKE_SECONDARY_VIA_INIT */
extern cpumask_t cpu_initialized;
+static inline int alloc_cpu_id(void)
+{
+ cpumask_t tmp_map;
+ int cpu;
+ cpus_complement(tmp_map, cpu_present_map);
+ cpu = first_cpu(tmp_map);
+ if (cpu >= NR_CPUS)
+ return -ENODEV;
+ return cpu;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static struct task_struct * __devinitdata cpu_idle_tasks[NR_CPUS];
+static inline struct task_struct * alloc_idle_task(int cpu)
+{
+ struct task_struct *idle;
+
+ if ((idle = cpu_idle_tasks[cpu]) != NULL) {
+ /* initialize thread_struct. we really want to avoid destroy
+ * idle tread
+ */
+ idle->thread.esp = (unsigned long)(((struct pt_regs *)
+ (THREAD_SIZE + (unsigned long) idle->thread_info)) - 1);
+ init_idle(idle, cpu);
+ return idle;
+ }
+ idle = fork_idle(cpu);
+
+ if (!IS_ERR(idle))
+ cpu_idle_tasks[cpu] = idle;
+ return idle;
+}
+#else
+#define alloc_idle_task(cpu) fork_idle(cpu)
+#endif
-static int __devinit do_boot_cpu(int apicid)
+static int __devinit do_boot_cpu(int apicid, int cpu)
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
* (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -826,16 +868,17 @@ static int __devinit do_boot_cpu(int apicid)
{
struct task_struct *idle;
unsigned long boot_error;
- int timeout, cpu;
+ int timeout;
unsigned long start_eip;
unsigned short nmi_high = 0, nmi_low = 0;
- cpu = ++cpucount;
+ ++cpucount;
+
/*
* We can't use kernel_thread since we must avoid to
* reschedule the child.
*/
- idle = fork_idle(cpu);
+ idle = alloc_idle_task(cpu);
if (IS_ERR(idle))
panic("failed fork for CPU %d", cpu);
idle->thread.eip = (unsigned long) start_secondary;
@@ -902,13 +945,16 @@ static int __devinit do_boot_cpu(int apicid)
inquire_remote_apic(apicid);
}
}
- x86_cpu_to_apicid[cpu] = apicid;
+
if (boot_error) {
/* Try to put things back the way they were before ... */
unmap_cpu_to_logical_apicid(cpu);
cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
cpucount--;
+ } else {
+ x86_cpu_to_apicid[cpu] = apicid;
+ cpu_set(cpu, cpu_present_map);
}
/* mark "stuck" area as not stuck */
@@ -917,6 +963,75 @@ static int __devinit do_boot_cpu(int apicid)
return boot_error;
}
+#ifdef CONFIG_HOTPLUG_CPU
+void cpu_exit_clear(void)
+{
+ int cpu = raw_smp_processor_id();
+
+ idle_task_exit();
+
+ cpucount --;
+ cpu_uninit();
+ irq_ctx_exit(cpu);
+
+ cpu_clear(cpu, cpu_callout_map);
+ cpu_clear(cpu, cpu_callin_map);
+ cpu_clear(cpu, cpu_present_map);
+
+ cpu_clear(cpu, smp_commenced_mask);
+ unmap_cpu_to_logical_apicid(cpu);
+}
+
+struct warm_boot_cpu_info {
+ struct completion *complete;
+ int apicid;
+ int cpu;
+};
+
+static void __devinit do_warm_boot_cpu(void *p)
+{
+ struct warm_boot_cpu_info *info = p;
+ do_boot_cpu(info->apicid, info->cpu);
+ complete(info->complete);
+}
+
+int __devinit smp_prepare_cpu(int cpu)
+{
+ DECLARE_COMPLETION(done);
+ struct warm_boot_cpu_info info;
+ struct work_struct task;
+ int apicid, ret;
+
+ lock_cpu_hotplug();
+ apicid = x86_cpu_to_apicid[cpu];
+ if (apicid == BAD_APICID) {
+ ret = -ENODEV;
+ goto exit;
+ }
+
+ info.complete = &done;
+ info.apicid = apicid;
+ info.cpu = cpu;
+ INIT_WORK(&task, do_warm_boot_cpu, &info);
+
+ tsc_sync_disabled = 1;
+
+ /* init low mem mapping */
+ memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
+ sizeof(swapper_pg_dir[0]) * KERNEL_PGD_PTRS);
+ flush_tlb_all();
+ schedule_work(&task);
+ wait_for_completion(&done);
+
+ tsc_sync_disabled = 0;
+ zap_low_mappings();
+ ret = 0;
+exit:
+ unlock_cpu_hotplug();
+ return ret;
+}
+#endif
+
static void smp_tune_scheduling (void)
{
unsigned long cachesize; /* kB */
@@ -1069,7 +1184,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
if (max_cpus <= cpucount+1)
continue;
- if (do_boot_cpu(apicid))
+ if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
printk("CPU #%d not responding - cannot use it.\n",
apicid);
else
@@ -1149,25 +1264,24 @@ void __devinit smp_prepare_boot_cpu(void)
{
cpu_set(smp_processor_id(), cpu_online_map);
cpu_set(smp_processor_id(), cpu_callout_map);
+ cpu_set(smp_processor_id(), cpu_present_map);
+ per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
}
#ifdef CONFIG_HOTPLUG_CPU
-
-/* must be called with the cpucontrol mutex held */
-static int __devinit cpu_enable(unsigned int cpu)
+static void
+remove_siblinginfo(int cpu)
{
- /* get the target out of its holding state */
- per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
- wmb();
-
- /* wait for the processor to ack it. timeout? */
- while (!cpu_online(cpu))
- cpu_relax();
-
- fixup_irqs(cpu_online_map);
- /* counter the disable in fixup_irqs() */
- local_irq_enable();
- return 0;
+ int sibling;
+
+ for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
+ cpu_clear(cpu, cpu_sibling_map[sibling]);
+ for_each_cpu_mask(sibling, cpu_core_map[cpu])
+ cpu_clear(cpu, cpu_core_map[sibling]);
+ cpus_clear(cpu_sibling_map[cpu]);
+ cpus_clear(cpu_core_map[cpu]);
+ phys_proc_id[cpu] = BAD_APICID;
+ cpu_core_id[cpu] = BAD_APICID;
}
int __cpu_disable(void)
@@ -1193,6 +1307,8 @@ int __cpu_disable(void)
mdelay(1);
local_irq_disable();
+ remove_siblinginfo(cpu);
+
cpu_clear(cpu, map);
fixup_irqs(map);
/* It's now safe to remove this processor from the online map */
@@ -1207,8 +1323,10 @@ void __cpu_die(unsigned int cpu)
for (i = 0; i < 10; i++) {
/* They ack this in play_dead by setting CPU_DEAD */
- if (per_cpu(cpu_state, cpu) == CPU_DEAD)
+ if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+ printk ("CPU %d is now offline\n", cpu);
return;
+ }
current->state = TASK_UNINTERRUPTIBLE;
schedule_timeout(HZ/10);
}
@@ -1236,15 +1354,8 @@ int __devinit __cpu_up(unsigned int cpu)
return -EIO;
}
-#ifdef CONFIG_HOTPLUG_CPU
- /* Already up, and in cpu_quiescent now? */
- if (cpu_isset(cpu, smp_commenced_mask)) {
- cpu_enable(cpu);
- return 0;
- }
-#endif
-
local_irq_enable();
+ per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
/* Unleash the CPU! */
cpu_set(cpu, smp_commenced_mask);
while (!cpu_isset(cpu, cpu_online_map))
@@ -1258,10 +1369,12 @@ void __init smp_cpus_done(unsigned int max_cpus)
setup_ioapic_dest();
#endif
zap_low_mappings();
+#ifndef CONFIG_HOTPLUG_CPU
/*
* Disable executability of the SMP trampoline:
*/
set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
+#endif
}
void __init smp_intr_init(void)
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 6ef3069b5710..bdd7e9f55c81 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -16,6 +16,10 @@ struct sysdev_class cpu_sysdev_class = {
EXPORT_SYMBOL(cpu_sysdev_class);
#ifdef CONFIG_HOTPLUG_CPU
+#ifndef __HAVE_ARCH_SMP_PREPARE_CPU
+#define smp_prepare_cpu(cpu) (0)
+#endif
+
static ssize_t show_online(struct sys_device *dev, char *buf)
{
struct cpu *cpu = container_of(dev, struct cpu, sysdev);
@@ -36,7 +40,9 @@ static ssize_t store_online(struct sys_device *dev, const char *buf,
kobject_hotplug(&dev->kobj, KOBJ_OFFLINE);
break;
case '1':
- ret = cpu_up(cpu->sysdev.id);
+ ret = smp_prepare_cpu(cpu->sysdev.id);
+ if (ret == 0)
+ ret = cpu_up(cpu->sysdev.id);
break;
default:
ret = -EINVAL;
diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h
index e2d8bf23ad70..270f1986b19f 100644
--- a/include/asm-i386/irq.h
+++ b/include/asm-i386/irq.h
@@ -29,9 +29,11 @@ extern void release_vm86_irqs(struct task_struct *);
#ifdef CONFIG_4KSTACKS
extern void irq_ctx_init(int cpu);
+ extern void irq_ctx_exit(int cpu);
# define __ARCH_HAS_DO_SOFTIRQ
#else
# define irq_ctx_init(cpu) do { } while (0)
+# define irq_ctx_exit(cpu) do { } while (0)
#endif
#ifdef CONFIG_IRQBALANCE
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h
index 2451ead0ca5c..c9996eda5408 100644
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -48,6 +48,14 @@ extern void unlock_ipi_call_lock(void);
#define MAX_APICID 256
extern u8 x86_cpu_to_apicid[];
+#ifdef CONFIG_HOTPLUG_CPU
+extern void cpu_exit_clear(void);
+extern void cpu_uninit(void);
+
+#define __HAVE_ARCH_SMP_PREPARE_CPU
+extern int smp_prepare_cpu(int cpu);
+#endif
+
/*
* This function is needed by all SMP systems. It must _always_ be valid
* from the initial startup. We map APIC_BASE very early in page_setup(),