arch/x86/kernel/acpi/madt_wakeup.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292

// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/acpi.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/io.h>
#include <linux/kexec.h>
#include <linux/memblock.h>
#include <linux/pgtable.h>
#include <linux/sched/hotplug.h>
#include <asm/apic.h>
#include <asm/barrier.h>
#include <asm/init.h>
#include <asm/intel_pt.h>
#include <asm/nmi.h>
#include <asm/processor.h>
#include <asm/reboot.h>

/*
 * Physical address of the Multiprocessor Wakeup Structure mailbox, cached
 * from the MADT entry by acpi_parse_mp_wake(). acpi_wakeup_cpu() treats a
 * zero value as "no mailbox available".
 */
static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;

/*
 * Virtual address of the Multiprocessor Wakeup Structure mailbox. NULL until
 * the first acpi_wakeup_cpu() call maps the mailbox with memremap().
 */
static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox __ro_after_init;

/*
 * Physical address of the top-level page table built by
 * acpi_mp_setup_reset(); passed to asm_acpi_mp_play_dead().
 */
static u64 acpi_mp_pgd __ro_after_init;
/*
 * Physical address of the reset vector that acpi_parse_mp_wake() reads from
 * the MADT wakeup structure; passed to asm_acpi_mp_play_dead().
 */
static u64 acpi_mp_reset_vector_paddr __ro_after_init;

static void acpi_mp_stop_this_cpu(void)
{
	asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
}

static void acpi_mp_play_dead(void)
{
	play_dead_common();
	asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
}

/*
 * Installed as smp_ops.cpu_die by acpi_mp_setup_reset().
 *
 * Confirm that the BIOS has taken control of @cpu before it is declared
 * dead: post a TEST command for the CPU's APIC ID in the mailbox and wait
 * for the 'command' field to be cleared. An error is logged if that does
 * not happen within roughly one second.
 */
static void acpi_mp_cpu_die(unsigned int cpu)
{
	unsigned long budget_us = USEC_PER_SEC;
	u32 target_apicid = per_cpu(x86_cpu_to_apicid, cpu);

	/* The APIC ID must be in place before the command becomes visible. */
	acpi_mp_wake_mailbox->apic_id = target_apicid;
	smp_store_release(&acpi_mp_wake_mailbox->command,
			  ACPI_MP_WAKE_COMMAND_TEST);

	/* Poll in 1us steps; give up once the budget is exhausted. */
	for (;;) {
		/* BIOS clears 'command' once it owns the CPU. */
		if (!READ_ONCE(acpi_mp_wake_mailbox->command))
			return;

		if (!--budget_us)
			break;

		udelay(1);
	}

	pr_err("Failed to hand over CPU %d to BIOS\n", cpu);
}

/*
 * Allocate one page-aligned page from memblock for use as a page-table
 * page. Callers treat a NULL result as allocation failure.
 *
 * @dummy is unused; it exists only so that the signature matches
 * x86_mapping_info::alloc_pgt_page.
 */
static void __init *alloc_pgt_page(void *dummy)
{
	void *page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);

	return page;
}

/*
 * Give one page-table page back to memblock.
 *
 * @pgt:   page previously handed out by alloc_pgt_page()
 * @dummy: unused; present because this function is installed as
 *         x86_mapping_info::free_pgt_page, which takes a second argument
 */
static void __init free_pgt_page(void *pgt, void *dummy)
{
	/*
	 * No 'return' in front of the call: returning an expression from a
	 * void function is not valid ISO C, even when the expression is void.
	 */
	memblock_free(pgt, PAGE_SIZE);
}

/*
 * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
 * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches
 * to the identity mapping and the function has to be present at the same spot
 * in the virtual address space before and after switching page tables.
 *
 * @pgd: top-level table of the page-table hierarchy being built by
 *       acpi_mp_setup_reset()
 *
 * Walks @pgd down to the PTE covering the kernel virtual address of
 * asm_acpi_mp_play_dead(), allocating any missing intermediate table with
 * alloc_pgt_page(), and installs a single PTE for that address.
 *
 * Return: 0 on success, -ENOMEM if an intermediate table cannot be
 * allocated. Tables linked in before a failure are left in place; the
 * caller is responsible for tearing down @pgd.
 */
static int __init init_transition_pgtable(pgd_t *pgd)
{
	pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
	unsigned long vaddr, paddr;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Kernel virtual address that must stay valid across the switch. */
	vaddr = (unsigned long)asm_acpi_mp_play_dead;
	/* Step to the top-level entry that covers vaddr. */
	pgd += pgd_index(vaddr);
	if (!pgd_present(*pgd)) {
		p4d = (p4d_t *)alloc_pgt_page(NULL);
		if (!p4d)
			return -ENOMEM;
		set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
	}
	p4d = p4d_offset(pgd, vaddr);
	if (!p4d_present(*p4d)) {
		pud = (pud_t *)alloc_pgt_page(NULL);
		if (!pud)
			return -ENOMEM;
		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
	}
	pud = pud_offset(p4d, vaddr);
	if (!pud_present(*pud)) {
		pmd = (pmd_t *)alloc_pgt_page(NULL);
		if (!pmd)
			return -ENOMEM;
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
	}
	pmd = pmd_offset(pud, vaddr);
	if (!pmd_present(*pmd)) {
		pte = (pte_t *)alloc_pgt_page(NULL);
		if (!pte)
			return -ENOMEM;
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
	}
	pte = pte_offset_kernel(pmd, vaddr);

	/*
	 * Map vaddr to the physical page backing it. Only this one PTE is
	 * written, so exactly one page is covered by the transition mapping.
	 */
	paddr = __pa(vaddr);
	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));

	return 0;
}

/*
 * Prepare everything needed to hand a CPU back to the firmware.
 *
 * @reset_vector: physical address of the reset vector from the MADT wakeup
 *                structure
 *
 * Builds a private page-table hierarchy that identity-maps every range in
 * pfn_mapped[] plus the page holding @reset_vector, adds the transition
 * mapping for asm_acpi_mp_play_dead(), and then installs the play_dead,
 * stop_this_cpu and cpu_die callbacks in smp_ops.
 *
 * Return: 0 on success, -ENOMEM if any part of the page-table setup fails
 * (the partially built hierarchy is released first).
 */
static int __init acpi_mp_setup_reset(u64 reset_vector)
{
	struct x86_mapping_info info = {
		.alloc_pgt_page = alloc_pgt_page,
		.free_pgt_page	= free_pgt_page,
		.page_flag      = __PAGE_KERNEL_LARGE_EXEC,
		.kernpg_flag    = _KERNPG_TABLE_NOENC,
	};
	unsigned long range_start, range_end;
	pgd_t *pgd;
	int i;

	pgd = alloc_pgt_page(NULL);
	if (!pgd)
		return -ENOMEM;

	/* Identity-map every range recorded in pfn_mapped[]. */
	for (i = 0; i < nr_pfn_mapped; i++) {
		range_start = pfn_mapped[i].start << PAGE_SHIFT;
		range_end   = pfn_mapped[i].end << PAGE_SHIFT;

		if (kernel_ident_mapping_init(&info, pgd, range_start, range_end))
			goto err_free_pgtable;
	}

	/* Identity-map the page that contains the reset vector. */
	if (kernel_ident_mapping_init(&info, pgd,
				      PAGE_ALIGN_DOWN(reset_vector),
				      PAGE_ALIGN(reset_vector + 1)))
		goto err_free_pgtable;

	if (init_transition_pgtable(pgd))
		goto err_free_pgtable;

	smp_ops.play_dead = acpi_mp_play_dead;
	smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu;
	smp_ops.cpu_die = acpi_mp_cpu_die;

	acpi_mp_reset_vector_paddr = reset_vector;
	acpi_mp_pgd = __pa(pgd);

	return 0;

err_free_pgtable:
	kernel_ident_mapping_free(&info, pgd);
	return -ENOMEM;
}

/*
 * Wake up a secondary CPU through the MADT Multiprocessor Wakeup mailbox.
 *
 * @apicid:   APIC ID of the CPU to wake up
 * @start_ip: address the woken CPU should start executing from
 *
 * Return: 0 once the firmware has acknowledged the wakeup command,
 * -EOPNOTSUPP if no mailbox address is available, -ENOMEM if the mailbox
 * cannot be mapped.
 */
static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
{
	if (!acpi_mp_wake_mailbox_paddr) {
		pr_warn_once("No MADT mailbox: cannot bringup secondary CPUs. Booting with kexec?\n");
		return -EOPNOTSUPP;
	}

	/*
	 * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
	 *
	 * Wakeup of secondary CPUs is fully serialized in the core code.
	 * No need to protect acpi_mp_wake_mailbox from concurrent accesses.
	 */
	if (!acpi_mp_wake_mailbox) {
		acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
						sizeof(*acpi_mp_wake_mailbox),
						MEMREMAP_WB);
		/*
		 * memremap() returns NULL on failure. Bail out before the
		 * mailbox is dereferenced; no command has been issued yet, so
		 * failing here leaves the target CPU untouched.
		 */
		if (!acpi_mp_wake_mailbox) {
			pr_err_once("Failed to map MADT wakeup mailbox\n");
			return -ENOMEM;
		}
	}

	/*
	 * Mailbox memory is shared between the firmware and OS. Firmware will
	 * listen on mailbox command address, and once it receives the wakeup
	 * command, the CPU associated with the given apicid will be booted.
	 *
	 * The value of 'apic_id' and 'wakeup_vector' must be visible to the
	 * firmware before the wakeup command is visible.  smp_store_release()
	 * ensures ordering and visibility.
	 */
	acpi_mp_wake_mailbox->apic_id	    = apicid;
	acpi_mp_wake_mailbox->wakeup_vector = start_ip;
	smp_store_release(&acpi_mp_wake_mailbox->command,
			  ACPI_MP_WAKE_COMMAND_WAKEUP);

	/*
	 * Wait for the CPU to wake up.
	 *
	 * The CPU being woken up is essentially in a spin loop waiting to be
	 * woken up. It should not take long for it to wake up and acknowledge
	 * by zeroing out ->command.
	 *
	 * ACPI specification doesn't provide any guidance on how long kernel
	 * has to wait for a wake up acknowledgment. It also doesn't provide
	 * a way to cancel a wake up request if it takes too long.
	 *
	 * In TDX environment, the VMM has control over how long it takes to
	 * wake up secondary. It can postpone scheduling secondary vCPU
	 * indefinitely. Giving up on wake up request and reporting error opens
	 * possible attack vector for VMM: it can wake up a secondary CPU when
	 * kernel doesn't expect it. Wait until positive result of the wake up
	 * request.
	 */
	while (READ_ONCE(acpi_mp_wake_mailbox->command))
		cpu_relax();

	return 0;
}

/*
 * Disable CPU offlining and hide the wakeup mailbox from a kexec kernel.
 *
 * @mp_wake: the MADT multiprocessor wakeup structure to invalidate
 *
 * With ACPI MADT wakeup, a CPU cannot be offlined once it has been onlined.
 * For kexec this means the second kernel cannot use more than one CPU. To
 * stop a kexec kernel from attempting to online secondary CPUs, the mailbox
 * address stored in the MADT wakeup structure is zeroed so that the next
 * kernel finds no mailbox.
 *
 * The running kernel is not affected: it has already cached the mailbox
 * address, and acpi_wakeup_cpu() brings up secondary CPUs from that cached
 * value.
 *
 * Note: zeroing the address is a Linux specific convention; it is not
 *       covered by the ACPI specification.
 */
static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake)
{
	/* No CPU may go offline from now on. */
	cpu_hotplug_disable_offlining();

	/* Invalidate the mailbox for whichever kernel parses the MADT next. */
	mp_wake->mailbox_address = 0;
}

/*
 * Parse a MADT Multiprocessor Wakeup entry and switch secondary CPU wakeup
 * over to the mailbox mechanism.
 *
 * @header: the MADT subtable entry to parse
 * @end:    address of the end of the table that contains @header
 *
 * Caches the mailbox physical address, sets up the reset-vector based CPU
 * offlining support when a complete version 1 structure is present (and
 * disables offlining otherwise), then installs acpi_wakeup_cpu() as the
 * wakeup_secondary_cpu_64 APIC callback.
 *
 * Return: 0 on success, -EINVAL if the entry is missing or too short to be
 * a valid version 0 structure.
 */
int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
			      const unsigned long end)
{
	struct acpi_madt_multiproc_wakeup *mp_wake;
	unsigned long avail;

	mp_wake = (struct acpi_madt_multiproc_wakeup *)header;

	/*
	 * Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake
	 * entry.  'sizeof (struct acpi_madt_multiproc_wakeup)' can be larger
	 * than the actual size of the MP wakeup entry in ACPI table because the
	 * 'reset_vector' is only available in the V1 MP wakeup structure.
	 */
	if (!mp_wake)
		return -EINVAL;

	/* Number of bytes between the start of the entry and the table end. */
	avail = end - (unsigned long)mp_wake;
	if (avail < ACPI_MADT_MP_WAKEUP_SIZE_V0)
		return -EINVAL;
	if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0)
		return -EINVAL;

	acpi_table_print_madt_entry(&header->common);

	acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address;

	/*
	 * 'reset_vector' lies beyond the V0 part of the structure. Only read
	 * it when the entry claims to be V1 *and* the table really has room
	 * for a V1 structure; the self-declared length alone is not enough to
	 * prove the field is inside the table.
	 */
	if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 &&
	    mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1 &&
	    avail >= ACPI_MADT_MP_WAKEUP_SIZE_V1) {
		if (acpi_mp_setup_reset(mp_wake->reset_vector)) {
			pr_warn("Failed to setup MADT reset vector\n");
			acpi_mp_disable_offlining(mp_wake);
		}
	} else {
		/*
		 * CPU offlining requires version 1 of the ACPI MADT wakeup
		 * structure.
		 */
		acpi_mp_disable_offlining(mp_wake);
	}

	apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);

	return 0;
}