summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/Makefile2
-rw-r--r--arch/x86/kernel/apic/htirq.c2
-rw-r--r--arch/x86/kernel/apic/io_apic.c22
-rw-r--r--arch/x86/kernel/apic/msi.c2
-rw-r--r--arch/x86/kernel/apic/vector.c4
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_rdtgroup.c4
-rw-r--r--arch/x86/kernel/kprobes/opt.c9
-rw-r--r--arch/x86/kernel/process.c11
-rw-r--r--arch/x86/kernel/reboot.c2
-rw-r--r--arch/x86/kernel/setup.c2
-rw-r--r--arch/x86/kernel/smpboot.c2
-rw-r--r--arch/x86/kernel/tboot.c2
-rw-r--r--arch/x86/kernel/tsc.c206
14 files changed, 84 insertions, 187 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4b994232cb57..3c7c419c4e3e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -29,6 +29,7 @@ OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_test_nx.o := y
+OBJECT_FILES_NON_STANDARD_paravirt_patch_$(BITS).o := y
# If instrumentation of this dir is enabled, boot hangs during first second.
# Probably could be more selective here, but note that files related to irqs,
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 26b78d86f25a..85a9e17e0dbc 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,3 +1,5 @@
+OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y
+
obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
obj-$(CONFIG_ACPI_APEI) += apei.o
diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c
index ae50d3454d78..81ff48905623 100644
--- a/arch/x86/kernel/apic/htirq.c
+++ b/arch/x86/kernel/apic/htirq.c
@@ -150,7 +150,7 @@ static const struct irq_domain_ops htirq_domain_ops = {
.deactivate = htirq_domain_deactivate,
};
-void arch_init_htirq_domain(struct irq_domain *parent)
+void __init arch_init_htirq_domain(struct irq_domain *parent)
{
if (disable_apic)
return;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 347bb9f65737..247880fc29f9 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1200,28 +1200,6 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
static struct irq_chip ioapic_chip, ioapic_ir_chip;
-#ifdef CONFIG_X86_32
-static inline int IO_APIC_irq_trigger(int irq)
-{
- int apic, idx, pin;
-
- for_each_ioapic_pin(apic, pin) {
- idx = find_irq_entry(apic, pin, mp_INT);
- if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin, 0)))
- return irq_trigger(idx);
- }
- /*
- * nonexistent IRQs are edge default
- */
- return 0;
-}
-#else
-static inline int IO_APIC_irq_trigger(int irq)
-{
- return 1;
-}
-#endif
-
static void __init setup_IO_APIC_irqs(void)
{
unsigned int ioapic, pin;
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index c61aec7e65f4..4c5d1882bec5 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -136,7 +136,7 @@ static struct msi_domain_info pci_msi_domain_info = {
.handler_name = "edge",
};
-void arch_init_msi_domain(struct irq_domain *parent)
+void __init arch_init_msi_domain(struct irq_domain *parent)
{
if (disable_apic)
return;
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index f3557a1eb562..e66d8e48e456 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -405,7 +405,7 @@ int __init arch_probe_nr_irqs(void)
}
#ifdef CONFIG_X86_IO_APIC
-static void init_legacy_irqs(void)
+static void __init init_legacy_irqs(void)
{
int i, node = cpu_to_node(0);
struct apic_chip_data *data;
@@ -424,7 +424,7 @@ static void init_legacy_irqs(void)
}
}
#else
-static void init_legacy_irqs(void) { }
+static inline void init_legacy_irqs(void) { }
#endif
int __init arch_early_irq_init(void)
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index f5af0cc7eb0d..9257bd9dc664 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -856,11 +856,13 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
dentry = kernfs_mount(fs_type, flags, rdt_root,
RDTGROUP_SUPER_MAGIC, NULL);
if (IS_ERR(dentry))
- goto out_cdp;
+ goto out_destroy;
static_branch_enable(&rdt_enable_key);
goto out;
+out_destroy:
+ kernfs_remove(kn_info);
out_cdp:
cdp_disable();
out:
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 901c640d152f..69ea0bc1cfa3 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -28,6 +28,7 @@
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
#include <linux/ftrace.h>
+#include <linux/frame.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
@@ -94,6 +95,7 @@ static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
}
asm (
+ "optprobe_template_func:\n"
".global optprobe_template_entry\n"
"optprobe_template_entry:\n"
#ifdef CONFIG_X86_64
@@ -131,7 +133,12 @@ asm (
" popf\n"
#endif
".global optprobe_template_end\n"
- "optprobe_template_end:\n");
+ "optprobe_template_end:\n"
+ ".type optprobe_template_func, @function\n"
+ ".size optprobe_template_func, .-optprobe_template_func\n");
+
+void optprobe_template_func(void);
+STACK_FRAME_NON_STANDARD(optprobe_template_func);
#define TMPL_MOVE_IDX \
((long)&optprobe_template_val - (long)&optprobe_template_entry)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0bb88428cbf2..3ca198080ea9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -545,17 +545,6 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
}
/*
- * Return saved PC of a blocked thread.
- * What is this good for? it will be always the scheduler or ret_from_fork.
- */
-unsigned long thread_saved_pc(struct task_struct *tsk)
-{
- struct inactive_task_frame *frame =
- (struct inactive_task_frame *) READ_ONCE(tsk->thread.sp);
- return READ_ONCE_NOCHECK(frame->ret_addr);
-}
-
-/*
* Called from fs/proc with a reference on @p to find the function
* which called into schedule(). This needs to be done carefully
* because the task might wake up and we might look at a stack
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 2544700a2a87..67393fc88353 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -9,6 +9,7 @@
#include <linux/sched.h>
#include <linux/tboot.h>
#include <linux/delay.h>
+#include <linux/frame.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
@@ -123,6 +124,7 @@ void __noreturn machine_real_restart(unsigned int type)
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(machine_real_restart);
#endif
+STACK_FRAME_NON_STANDARD(machine_real_restart);
/*
* Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f81823695014..65622f07e633 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -503,7 +503,7 @@ static int __init reserve_crashkernel_low(void)
return 0;
}
- low_base = memblock_find_in_range(low_size, 1ULL << 32, low_size, CRASH_ALIGN);
+ low_base = memblock_find_in_range(0, 1ULL << 32, low_size, CRASH_ALIGN);
if (!low_base) {
pr_err("Cannot reserve %ldMB crashkernel low memory, please try smaller size.\n",
(unsigned long)(low_size >> 20));
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f04479a8f74f..045e4f993bd2 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -863,7 +863,7 @@ static void announce_cpu(int cpu, int apicid)
if (cpu == 1)
printk(KERN_INFO "x86: Booting SMP configuration:\n");
- if (system_state == SYSTEM_BOOTING) {
+ if (system_state < SYSTEM_RUNNING) {
if (node != current_node) {
if (current_node > (-1))
pr_cont("\n");
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 4b1724059909..a4eb27918ceb 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -514,7 +514,7 @@ int tboot_force_iommu(void)
if (!tboot_enabled())
return 0;
- if (!intel_iommu_tboot_noforce)
+ if (intel_iommu_tboot_noforce)
return 1;
if (no_iommu || swiotlb || dmar_disabled)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 714dfba6a1e7..5270fc0c2df6 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -51,115 +51,34 @@ static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
struct clocksource *art_related_clocksource;
-/*
- * Use a ring-buffer like data structure, where a writer advances the head by
- * writing a new data entry and a reader advances the tail when it observes a
- * new entry.
- *
- * Writers are made to wait on readers until there's space to write a new
- * entry.
- *
- * This means that we can always use an {offset, mul} pair to compute a ns
- * value that is 'roughly' in the right direction, even if we're writing a new
- * {offset, mul} pair during the clock read.
- *
- * The down-side is that we can no longer guarantee strict monotonicity anymore
- * (assuming the TSC was that to begin with), because while we compute the
- * intersection point of the two clock slopes and make sure the time is
- * continuous at the point of switching; we can no longer guarantee a reader is
- * strictly before or after the switch point.
- *
- * It does mean a reader no longer needs to disable IRQs in order to avoid
- * CPU-Freq updates messing with his times, and similarly an NMI reader will
- * no longer run the risk of hitting half-written state.
- */
-
struct cyc2ns {
- struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */
- struct cyc2ns_data *head; /* 48 + 8 = 56 */
- struct cyc2ns_data *tail; /* 56 + 8 = 64 */
-}; /* exactly fits one cacheline */
-
-static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
-
-struct cyc2ns_data *cyc2ns_read_begin(void)
-{
- struct cyc2ns_data *head;
-
- preempt_disable();
-
- head = this_cpu_read(cyc2ns.head);
- /*
- * Ensure we observe the entry when we observe the pointer to it.
- * matches the wmb from cyc2ns_write_end().
- */
- smp_read_barrier_depends();
- head->__count++;
- barrier();
+ struct cyc2ns_data data[2]; /* 0 + 2*16 = 32 */
+ seqcount_t seq; /* 32 + 4 = 36 */
- return head;
-}
+}; /* fits one cacheline */
-void cyc2ns_read_end(struct cyc2ns_data *head)
-{
- barrier();
- /*
- * If we're the outer most nested read; update the tail pointer
- * when we're done. This notifies possible pending writers
- * that we've observed the head pointer and that the other
- * entry is now free.
- */
- if (!--head->__count) {
- /*
- * x86-TSO does not reorder writes with older reads;
- * therefore once this write becomes visible to another
- * cpu, we must be finished reading the cyc2ns_data.
- *
- * matches with cyc2ns_write_begin().
- */
- this_cpu_write(cyc2ns.tail, head);
- }
- preempt_enable();
-}
+static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
-/*
- * Begin writing a new @data entry for @cpu.
- *
- * Assumes some sort of write side lock; currently 'provided' by the assumption
- * that cpufreq will call its notifiers sequentially.
- */
-static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
+void cyc2ns_read_begin(struct cyc2ns_data *data)
{
- struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
- struct cyc2ns_data *data = c2n->data;
+ int seq, idx;
- if (data == c2n->head)
- data++;
+ preempt_disable_notrace();
- /* XXX send an IPI to @cpu in order to guarantee a read? */
+ do {
+ seq = this_cpu_read(cyc2ns.seq.sequence);
+ idx = seq & 1;
- /*
- * When we observe the tail write from cyc2ns_read_end(),
- * the cpu must be done with that entry and its safe
- * to start writing to it.
- */
- while (c2n->tail == data)
- cpu_relax();
+ data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
+ data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
+ data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
- return data;
+ } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
}
-static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
+void cyc2ns_read_end(void)
{
- struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
-
- /*
- * Ensure the @data writes are visible before we publish the
- * entry. Matches the data-depencency in cyc2ns_read_begin().
- */
- smp_wmb();
-
- ACCESS_ONCE(c2n->head) = data;
+ preempt_enable_notrace();
}
/*
@@ -191,7 +110,6 @@ static void cyc2ns_data_init(struct cyc2ns_data *data)
data->cyc2ns_mul = 0;
data->cyc2ns_shift = 0;
data->cyc2ns_offset = 0;
- data->__count = 0;
}
static void cyc2ns_init(int cpu)
@@ -201,51 +119,29 @@ static void cyc2ns_init(int cpu)
cyc2ns_data_init(&c2n->data[0]);
cyc2ns_data_init(&c2n->data[1]);
- c2n->head = c2n->data;
- c2n->tail = c2n->data;
+ seqcount_init(&c2n->seq);
}
static inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
- struct cyc2ns_data *data, *tail;
+ struct cyc2ns_data data;
unsigned long long ns;
- /*
- * See cyc2ns_read_*() for details; replicated in order to avoid
- * an extra few instructions that came with the abstraction.
- * Notable, it allows us to only do the __count and tail update
- * dance when its actually needed.
- */
-
- preempt_disable_notrace();
- data = this_cpu_read(cyc2ns.head);
- tail = this_cpu_read(cyc2ns.tail);
-
- if (likely(data == tail)) {
- ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
- } else {
- data->__count++;
-
- barrier();
-
- ns = data->cyc2ns_offset;
- ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
+ cyc2ns_read_begin(&data);
- barrier();
+ ns = data.cyc2ns_offset;
+ ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
- if (!--data->__count)
- this_cpu_write(cyc2ns.tail, data);
- }
- preempt_enable_notrace();
+ cyc2ns_read_end();
return ns;
}
-static void set_cyc2ns_scale(unsigned long khz, int cpu)
+static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now)
{
- unsigned long long tsc_now, ns_now;
- struct cyc2ns_data *data;
+ unsigned long long ns_now;
+ struct cyc2ns_data data;
+ struct cyc2ns *c2n;
unsigned long flags;
local_irq_save(flags);
@@ -254,9 +150,6 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
if (!khz)
goto done;
- data = cyc2ns_write_begin(cpu);
-
- tsc_now = rdtsc();
ns_now = cycles_2_ns(tsc_now);
/*
@@ -264,7 +157,7 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
* time function is continuous; see the comment near struct
* cyc2ns_data.
*/
- clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz,
+ clocks_calc_mult_shift(&data.cyc2ns_mul, &data.cyc2ns_shift, khz,
NSEC_PER_MSEC, 0);
/*
@@ -273,20 +166,26 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu)
* conversion algorithm shifting a 32-bit value (now specifies a 64-bit
* value) - refer perf_event_mmap_page documentation in perf_event.h.
*/
- if (data->cyc2ns_shift == 32) {
- data->cyc2ns_shift = 31;
- data->cyc2ns_mul >>= 1;
+ if (data.cyc2ns_shift == 32) {
+ data.cyc2ns_shift = 31;
+ data.cyc2ns_mul >>= 1;
}
- data->cyc2ns_offset = ns_now -
- mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, data->cyc2ns_shift);
+ data.cyc2ns_offset = ns_now -
+ mul_u64_u32_shr(tsc_now, data.cyc2ns_mul, data.cyc2ns_shift);
+
+ c2n = per_cpu_ptr(&cyc2ns, cpu);
- cyc2ns_write_end(cpu, data);
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[0] = data;
+ raw_write_seqcount_latch(&c2n->seq);
+ c2n->data[1] = data;
done:
- sched_clock_idle_wakeup_event(0);
+ sched_clock_idle_wakeup_event();
local_irq_restore(flags);
}
+
/*
* Scheduler clock - returns current time in nanosec units.
*/
@@ -374,6 +273,8 @@ static int __init tsc_setup(char *str)
tsc_clocksource_reliable = 1;
if (!strncmp(str, "noirqtime", 9))
no_sched_irq_time = 1;
+ if (!strcmp(str, "unstable"))
+ mark_tsc_unstable("boot parameter");
return 1;
}
@@ -986,7 +887,6 @@ void tsc_restore_sched_clock_state(void)
}
#ifdef CONFIG_CPU_FREQ
-
/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
* changes.
*
@@ -1027,7 +927,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
mark_tsc_unstable("cpufreq changes");
- set_cyc2ns_scale(tsc_khz, freq->cpu);
+ set_cyc2ns_scale(tsc_khz, freq->cpu, rdtsc());
}
return 0;
@@ -1127,6 +1027,15 @@ static void tsc_cs_mark_unstable(struct clocksource *cs)
pr_info("Marking TSC unstable due to clocksource watchdog\n");
}
+static void tsc_cs_tick_stable(struct clocksource *cs)
+{
+ if (tsc_unstable)
+ return;
+
+ if (using_native_sched_clock())
+ sched_clock_tick_stable();
+}
+
/*
* .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
*/
@@ -1140,6 +1049,7 @@ static struct clocksource clocksource_tsc = {
.archdata = { .vclock_mode = VCLOCK_TSC },
.resume = tsc_resume,
.mark_unstable = tsc_cs_mark_unstable,
+ .tick_stable = tsc_cs_tick_stable,
};
void mark_tsc_unstable(char *reason)
@@ -1255,6 +1165,7 @@ static void tsc_refine_calibration_work(struct work_struct *work)
static int hpet;
u64 tsc_stop, ref_stop, delta;
unsigned long freq;
+ int cpu;
/* Don't bother refining TSC on unstable systems */
if (check_tsc_unstable())
@@ -1305,6 +1216,10 @@ static void tsc_refine_calibration_work(struct work_struct *work)
/* Inform the TSC deadline clockevent devices about the recalibration */
lapic_update_tsc_freq();
+ /* Update the sched_clock() rate to match the clocksource one */
+ for_each_possible_cpu(cpu)
+ set_cyc2ns_scale(tsc_khz, cpu, tsc_stop);
+
out:
if (boot_cpu_has(X86_FEATURE_ART))
art_related_clocksource = &clocksource_tsc;
@@ -1350,7 +1265,7 @@ device_initcall(init_tsc_clocksource);
void __init tsc_init(void)
{
- u64 lpj;
+ u64 lpj, cyc;
int cpu;
if (!boot_cpu_has(X86_FEATURE_TSC)) {
@@ -1390,9 +1305,10 @@ void __init tsc_init(void)
* speed as the bootup CPU. (cpufreq notifiers will fix this
* up if their speed diverges)
*/
+ cyc = rdtsc();
for_each_possible_cpu(cpu) {
cyc2ns_init(cpu);
- set_cyc2ns_scale(tsc_khz, cpu);
+ set_cyc2ns_scale(tsc_khz, cpu, cyc);
}
if (tsc_disabled > 0)