summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/boot/memory.c2
-rw-r--r--arch/x86/include/asm/amd_iommu_proto.h12
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h28
-rw-r--r--arch/x86/include/asm/gart.h24
-rw-r--r--arch/x86/include/asm/i387.h2
-rw-r--r--arch/x86/include/asm/io_apic.h2
-rw-r--r--arch/x86/include/asm/msr-index.h4
-rw-r--r--arch/x86/include/asm/numa.h2
-rw-r--r--arch/x86/kernel/amd_iommu.c519
-rw-r--r--arch/x86/kernel/amd_iommu_init.c39
-rw-r--r--arch/x86/kernel/aperture_64.c2
-rw-r--r--arch/x86/kernel/apic/io_apic.c10
-rw-r--r--arch/x86/kernel/apm_32.c5
-rw-r--r--arch/x86/kernel/cpu/amd.c21
-rw-r--r--arch/x86/kernel/cpu/perf_event.c22
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c22
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c125
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c19
-rw-r--r--arch/x86/kernel/devicetree.c2
-rw-r--r--arch/x86/kernel/pci-gart_64.c9
-rw-r--r--arch/x86/kernel/pci-iommu_table.c18
-rw-r--r--arch/x86/kernel/ptrace.c36
-rw-r--r--arch/x86/kernel/reboot_32.S12
-rw-r--r--arch/x86/kernel/setup.c5
-rw-r--r--arch/x86/kvm/x86.c37
-rw-r--r--arch/x86/mm/numa.c31
-rw-r--r--arch/x86/mm/numa_64.c2
-rw-r--r--arch/x86/mm/numa_emulation.c20
-rw-r--r--arch/x86/mm/srat_32.c4
-rw-r--r--arch/x86/platform/ce4100/falconfalls.dts8
-rw-r--r--arch/x86/platform/mrst/mrst.c10
-rw-r--r--arch/x86/platform/mrst/vrtc.c4
-rw-r--r--arch/x86/platform/visws/visws_quirks.c20
-rw-r--r--arch/x86/xen/Kconfig1
-rw-r--r--arch/x86/xen/enlighten.c21
-rw-r--r--arch/x86/xen/mmu.c142
-rw-r--r--arch/x86/xen/setup.c2
38 files changed, 819 insertions, 426 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc6c53a95bfd..8cc29da2d689 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -690,6 +690,7 @@ config AMD_IOMMU
bool "AMD IOMMU support"
select SWIOTLB
select PCI_MSI
+ select PCI_IOV
depends on X86_64 && PCI && ACPI
---help---
With this option you can enable support for AMD IOMMU hardware in
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index cae3feb1035e..db75d07c3645 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -91,7 +91,7 @@ static int detect_memory_e801(void)
if (oreg.ax > 15*1024) {
return -1; /* Bogus! */
} else if (oreg.ax == 15*1024) {
- boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax;
+ boot_params.alt_mem_k = (oreg.bx << 6) + oreg.ax;
} else {
/*
* This ignores memory above 16MB if we have a memory
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
index 916bc8111a01..a4ae6c3875eb 100644
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -19,13 +19,11 @@
#ifndef _ASM_X86_AMD_IOMMU_PROTO_H
#define _ASM_X86_AMD_IOMMU_PROTO_H
-struct amd_iommu;
+#include <asm/amd_iommu_types.h>
extern int amd_iommu_init_dma_ops(void);
extern int amd_iommu_init_passthrough(void);
extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
-extern void amd_iommu_flush_all_domains(void);
-extern void amd_iommu_flush_all_devices(void);
extern void amd_iommu_apply_erratum_63(u16 devid);
extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
extern int amd_iommu_init_devices(void);
@@ -44,4 +42,12 @@ static inline bool is_rd890_iommu(struct pci_dev *pdev)
(pdev->device == PCI_DEVICE_ID_RD890_IOMMU);
}
+static inline bool iommu_feature(struct amd_iommu *iommu, u64 f)
+{
+ if (!(iommu->cap & (1 << IOMMU_CAP_EFR)))
+ return false;
+
+ return !!(iommu->features & f);
+}
+
#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index e3509fc303bf..4c9982995414 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -68,12 +68,25 @@
#define MMIO_CONTROL_OFFSET 0x0018
#define MMIO_EXCL_BASE_OFFSET 0x0020
#define MMIO_EXCL_LIMIT_OFFSET 0x0028
+#define MMIO_EXT_FEATURES 0x0030
#define MMIO_CMD_HEAD_OFFSET 0x2000
#define MMIO_CMD_TAIL_OFFSET 0x2008
#define MMIO_EVT_HEAD_OFFSET 0x2010
#define MMIO_EVT_TAIL_OFFSET 0x2018
#define MMIO_STATUS_OFFSET 0x2020
+
+/* Extended Feature Bits */
+#define FEATURE_PREFETCH (1ULL<<0)
+#define FEATURE_PPR (1ULL<<1)
+#define FEATURE_X2APIC (1ULL<<2)
+#define FEATURE_NX (1ULL<<3)
+#define FEATURE_GT (1ULL<<4)
+#define FEATURE_IA (1ULL<<6)
+#define FEATURE_GA (1ULL<<7)
+#define FEATURE_HE (1ULL<<8)
+#define FEATURE_PC (1ULL<<9)
+
/* MMIO status bits */
#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04
@@ -113,7 +126,9 @@
/* command specific defines */
#define CMD_COMPL_WAIT 0x01
#define CMD_INV_DEV_ENTRY 0x02
-#define CMD_INV_IOMMU_PAGES 0x03
+#define CMD_INV_IOMMU_PAGES 0x03
+#define CMD_INV_IOTLB_PAGES 0x04
+#define CMD_INV_ALL 0x08
#define CMD_COMPL_WAIT_STORE_MASK 0x01
#define CMD_COMPL_WAIT_INT_MASK 0x02
@@ -215,6 +230,8 @@
#define IOMMU_PTE_IR (1ULL << 61)
#define IOMMU_PTE_IW (1ULL << 62)
+#define DTE_FLAG_IOTLB 0x01
+
#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
@@ -227,6 +244,7 @@
/* IOMMU capabilities */
#define IOMMU_CAP_IOTLB 24
#define IOMMU_CAP_NPCACHE 26
+#define IOMMU_CAP_EFR 27
#define MAX_DOMAIN_ID 65536
@@ -249,6 +267,8 @@ extern bool amd_iommu_dump;
/* global flag if IOMMUs cache non-present entries */
extern bool amd_iommu_np_cache;
+/* Only true if all IOMMUs support device IOTLBs */
+extern bool amd_iommu_iotlb_sup;
/*
* Make iterating over all IOMMUs easier
@@ -371,6 +391,9 @@ struct amd_iommu {
/* flags read from acpi table */
u8 acpi_flags;
+ /* Extended features */
+ u64 features;
+
/*
* Capability pointer. There could be more than one IOMMU per PCI
* device function if there are more than one AMD IOMMU capability
@@ -409,9 +432,6 @@ struct amd_iommu {
/* if one, we need to send a completion wait command */
bool need_sync;
- /* becomes true if a command buffer reset is running */
- bool reset_in_progress;
-
/* default dma_ops domain for that IOMMU */
struct dma_ops_domain *default_dom;
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 43085bfc99c3..156cd5d18d2a 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -66,7 +66,7 @@ static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order)
* Don't enable translation but enable GART IO and CPU accesses.
* Also, set DISTLBWALKPRB since GART tables memory is UC.
*/
- ctl = DISTLBWALKPRB | order << 1;
+ ctl = order << 1;
pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
}
@@ -75,17 +75,17 @@ static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
{
u32 tmp, ctl;
- /* address of the mappings table */
- addr >>= 12;
- tmp = (u32) addr<<4;
- tmp &= ~0xf;
- pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp);
-
- /* Enable GART translation for this hammer. */
- pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
- ctl |= GARTEN;
- ctl &= ~(DISGARTCPU | DISGARTIO);
- pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
+ /* address of the mappings table */
+ addr >>= 12;
+ tmp = (u32) addr<<4;
+ tmp &= ~0xf;
+ pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp);
+
+ /* Enable GART translation for this hammer. */
+ pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
+ ctl |= GARTEN | DISTLBWALKPRB;
+ ctl &= ~(DISGARTCPU | DISGARTIO);
+ pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
}
static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index ef328901c802..c9e09ea05644 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -237,7 +237,7 @@ static inline void fpu_save_init(struct fpu *fpu)
} else if (use_fxsr()) {
fpu_fxsave(fpu);
} else {
- asm volatile("fsave %[fx]; fwait"
+ asm volatile("fnsave %[fx]; fwait"
: [fx] "=m" (fpu->state->fsave));
return;
}
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index c4bd267dfc50..a97a240f67f3 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -150,7 +150,7 @@ void setup_IO_APIC_irq_extra(u32 gsi);
extern void ioapic_and_gsi_init(void);
extern void ioapic_insert_resources(void);
-int io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr);
+int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr);
extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index fd5a1f365c95..3cce71413d0b 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -96,11 +96,15 @@
#define MSR_IA32_MC0_ADDR 0x00000402
#define MSR_IA32_MC0_MISC 0x00000403
+#define MSR_AMD64_MC0_MASK 0xc0010044
+
#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x))
#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x))
#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x))
#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x))
+#define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x))
+
/* These are consecutive and not in the normal 4er MCE bank block */
#define MSR_IA32_MC0_CTL2 0x00000280
#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x))
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 3d4dab43c994..a50fc9f493b3 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -51,7 +51,7 @@ static inline void numa_remove_cpu(int cpu) { }
#endif /* CONFIG_NUMA */
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable);
+void debug_cpumask_set_cpu(int cpu, int node, bool enable);
#endif
#endif /* _ASM_X86_NUMA_H */
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 57ca77787220..dc5dddafe5c2 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -18,6 +18,7 @@
*/
#include <linux/pci.h>
+#include <linux/pci-ats.h>
#include <linux/bitmap.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
@@ -25,6 +26,7 @@
#include <linux/dma-mapping.h>
#include <linux/iommu-helper.h>
#include <linux/iommu.h>
+#include <linux/delay.h>
#include <asm/proto.h>
#include <asm/iommu.h>
#include <asm/gart.h>
@@ -34,7 +36,7 @@
#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
-#define EXIT_LOOP_COUNT 10000000
+#define LOOP_TIMEOUT 100000
static DEFINE_RWLOCK(amd_iommu_devtable_lock);
@@ -57,7 +59,6 @@ struct iommu_cmd {
u32 data[4];
};
-static void reset_iommu_command_buffer(struct amd_iommu *iommu);
static void update_domain(struct protection_domain *domain);
/****************************************************************************
@@ -322,8 +323,6 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
break;
case EVENT_TYPE_ILL_CMD:
printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
- iommu->reset_in_progress = true;
- reset_iommu_command_buffer(iommu);
dump_command(address);
break;
case EVENT_TYPE_CMD_HARD_ERR:
@@ -383,186 +382,289 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
*
****************************************************************************/
-/*
- * Writes the command to the IOMMUs command buffer and informs the
- * hardware about the new command. Must be called with iommu->lock held.
- */
-static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
+static int wait_on_sem(volatile u64 *sem)
+{
+ int i = 0;
+
+ while (*sem == 0 && i < LOOP_TIMEOUT) {
+ udelay(1);
+ i += 1;
+ }
+
+ if (i == LOOP_TIMEOUT) {
+ pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static void copy_cmd_to_buffer(struct amd_iommu *iommu,
+ struct iommu_cmd *cmd,
+ u32 tail)
{
- u32 tail, head;
u8 *target;
- WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
- tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
target = iommu->cmd_buf + tail;
- memcpy_toio(target, cmd, sizeof(*cmd));
- tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
- head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
- if (tail == head)
- return -ENOMEM;
+ tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
+
+ /* Copy command to buffer */
+ memcpy(target, cmd, sizeof(*cmd));
+
+ /* Tell the IOMMU about it */
writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+}
- return 0;
+static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
+{
+ WARN_ON(address & 0x7ULL);
+
+ memset(cmd, 0, sizeof(*cmd));
+ cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
+ cmd->data[1] = upper_32_bits(__pa(address));
+ cmd->data[2] = 1;
+ CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
+}
+
+static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
+{
+ memset(cmd, 0, sizeof(*cmd));
+ cmd->data[0] = devid;
+ CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
+}
+
+static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
+ size_t size, u16 domid, int pde)
+{
+ u64 pages;
+ int s;
+
+ pages = iommu_num_pages(address, size, PAGE_SIZE);
+ s = 0;
+
+ if (pages > 1) {
+ /*
+ * If we have to flush more than one page, flush all
+ * TLB entries for this domain
+ */
+ address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+ s = 1;
+ }
+
+ address &= PAGE_MASK;
+
+ memset(cmd, 0, sizeof(*cmd));
+ cmd->data[1] |= domid;
+ cmd->data[2] = lower_32_bits(address);
+ cmd->data[3] = upper_32_bits(address);
+ CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
+ if (s) /* size bit - we flush more than one 4kb page */
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+ if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+}
+
+static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
+ u64 address, size_t size)
+{
+ u64 pages;
+ int s;
+
+ pages = iommu_num_pages(address, size, PAGE_SIZE);
+ s = 0;
+
+ if (pages > 1) {
+ /*
+ * If we have to flush more than one page, flush all
+ * TLB entries for this domain
+ */
+ address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+ s = 1;
+ }
+
+ address &= PAGE_MASK;
+
+ memset(cmd, 0, sizeof(*cmd));
+ cmd->data[0] = devid;
+ cmd->data[0] |= (qdep & 0xff) << 24;
+ cmd->data[1] = devid;
+ cmd->data[2] = lower_32_bits(address);
+ cmd->data[3] = upper_32_bits(address);
+ CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
+ if (s)
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+}
+
+static void build_inv_all(struct iommu_cmd *cmd)
+{
+ memset(cmd, 0, sizeof(*cmd));
+ CMD_SET_TYPE(cmd, CMD_INV_ALL);
}
/*
- * General queuing function for commands. Takes iommu->lock and calls
- * __iommu_queue_command().
+ * Writes the command to the IOMMUs command buffer and informs the
+ * hardware about the new command.
*/
static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
{
+ u32 left, tail, head, next_tail;
unsigned long flags;
- int ret;
+ WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
+
+again:
spin_lock_irqsave(&iommu->lock, flags);
- ret = __iommu_queue_command(iommu, cmd);
- if (!ret)
- iommu->need_sync = true;
- spin_unlock_irqrestore(&iommu->lock, flags);
- return ret;
-}
+ head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
+ tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
+ next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
+ left = (head - next_tail) % iommu->cmd_buf_size;
-/*
- * This function waits until an IOMMU has completed a completion
- * wait command
- */
-static void __iommu_wait_for_completion(struct amd_iommu *iommu)
-{
- int ready = 0;
- unsigned status = 0;
- unsigned long i = 0;
+ if (left <= 2) {
+ struct iommu_cmd sync_cmd;
+ volatile u64 sem = 0;
+ int ret;
- INC_STATS_COUNTER(compl_wait);
+ build_completion_wait(&sync_cmd, (u64)&sem);
+ copy_cmd_to_buffer(iommu, &sync_cmd, tail);
- while (!ready && (i < EXIT_LOOP_COUNT)) {
- ++i;
- /* wait for the bit to become one */
- status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
- ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
+ spin_unlock_irqrestore(&iommu->lock, flags);
+
+ if ((ret = wait_on_sem(&sem)) != 0)
+ return ret;
+
+ goto again;
}
- /* set bit back to zero */
- status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
- writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
+ copy_cmd_to_buffer(iommu, cmd, tail);
+
+ /* We need to sync now to make sure all commands are processed */
+ iommu->need_sync = true;
+
+ spin_unlock_irqrestore(&iommu->lock, flags);
- if (unlikely(i == EXIT_LOOP_COUNT))
- iommu->reset_in_progress = true;
+ return 0;
}
/*
* This function queues a completion wait command into the command
* buffer of an IOMMU
*/
-static int __iommu_completion_wait(struct amd_iommu *iommu)
+static int iommu_completion_wait(struct amd_iommu *iommu)
{
struct iommu_cmd cmd;
+ volatile u64 sem = 0;
+ int ret;
- memset(&cmd, 0, sizeof(cmd));
- cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
- CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
+ if (!iommu->need_sync)
+ return 0;
+
+ build_completion_wait(&cmd, (u64)&sem);
+
+ ret = iommu_queue_command(iommu, &cmd);
+ if (ret)
+ return ret;
- return __iommu_queue_command(iommu, &cmd);
+ return wait_on_sem(&sem);
}
-/*
- * This function is called whenever we need to ensure that the IOMMU has
- * completed execution of all commands we sent. It sends a
- * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
- * us about that by writing a value to a physical address we pass with
- * the command.
- */
-static int iommu_completion_wait(struct amd_iommu *iommu)
+static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
{
- int ret = 0;
- unsigned long flags;
+ struct iommu_cmd cmd;
- spin_lock_irqsave(&iommu->lock, flags);
+ build_inv_dte(&cmd, devid);
- if (!iommu->need_sync)
- goto out;
+ return iommu_queue_command(iommu, &cmd);
+}
- ret = __iommu_completion_wait(iommu);
+static void iommu_flush_dte_all(struct amd_iommu *iommu)
+{
+ u32 devid;
- iommu->need_sync = false;
+ for (devid = 0; devid <= 0xffff; ++devid)
+ iommu_flush_dte(iommu, devid);
- if (ret)
- goto out;
-
- __iommu_wait_for_completion(iommu);
+ iommu_completion_wait(iommu);
+}
-out:
- spin_unlock_irqrestore(&iommu->lock, flags);
+/*
+ * This function uses heavy locking and may disable irqs for some time. But
+ * this is no issue because it is only called during resume.
+ */
+static void iommu_flush_tlb_all(struct amd_iommu *iommu)
+{
+ u32 dom_id;
- if (iommu->reset_in_progress)
- reset_iommu_command_buffer(iommu);
+ for (dom_id = 0; dom_id <= 0xffff; ++dom_id) {
+ struct iommu_cmd cmd;
+ build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
+ dom_id, 1);
+ iommu_queue_command(iommu, &cmd);
+ }
- return 0;
+ iommu_completion_wait(iommu);
}
-static void iommu_flush_complete(struct protection_domain *domain)
+static void iommu_flush_all(struct amd_iommu *iommu)
{
- int i;
+ struct iommu_cmd cmd;
- for (i = 0; i < amd_iommus_present; ++i) {
- if (!domain->dev_iommu[i])
- continue;
+ build_inv_all(&cmd);
- /*
- * Devices of this domain are behind this IOMMU
- * We need to wait for completion of all commands.
- */
- iommu_completion_wait(amd_iommus[i]);
+ iommu_queue_command(iommu, &cmd);
+ iommu_completion_wait(iommu);
+}
+
+void iommu_flush_all_caches(struct amd_iommu *iommu)
+{
+ if (iommu_feature(iommu, FEATURE_IA)) {
+ iommu_flush_all(iommu);
+ } else {
+ iommu_flush_dte_all(iommu);
+ iommu_flush_tlb_all(iommu);
}
}
/*
- * Command send function for invalidating a device table entry
+ * Command send function for flushing on-device TLB
*/
-static int iommu_flush_device(struct device *dev)
+static int device_flush_iotlb(struct device *dev, u64 address, size_t size)
{
+ struct pci_dev *pdev = to_pci_dev(dev);
struct amd_iommu *iommu;
struct iommu_cmd cmd;
u16 devid;
+ int qdep;
+ qdep = pci_ats_queue_depth(pdev);
devid = get_device_id(dev);
iommu = amd_iommu_rlookup_table[devid];
- /* Build command */
- memset(&cmd, 0, sizeof(cmd));
- CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
- cmd.data[0] = devid;
+ build_inv_iotlb_pages(&cmd, devid, qdep, address, size);
return iommu_queue_command(iommu, &cmd);
}
-static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
- u16 domid, int pde, int s)
-{
- memset(cmd, 0, sizeof(*cmd));
- address &= PAGE_MASK;
- CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
- cmd->data[1] |= domid;
- cmd->data[2] = lower_32_bits(address);
- cmd->data[3] = upper_32_bits(address);
- if (s) /* size bit - we flush more than one 4kb page */
- cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
- if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
- cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
-}
-
/*
- * Generic command send function for invalidaing TLB entries
+ * Command send function for invalidating a device table entry
*/
-static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
- u64 address, u16 domid, int pde, int s)
+static int device_flush_dte(struct device *dev)
{
- struct iommu_cmd cmd;
+ struct amd_iommu *iommu;
+ struct pci_dev *pdev;
+ u16 devid;
int ret;
- __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s);
+ pdev = to_pci_dev(dev);
+ devid = get_device_id(dev);
+ iommu = amd_iommu_rlookup_table[devid];
- ret = iommu_queue_command(iommu, &cmd);
+ ret = iommu_flush_dte(iommu, devid);
+ if (ret)
+ return ret;
+
+ if (pci_ats_enabled(pdev))
+ ret = device_flush_iotlb(dev, 0, ~0UL);
return ret;
}
@@ -572,23 +674,14 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
* It invalidates a single PTE if the range to flush is within a single
* page. Otherwise it flushes the whole TLB of the IOMMU.
*/
-static void __iommu_flush_pages(struct protection_domain *domain,
- u64 address, size_t size, int pde)
+static void __domain_flush_pages(struct protection_domain *domain,
+ u64 address, size_t size, int pde)
{
- int s = 0, i;
- unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE);
-
- address &= PAGE_MASK;
-
- if (pages > 1) {
- /*
- * If we have to flush more than one page, flush all
- * TLB entries for this domain
- */
- address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
- s = 1;
- }
+ struct iommu_dev_data *dev_data;
+ struct iommu_cmd cmd;
+ int ret = 0, i;
+ build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
for (i = 0; i < amd_iommus_present; ++i) {
if (!domain->dev_iommu[i])
@@ -598,101 +691,70 @@ static void __iommu_flush_pages(struct protection_domain *domain,
* Devices of this domain are behind this IOMMU
* We need a TLB flush
*/
- iommu_queue_inv_iommu_pages(amd_iommus[i], address,
- domain->id, pde, s);
+ ret |= iommu_queue_command(amd_iommus[i], &cmd);
}
- return;
+ list_for_each_entry(dev_data, &domain->dev_list, list) {
+ struct pci_dev *pdev = to_pci_dev(dev_data->dev);
+
+ if (!pci_ats_enabled(pdev))
+ continue;
+
+ ret |= device_flush_iotlb(dev_data->dev, address, size);
+ }
+
+ WARN_ON(ret);
}
-static void iommu_flush_pages(struct protection_domain *domain,
- u64 address, size_t size)
+static void domain_flush_pages(struct protection_domain *domain,
+ u64 address, size_t size)
{
- __iommu_flush_pages(domain, address, size, 0);
+ __domain_flush_pages(domain, address, size, 0);
}
/* Flush the whole IO/TLB for a given protection domain */
-static void iommu_flush_tlb(struct protection_domain *domain)
+static void domain_flush_tlb(struct protection_domain *domain)
{
- __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
+ __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
}
/* Flush the whole IO/TLB for a given protection domain - including PDE */
-static void iommu_flush_tlb_pde(struct protection_domain *domain)
+static void domain_flush_tlb_pde(struct protection_domain *domain)
{
- __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
+ __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
}
-
-/*
- * This function flushes the DTEs for all devices in domain
- */
-static void iommu_flush_domain_devices(struct protection_domain *domain)
+static void domain_flush_complete(struct protection_domain *domain)
{
- struct iommu_dev_data *dev_data;
- unsigned long flags;
-
- spin_lock_irqsave(&domain->lock, flags);
-
- list_for_each_entry(dev_data, &domain->dev_list, list)
- iommu_flush_device(dev_data->dev);
-
- spin_unlock_irqrestore(&domain->lock, flags);
-}
-
-static void iommu_flush_all_domain_devices(void)
-{
- struct protection_domain *domain;
- unsigned long flags;
+ int i;
- spin_lock_irqsave(&amd_iommu_pd_lock, flags);
+ for (i = 0; i < amd_iommus_present; ++i) {
+ if (!domain->dev_iommu[i])
+ continue;
- list_for_each_entry(domain, &amd_iommu_pd_list, list) {
- iommu_flush_domain_devices(domain);
- iommu_flush_complete(domain);
+ /*
+ * Devices of this domain are behind this IOMMU
+ * We need to wait for completion of all commands.
+ */
+ iommu_completion_wait(amd_iommus[i]);
}
-
- spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
}
-void amd_iommu_flush_all_devices(void)
-{
- iommu_flush_all_domain_devices();
-}
/*
- * This function uses heavy locking and may disable irqs for some time. But
- * this is no issue because it is only called during resume.
+ * This function flushes the DTEs for all devices in domain
*/
-void amd_iommu_flush_all_domains(void)
+static void domain_flush_devices(struct protection_domain *domain)
{
- struct protection_domain *domain;
+ struct iommu_dev_data *dev_data;
unsigned long flags;
- spin_lock_irqsave(&amd_iommu_pd_lock, flags);
-
- list_for_each_entry(domain, &amd_iommu_pd_list, list) {
- spin_lock(&domain->lock);
- iommu_flush_tlb_pde(domain);
- iommu_flush_complete(domain);
- spin_unlock(&domain->lock);
- }
-
- spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
-}
-
-static void reset_iommu_command_buffer(struct amd_iommu *iommu)
-{
- pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
-
- if (iommu->reset_in_progress)
- panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
+ spin_lock_irqsave(&domain->lock, flags);
- amd_iommu_reset_cmd_buffer(iommu);
- amd_iommu_flush_all_devices();
- amd_iommu_flush_all_domains();
+ list_for_each_entry(dev_data, &domain->dev_list, list)
+ device_flush_dte(dev_data->dev);
- iommu->reset_in_progress = false;
+ spin_unlock_irqrestore(&domain->lock, flags);
}
/****************************************************************************
@@ -1410,17 +1472,22 @@ static bool dma_ops_domain(struct protection_domain *domain)
return domain->flags & PD_DMA_OPS_MASK;
}
-static void set_dte_entry(u16 devid, struct protection_domain *domain)
+static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
{
u64 pte_root = virt_to_phys(domain->pt_root);
+ u32 flags = 0;
pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
<< DEV_ENTRY_MODE_SHIFT;
pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
- amd_iommu_dev_table[devid].data[2] = domain->id;
- amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
- amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
+ if (ats)
+ flags |= DTE_FLAG_IOTLB;
+
+ amd_iommu_dev_table[devid].data[3] |= flags;
+ amd_iommu_dev_table[devid].data[2] = domain->id;
+ amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
+ amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
}
static void clear_dte_entry(u16 devid)
@@ -1437,34 +1504,42 @@ static void do_attach(struct device *dev, struct protection_domain *domain)
{
struct iommu_dev_data *dev_data;
struct amd_iommu *iommu;
+ struct pci_dev *pdev;
+ bool ats = false;
u16 devid;
devid = get_device_id(dev);
iommu = amd_iommu_rlookup_table[devid];
dev_data = get_dev_data(dev);
+ pdev = to_pci_dev(dev);
+
+ if (amd_iommu_iotlb_sup)
+ ats = pci_ats_enabled(pdev);
/* Update data structures */
dev_data->domain = domain;
list_add(&dev_data->list, &domain->dev_list);
- set_dte_entry(devid, domain);
+ set_dte_entry(devid, domain, ats);
/* Do reference counting */
domain->dev_iommu[iommu->index] += 1;
domain->dev_cnt += 1;
/* Flush the DTE entry */
- iommu_flush_device(dev);
+ device_flush_dte(dev);
}
static void do_detach(struct device *dev)
{
struct iommu_dev_data *dev_data;
struct amd_iommu *iommu;
+ struct pci_dev *pdev;
u16 devid;
devid = get_device_id(dev);
iommu = amd_iommu_rlookup_table[devid];
dev_data = get_dev_data(dev);
+ pdev = to_pci_dev(dev);
/* decrease reference counters */
dev_data->domain->dev_iommu[iommu->index] -= 1;
@@ -1476,7 +1551,7 @@ static void do_detach(struct device *dev)
clear_dte_entry(devid);
/* Flush the DTE entry */
- iommu_flush_device(dev);
+ device_flush_dte(dev);
}
/*
@@ -1539,9 +1614,13 @@ out_unlock:
static int attach_device(struct device *dev,
struct protection_domain *domain)
{
+ struct pci_dev *pdev = to_pci_dev(dev);
unsigned long flags;
int ret;
+ if (amd_iommu_iotlb_sup)
+ pci_enable_ats(pdev, PAGE_SHIFT);
+
write_lock_irqsave(&amd_iommu_devtable_lock, flags);
ret = __attach_device(dev, domain);
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
@@ -1551,7 +1630,7 @@ static int attach_device(struct device *dev,
* left the caches in the IOMMU dirty. So we have to flush
* here to evict all dirty stuff.
*/
- iommu_flush_tlb_pde(domain);
+ domain_flush_tlb_pde(domain);
return ret;
}
@@ -1598,12 +1677,16 @@ static void __detach_device(struct device *dev)
*/
static void detach_device(struct device *dev)
{
+ struct pci_dev *pdev = to_pci_dev(dev);
unsigned long flags;
/* lock device table */
write_lock_irqsave(&amd_iommu_devtable_lock, flags);
__detach_device(dev);
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+
+ if (amd_iommu_iotlb_sup && pci_ats_enabled(pdev))
+ pci_disable_ats(pdev);
}
/*
@@ -1692,7 +1775,7 @@ static int device_change_notifier(struct notifier_block *nb,
goto out;
}
- iommu_flush_device(dev);
+ device_flush_dte(dev);
iommu_completion_wait(iommu);
out:
@@ -1753,8 +1836,9 @@ static void update_device_table(struct protection_domain *domain)
struct iommu_dev_data *dev_data;
list_for_each_entry(dev_data, &domain->dev_list, list) {
+ struct pci_dev *pdev = to_pci_dev(dev_data->dev);
u16 devid = get_device_id(dev_data->dev);
- set_dte_entry(devid, domain);
+ set_dte_entry(devid, domain, pci_ats_enabled(pdev));
}
}
@@ -1764,8 +1848,9 @@ static void update_domain(struct protection_domain *domain)
return;
update_device_table(domain);
- iommu_flush_domain_devices(domain);
- iommu_flush_tlb_pde(domain);
+
+ domain_flush_devices(domain);
+ domain_flush_tlb_pde(domain);
domain->updated = false;
}
@@ -1924,10 +2009,10 @@ retry:
ADD_STATS_COUNTER(alloced_io_mem, size);
if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
- iommu_flush_tlb(&dma_dom->domain);
+ domain_flush_tlb(&dma_dom->domain);
dma_dom->need_flush = false;
} else if (unlikely(amd_iommu_np_cache))
- iommu_flush_pages(&dma_dom->domain, address, size);
+ domain_flush_pages(&dma_dom->domain, address, size);
out:
return address;
@@ -1976,7 +2061,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
dma_ops_free_addresses(dma_dom, dma_addr, pages);
if (amd_iommu_unmap_flush || dma_dom->need_flush) {
- iommu_flush_pages(&dma_dom->domain, flush_addr, size);
+ domain_flush_pages(&dma_dom->domain, flush_addr, size);
dma_dom->need_flush = false;
}
}
@@ -2012,7 +2097,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page,
if (addr == DMA_ERROR_CODE)
goto out;
- iommu_flush_complete(domain);
+ domain_flush_complete(domain);
out:
spin_unlock_irqrestore(&domain->lock, flags);
@@ -2039,7 +2124,7 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
__unmap_single(domain->priv, dma_addr, size, dir);
- iommu_flush_complete(domain);
+ domain_flush_complete(domain);
spin_unlock_irqrestore(&domain->lock, flags);
}
@@ -2104,7 +2189,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
goto unmap;
}
- iommu_flush_complete(domain);
+ domain_flush_complete(domain);
out:
spin_unlock_irqrestore(&domain->lock, flags);
@@ -2150,7 +2235,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
s->dma_address = s->dma_length = 0;
}
- iommu_flush_complete(domain);
+ domain_flush_complete(domain);
spin_unlock_irqrestore(&domain->lock, flags);
}
@@ -2200,7 +2285,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
goto out_free;
}
- iommu_flush_complete(domain);
+ domain_flush_complete(domain);
spin_unlock_irqrestore(&domain->lock, flags);
@@ -2232,7 +2317,7 @@ static void free_coherent(struct device *dev, size_t size,
__unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
- iommu_flush_complete(domain);
+ domain_flush_complete(domain);
spin_unlock_irqrestore(&domain->lock, flags);
@@ -2476,7 +2561,7 @@ static void amd_iommu_detach_device(struct iommu_domain *dom,
if (!iommu)
return;
- iommu_flush_device(dev);
+ device_flush_dte(dev);
iommu_completion_wait(iommu);
}
@@ -2542,7 +2627,7 @@ static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
unmap_size = iommu_unmap_page(domain, iova, page_size);
mutex_unlock(&domain->api_lock);
- iommu_flush_tlb_pde(domain);
+ domain_flush_tlb_pde(domain);
return get_order(unmap_size);
}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 246d727b65b7..28b078133688 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -137,6 +137,7 @@ int amd_iommus_present;
/* IOMMUs have a non-present cache? */
bool amd_iommu_np_cache __read_mostly;
+bool amd_iommu_iotlb_sup __read_mostly = true;
/*
* The ACPI table parsing functions set this variable on an error
@@ -180,6 +181,12 @@ static u32 dev_table_size; /* size of the device table */
static u32 alias_table_size; /* size of the alias table */
static u32 rlookup_table_size; /* size if the rlookup table */
+/*
+ * This function flushes all internal caches of
+ * the IOMMU used by this driver.
+ */
+extern void iommu_flush_all_caches(struct amd_iommu *iommu);
+
static inline void update_last_devid(u16 devid)
{
if (devid > amd_iommu_last_bdf)
@@ -293,9 +300,23 @@ static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
/* Function to enable the hardware */
static void iommu_enable(struct amd_iommu *iommu)
{
- printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n",
+ static const char * const feat_str[] = {
+ "PreF", "PPR", "X2APIC", "NX", "GT", "[5]",
+ "IA", "GA", "HE", "PC", NULL
+ };
+ int i;
+
+ printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx",
dev_name(&iommu->dev->dev), iommu->cap_ptr);
+ if (iommu->cap & (1 << IOMMU_CAP_EFR)) {
+ printk(KERN_CONT " extended features: ");
+ for (i = 0; feat_str[i]; ++i)
+ if (iommu_feature(iommu, (1ULL << i)))
+ printk(KERN_CONT " %s", feat_str[i]);
+ }
+ printk(KERN_CONT "\n");
+
iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
}
@@ -651,7 +672,7 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
static void __init init_iommu_from_pci(struct amd_iommu *iommu)
{
int cap_ptr = iommu->cap_ptr;
- u32 range, misc;
+ u32 range, misc, low, high;
int i, j;
pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
@@ -667,6 +688,15 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
MMIO_GET_LD(range));
iommu->evt_msi_num = MMIO_MSI_NUM(misc);
+ if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB)))
+ amd_iommu_iotlb_sup = false;
+
+ /* read extended feature bits */
+ low = readl(iommu->mmio_base + MMIO_EXT_FEATURES);
+ high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4);
+
+ iommu->features = ((u64)high << 32) | low;
+
if (!is_rd890_iommu(iommu->dev))
return;
@@ -1244,6 +1274,7 @@ static void enable_iommus(void)
iommu_set_exclusion_range(iommu);
iommu_init_msi(iommu);
iommu_enable(iommu);
+ iommu_flush_all_caches(iommu);
}
}
@@ -1274,8 +1305,8 @@ static void amd_iommu_resume(void)
* we have to flush after the IOMMUs are enabled because a
* disabled IOMMU will never execute the commands we send
*/
- amd_iommu_flush_all_devices();
- amd_iommu_flush_all_domains();
+ for_each_iommu(iommu)
+ iommu_flush_all_caches(iommu);
}
static int amd_iommu_suspend(void)
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 86d1ad4962a7..73fb469908c6 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -499,7 +499,7 @@ out:
* Don't enable translation yet but enable GART IO and CPU
* accesses and set DISTLBWALKPRB since GART table memory is UC.
*/
- u32 ctl = DISTLBWALKPRB | aper_order << 1;
+ u32 ctl = aper_order << 1;
bus = amd_nb_bus_dev_ranges[i].bus;
dev_base = amd_nb_bus_dev_ranges[i].dev_base;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 68df09bba92e..45fd33d1fd3a 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -128,8 +128,8 @@ static int __init parse_noapic(char *str)
}
early_param("noapic", parse_noapic);
-static int io_apic_setup_irq_pin_once(unsigned int irq, int node,
- struct io_apic_irq_attr *attr);
+static int io_apic_setup_irq_pin(unsigned int irq, int node,
+ struct io_apic_irq_attr *attr);
/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
void mp_save_irq(struct mpc_intsrc *m)
@@ -3570,7 +3570,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
}
#endif /* CONFIG_HT_IRQ */
-int
+static int
io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
{
struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
@@ -3585,8 +3585,8 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
return ret;
}
-static int io_apic_setup_irq_pin_once(unsigned int irq, int node,
- struct io_apic_irq_attr *attr)
+int io_apic_setup_irq_pin_once(unsigned int irq, int node,
+ struct io_apic_irq_attr *attr)
{
unsigned int id = attr->ioapic, pin = attr->ioapic_pin;
int ret;
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 0b4be431c620..adee12e0da1f 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -228,6 +228,7 @@
#include <linux/kthread.h>
#include <linux/jiffies.h>
#include <linux/acpi.h>
+#include <linux/syscore_ops.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -1238,6 +1239,7 @@ static int suspend(int vetoable)
local_irq_disable();
sysdev_suspend(PMSG_SUSPEND);
+ syscore_suspend();
local_irq_enable();
@@ -1255,6 +1257,7 @@ static int suspend(int vetoable)
apm_error("suspend", err);
err = (err == APM_SUCCESS) ? 0 : -EIO;
+ syscore_resume();
sysdev_resume();
local_irq_enable();
@@ -1280,6 +1283,7 @@ static void standby(void)
local_irq_disable();
sysdev_suspend(PMSG_SUSPEND);
+ syscore_suspend();
local_irq_enable();
err = set_system_power_state(APM_STATE_STANDBY);
@@ -1287,6 +1291,7 @@ static void standby(void)
apm_error("standby", err);
local_irq_disable();
+ syscore_resume();
sysdev_resume();
local_irq_enable();
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 3ecece0217ef..bb9eb29a52dd 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -615,6 +615,25 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
/* As a rule processors have APIC timer running in deep C states */
if (c->x86 >= 0xf && !cpu_has_amd_erratum(amd_erratum_400))
set_cpu_cap(c, X86_FEATURE_ARAT);
+
+ /*
+ * Disable GART TLB Walk Errors on Fam10h. We do this here
+ * because this is always needed when GART is enabled, even in a
+ * kernel which has no MCE support built in.
+ */
+ if (c->x86 == 0x10) {
+ /*
+ * BIOS should disable GartTlbWlk Errors themself. If
+ * it doesn't do it here as suggested by the BKDG.
+ *
+ * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
+ */
+ u64 mask;
+
+ rdmsrl(MSR_AMD64_MCx_MASK(4), mask);
+ mask |= (1 << 10);
+ wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
+ }
}
#ifdef CONFIG_X86_32
@@ -679,7 +698,7 @@ cpu_dev_register(amd_cpu_dev);
*/
const int amd_erratum_400[] =
- AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
+ AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0x0f, 0x4, 0x2, 0xff, 0xf),
AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
EXPORT_SYMBOL_GPL(amd_erratum_400);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index eed3673a8656..e638689279d3 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -586,8 +586,12 @@ static int x86_setup_perfctr(struct perf_event *event)
return -EOPNOTSUPP;
}
+ /*
+ * Do not allow config1 (extended registers) to propagate,
+ * there's no sane user-space generalization yet:
+ */
if (attr->type == PERF_TYPE_RAW)
- return x86_pmu_extra_regs(event->attr.config, event);
+ return 0;
if (attr->type == PERF_TYPE_HW_CACHE)
return set_ext_hw_attr(hwc, event);
@@ -609,8 +613,8 @@ static int x86_setup_perfctr(struct perf_event *event)
/*
* Branch tracing:
*/
- if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
- (hwc->sample_period == 1)) {
+ if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
+ !attr->freq && hwc->sample_period == 1) {
/* BTS is not supported by this architecture. */
if (!x86_pmu.bts_active)
return -EOPNOTSUPP;
@@ -1284,6 +1288,16 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
cpuc = &__get_cpu_var(cpu_hw_events);
+ /*
+ * Some chipsets need to unmask the LVTPC in a particular spot
+ * inside the nmi handler. As a result, the unmasking was pushed
+ * into all the nmi handlers.
+ *
+ * This generic handler doesn't seem to have any issues where the
+ * unmasking occurs so it was left at the top.
+ */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
if (!test_bit(idx, cpuc->active_mask)) {
/*
@@ -1370,8 +1384,6 @@ perf_event_nmi_handler(struct notifier_block *self,
return NOTIFY_DONE;
}
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-
handled = x86_pmu.handle_irq(args->regs);
if (!handled)
return NOTIFY_DONE;
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 461f62bbd774..cf4e369cea67 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -8,7 +8,7 @@ static __initconst const u64 amd_hw_cache_event_ids
[ C(L1D) ] = {
[ C(OP_READ) ] = {
[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
- [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
+ [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */
},
[ C(OP_WRITE) ] = {
[ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
@@ -427,7 +427,9 @@ static __initconst const struct x86_pmu amd_pmu = {
*
* Exceptions:
*
+ * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*)
* 0x003 FP PERF_CTL[3]
+ * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*)
* 0x00B FP PERF_CTL[3]
* 0x00D FP PERF_CTL[3]
* 0x023 DE PERF_CTL[2:0]
@@ -448,6 +450,8 @@ static __initconst const struct x86_pmu amd_pmu = {
* 0x0DF LS PERF_CTL[5:0]
* 0x1D6 EX PERF_CTL[5:0]
* 0x1D8 EX PERF_CTL[5:0]
+ *
+ * (*) depending on the umask all FPU counters may be used
*/
static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
@@ -460,18 +464,28 @@ static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
static struct event_constraint *
amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
{
- unsigned int event_code = amd_get_event_code(&event->hw);
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned int event_code = amd_get_event_code(hwc);
switch (event_code & AMD_EVENT_TYPE_MASK) {
case AMD_EVENT_FP:
switch (event_code) {
+ case 0x000:
+ if (!(hwc->config & 0x0000F000ULL))
+ break;
+ if (!(hwc->config & 0x00000F00ULL))
+ break;
+ return &amd_f15_PMC3;
+ case 0x004:
+ if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
+ break;
+ return &amd_f15_PMC3;
case 0x003:
case 0x00B:
case 0x00D:
return &amd_f15_PMC3;
- default:
- return &amd_f15_PMC53;
}
+ return &amd_f15_PMC53;
case AMD_EVENT_LS:
case AMD_EVENT_DC:
case AMD_EVENT_EX_LS:
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 8fc2b2cee1da..447a28de6f09 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -25,7 +25,7 @@ struct intel_percore {
/*
* Intel PerfMon, used on Core and later.
*/
-static const u64 intel_perfmon_event_map[] =
+static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
{
[PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
[PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
@@ -184,26 +184,23 @@ static __initconst const u64 snb_hw_cache_event_ids
},
},
[ C(LL ) ] = {
- /*
- * TBD: Need Off-core Response Performance Monitoring support
- */
[ C(OP_READ) ] = {
- /* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */
+ /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
[ C(RESULT_ACCESS) ] = 0x01b7,
- /* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */
- [ C(RESULT_MISS) ] = 0x01bb,
+ /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
},
[ C(OP_WRITE) ] = {
- /* OFFCORE_RESPONSE_0.ANY_RFO.LOCAL_CACHE */
+ /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
[ C(RESULT_ACCESS) ] = 0x01b7,
- /* OFFCORE_RESPONSE_1.ANY_RFO.ANY_LLC_MISS */
- [ C(RESULT_MISS) ] = 0x01bb,
+ /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
},
[ C(OP_PREFETCH) ] = {
- /* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */
+ /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
[ C(RESULT_ACCESS) ] = 0x01b7,
- /* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */
- [ C(RESULT_MISS) ] = 0x01bb,
+ /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
},
},
[ C(DTLB) ] = {
@@ -285,26 +282,26 @@ static __initconst const u64 westmere_hw_cache_event_ids
},
[ C(LL ) ] = {
[ C(OP_READ) ] = {
- /* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */
+ /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
[ C(RESULT_ACCESS) ] = 0x01b7,
- /* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */
- [ C(RESULT_MISS) ] = 0x01bb,
+ /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
},
/*
* Use RFO, not WRITEBACK, because a write miss would typically occur
* on RFO.
*/
[ C(OP_WRITE) ] = {
- /* OFFCORE_RESPONSE_1.ANY_RFO.LOCAL_CACHE */
- [ C(RESULT_ACCESS) ] = 0x01bb,
- /* OFFCORE_RESPONSE_0.ANY_RFO.ANY_LLC_MISS */
+ /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+ [ C(RESULT_ACCESS) ] = 0x01b7,
+ /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
[ C(RESULT_MISS) ] = 0x01b7,
},
[ C(OP_PREFETCH) ] = {
- /* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */
+ /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
[ C(RESULT_ACCESS) ] = 0x01b7,
- /* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */
- [ C(RESULT_MISS) ] = 0x01bb,
+ /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+ [ C(RESULT_MISS) ] = 0x01b7,
},
},
[ C(DTLB) ] = {
@@ -352,16 +349,36 @@ static __initconst const u64 westmere_hw_cache_event_ids
};
/*
- * OFFCORE_RESPONSE MSR bits (subset), See IA32 SDM Vol 3 30.6.1.3
+ * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits;
+ * See IA32 SDM Vol 3B 30.6.1.3
*/
-#define DMND_DATA_RD (1 << 0)
-#define DMND_RFO (1 << 1)
-#define DMND_WB (1 << 3)
-#define PF_DATA_RD (1 << 4)
-#define PF_DATA_RFO (1 << 5)
-#define RESP_UNCORE_HIT (1 << 8)
-#define RESP_MISS (0xf600) /* non uncore hit */
+#define NHM_DMND_DATA_RD (1 << 0)
+#define NHM_DMND_RFO (1 << 1)
+#define NHM_DMND_IFETCH (1 << 2)
+#define NHM_DMND_WB (1 << 3)
+#define NHM_PF_DATA_RD (1 << 4)
+#define NHM_PF_DATA_RFO (1 << 5)
+#define NHM_PF_IFETCH (1 << 6)
+#define NHM_OFFCORE_OTHER (1 << 7)
+#define NHM_UNCORE_HIT (1 << 8)
+#define NHM_OTHER_CORE_HIT_SNP (1 << 9)
+#define NHM_OTHER_CORE_HITM (1 << 10)
+ /* reserved */
+#define NHM_REMOTE_CACHE_FWD (1 << 12)
+#define NHM_REMOTE_DRAM (1 << 13)
+#define NHM_LOCAL_DRAM (1 << 14)
+#define NHM_NON_DRAM (1 << 15)
+
+#define NHM_ALL_DRAM (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM)
+
+#define NHM_DMND_READ (NHM_DMND_DATA_RD)
+#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB)
+#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
+
+#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
+#define NHM_L3_MISS (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS)
static __initconst const u64 nehalem_hw_cache_extra_regs
[PERF_COUNT_HW_CACHE_MAX]
@@ -370,16 +387,16 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
{
[ C(LL ) ] = {
[ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = DMND_DATA_RD|RESP_UNCORE_HIT,
- [ C(RESULT_MISS) ] = DMND_DATA_RD|RESP_MISS,
+ [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS,
+ [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS,
},
[ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = DMND_RFO|DMND_WB|RESP_UNCORE_HIT,
- [ C(RESULT_MISS) ] = DMND_RFO|DMND_WB|RESP_MISS,
+ [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS,
+ [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS,
},
[ C(OP_PREFETCH) ] = {
- [ C(RESULT_ACCESS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_UNCORE_HIT,
- [ C(RESULT_MISS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_MISS,
+ [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
+ [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
},
}
};
@@ -391,12 +408,12 @@ static __initconst const u64 nehalem_hw_cache_event_ids
{
[ C(L1D) ] = {
[ C(OP_READ) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
- [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
+ [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
+ [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
},
[ C(OP_WRITE) ] = {
- [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
- [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
+ [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */
+ [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
},
[ C(OP_PREFETCH) ] = {
[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
@@ -933,6 +950,16 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
cpuc = &__get_cpu_var(cpu_hw_events);
+ /*
+ * Some chipsets need to unmask the LVTPC in a particular spot
+ * inside the nmi handler. As a result, the unmasking was pushed
+ * into all the nmi handlers.
+ *
+ * This handler doesn't seem to have any issues with the unmasking
+ * so it was left at the top.
+ */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+
intel_pmu_disable_all();
handled = intel_pmu_drain_bts_buffer();
status = intel_pmu_get_status();
@@ -998,6 +1025,9 @@ intel_bts_constraints(struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
unsigned int hw_event, bts_event;
+ if (event->attr.freq)
+ return NULL;
+
hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
@@ -1305,7 +1335,7 @@ static void intel_clovertown_quirks(void)
* AJ106 could possibly be worked around by not allowing LBR
* usage from PEBS, including the fixup.
* AJ68 could possibly be worked around by always programming
- * a pebs_event_reset[0] value and coping with the lost events.
+ * a pebs_event_reset[0] value and coping with the lost events.
*
* But taken together it might just make sense to not enable PEBS on
* these chips.
@@ -1409,6 +1439,18 @@ static __init int intel_pmu_init(void)
x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
x86_pmu.extra_regs = intel_nehalem_extra_regs;
+
+ if (ebx & 0x40) {
+ /*
+ * Erratum AAJ80 detected, we work it around by using
+ * the BR_MISP_EXEC.ANY event. This will over-count
+ * branch-misses, but it's still much better than the
+ * architectural event which is often completely bogus:
+ */
+ intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
+
+ pr_cont("erratum AAJ80 worked around, ");
+ }
pr_cont("Nehalem events, ");
break;
@@ -1425,6 +1467,7 @@ static __init int intel_pmu_init(void)
case 37: /* 32 nm nehalem, "Clarkdale" */
case 44: /* 32 nm nehalem, "Gulftown" */
+ case 47: /* 32 nm Xeon E7 */
memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index c2520e178d32..e93fcd55fae1 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -947,14 +947,23 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
if (!x86_perf_event_set_period(event))
continue;
if (perf_event_overflow(event, 1, &data, regs))
- p4_pmu_disable_event(event);
+ x86_pmu_stop(event, 0);
}
- if (handled) {
- /* p4 quirk: unmask it again */
- apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
+ if (handled)
inc_irq_stat(apic_perf_irqs);
- }
+
+ /*
+ * When dealing with the unmasking of the LVTPC on P4 perf hw, it has
+ * been observed that the OVF bit flag has to be cleared first _before_
+ * the LVTPC can be unmasked.
+ *
+ * The reason is the NMI line will continue to be asserted while the OVF
+ * bit is set. This causes a second NMI to generate if the LVTPC is
+ * unmasked before the OVF bit is cleared, leading to unknown NMI
+ * messages.
+ */
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
return handled;
}
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 706a9fb46a58..e90f08458e6b 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -391,7 +391,7 @@ static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
- return io_apic_setup_irq_pin(*out_hwirq, cpu_to_node(0), &attr);
+ return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr);
}
static void __init ioapic_add_ofnode(struct device_node *np)
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 82ada01625b9..b117efd24f71 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -81,6 +81,9 @@ static u32 gart_unmapped_entry;
#define AGPEXTERN
#endif
+/* GART can only remap to physical addresses < 1TB */
+#define GART_MAX_PHYS_ADDR (1ULL << 40)
+
/* backdoor interface to AGP driver */
AGPEXTERN int agp_memory_reserved;
AGPEXTERN __u32 *agp_gatt_table;
@@ -212,9 +215,13 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
size_t size, int dir, unsigned long align_mask)
{
unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
- unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
+ unsigned long iommu_page;
int i;
+ if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR))
+ return bad_dma_addr;
+
+ iommu_page = alloc_iommu(dev, npages, align_mask);
if (iommu_page == -1) {
if (!nonforced_iommu(dev, phys_mem, size))
return phys_mem;
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
index 55d745ec1181..35ccf75696eb 100644
--- a/arch/x86/kernel/pci-iommu_table.c
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -50,20 +50,14 @@ void __init check_iommu_entries(struct iommu_table_entry *start,
struct iommu_table_entry *finish)
{
struct iommu_table_entry *p, *q, *x;
- char sym_p[KSYM_SYMBOL_LEN];
- char sym_q[KSYM_SYMBOL_LEN];
/* Simple cyclic dependency checker. */
for (p = start; p < finish; p++) {
q = find_dependents_of(start, finish, p);
x = find_dependents_of(start, finish, q);
if (p == x) {
- sprint_symbol(sym_p, (unsigned long)p->detect);
- sprint_symbol(sym_q, (unsigned long)q->detect);
-
- printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %s depends" \
- " on %s and vice-versa. BREAKING IT.\n",
- sym_p, sym_q);
+ printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n",
+ p->detect, q->detect);
/* Heavy handed way..*/
x->depend = 0;
}
@@ -72,12 +66,8 @@ void __init check_iommu_entries(struct iommu_table_entry *start,
for (p = start; p < finish; p++) {
q = find_dependents_of(p, finish, p);
if (q && q > p) {
- sprint_symbol(sym_p, (unsigned long)p->detect);
- sprint_symbol(sym_q, (unsigned long)q->detect);
-
- printk(KERN_ERR "EXECUTION ORDER INVALID! %s "\
- "should be called before %s!\n",
- sym_p, sym_q);
+ printk(KERN_ERR "EXECUTION ORDER INVALID! %pS should be called before %pS!\n",
+ p->detect, q->detect);
}
}
}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 45892dc4b72a..f65e5b521dbd 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -608,6 +608,9 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
unsigned len, type;
struct perf_event *bp;
+ if (ptrace_get_breakpoints(tsk) < 0)
+ return -ESRCH;
+
data &= ~DR_CONTROL_RESERVED;
old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
restore:
@@ -655,6 +658,9 @@ restore:
}
goto restore;
}
+
+ ptrace_put_breakpoints(tsk);
+
return ((orig_ret < 0) ? orig_ret : rc);
}
@@ -668,10 +674,17 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
if (n < HBP_NUM) {
struct perf_event *bp;
+
+ if (ptrace_get_breakpoints(tsk) < 0)
+ return -ESRCH;
+
bp = thread->ptrace_bps[n];
if (!bp)
- return 0;
- val = bp->hw.info.address;
+ val = 0;
+ else
+ val = bp->hw.info.address;
+
+ ptrace_put_breakpoints(tsk);
} else if (n == 6) {
val = thread->debugreg6;
} else if (n == 7) {
@@ -686,6 +699,10 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
struct perf_event *bp;
struct thread_struct *t = &tsk->thread;
struct perf_event_attr attr;
+ int err = 0;
+
+ if (ptrace_get_breakpoints(tsk) < 0)
+ return -ESRCH;
if (!t->ptrace_bps[nr]) {
ptrace_breakpoint_init(&attr);
@@ -709,24 +726,23 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
* writing for the user. And anyway this is the previous
* behaviour.
*/
- if (IS_ERR(bp))
- return PTR_ERR(bp);
+ if (IS_ERR(bp)) {
+ err = PTR_ERR(bp);
+ goto put;
+ }
t->ptrace_bps[nr] = bp;
} else {
- int err;
-
bp = t->ptrace_bps[nr];
attr = bp->attr;
attr.bp_addr = addr;
err = modify_user_hw_breakpoint(bp, &attr);
- if (err)
- return err;
}
-
- return 0;
+put:
+ ptrace_put_breakpoints(tsk);
+ return err;
}
/*
diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S
index 29092b38d816..1d5c46df0d78 100644
--- a/arch/x86/kernel/reboot_32.S
+++ b/arch/x86/kernel/reboot_32.S
@@ -21,26 +21,26 @@ r_base = .
/* Get our own relocated address */
call 1f
1: popl %ebx
- subl $1b, %ebx
+ subl $(1b - r_base), %ebx
/* Compute the equivalent real-mode segment */
movl %ebx, %ecx
shrl $4, %ecx
/* Patch post-real-mode segment jump */
- movw dispatch_table(%ebx,%eax,2),%ax
- movw %ax, 101f(%ebx)
- movw %cx, 102f(%ebx)
+ movw (dispatch_table - r_base)(%ebx,%eax,2),%ax
+ movw %ax, (101f - r_base)(%ebx)
+ movw %cx, (102f - r_base)(%ebx)
/* Set up the IDT for real mode. */
- lidtl machine_real_restart_idt(%ebx)
+ lidtl (machine_real_restart_idt - r_base)(%ebx)
/*
* Set up a GDT from which we can load segment descriptors for real
* mode. The GDT is not used in real mode; it is just needed here to
* prepare the descriptors.
*/
- lgdtl machine_real_restart_gdt(%ebx)
+ lgdtl (machine_real_restart_gdt - r_base)(%ebx)
/*
* Load the data segment registers with 16-bit compatible values
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 5a0484a95ad6..4be9b398470e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -976,6 +976,11 @@ void __init setup_arch(char **cmdline_p)
paging_init();
x86_init.paging.pagetable_setup_done(swapper_pg_dir);
+ if (boot_cpu_data.cpuid_level >= 0) {
+ /* A CPU has %cr4 if and only if it has CPUID */
+ mmu_cr4_features = read_cr4();
+ }
+
#ifdef CONFIG_X86_32
/* sync back kernel address range */
clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 58f517b59645..934b4c6b0bf9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2395,9 +2395,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
int i;
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- for (i = 1; *nent < maxnent; ++i) {
- if (entry[i - 1].eax == 0 && i != 2)
- break;
+ for (i = 1; *nent < maxnent && i < 64; ++i) {
+ if (entry[i].eax == 0)
+ continue;
do_cpuid_1_ent(&entry[i], function, i);
entry[i].flags |=
KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
@@ -4958,12 +4958,6 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
best = e;
break;
}
- /*
- * Both basic or both extended?
- */
- if (((e->function ^ function) & 0x80000000) == 0)
- if (!best || e->function > best->function)
- best = e;
}
return best;
}
@@ -4983,6 +4977,27 @@ not_found:
return 36;
}
+/*
+ * If no match is found, check whether we exceed the vCPU's limit
+ * and return the content of the highest valid _standard_ leaf instead.
+ * This is to satisfy the CPUID specification.
+ */
+static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
+ u32 function, u32 index)
+{
+ struct kvm_cpuid_entry2 *maxlevel;
+
+ maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
+ if (!maxlevel || maxlevel->eax >= function)
+ return NULL;
+ if (function & 0x80000000) {
+ maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
+ if (!maxlevel)
+ return NULL;
+ }
+ return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
+}
+
void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
u32 function, index;
@@ -4995,6 +5010,10 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
best = kvm_find_cpuid_entry(vcpu, function, index);
+
+ if (!best)
+ best = check_cpuid_limit(vcpu, function, index);
+
if (best) {
kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 9559d360fde7..745258dfc4dc 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -213,53 +213,48 @@ int early_cpu_to_node(int cpu)
return per_cpu(x86_cpu_to_node_map, cpu);
}
-struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
+void debug_cpumask_set_cpu(int cpu, int node, bool enable)
{
- int node = early_cpu_to_node(cpu);
struct cpumask *mask;
char buf[64];
if (node == NUMA_NO_NODE) {
/* early_cpu_to_node() already emits a warning and trace */
- return NULL;
+ return;
}
mask = node_to_cpumask_map[node];
if (!mask) {
pr_err("node_to_cpumask_map[%i] NULL\n", node);
dump_stack();
- return NULL;
+ return;
}
+ if (enable)
+ cpumask_set_cpu(cpu, mask);
+ else
+ cpumask_clear_cpu(cpu, mask);
+
cpulist_scnprintf(buf, sizeof(buf), mask);
printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
enable ? "numa_add_cpu" : "numa_remove_cpu",
cpu, node, buf);
- return mask;
+ return;
}
# ifndef CONFIG_NUMA_EMU
-static void __cpuinit numa_set_cpumask(int cpu, int enable)
+static void __cpuinit numa_set_cpumask(int cpu, bool enable)
{
- struct cpumask *mask;
-
- mask = debug_cpumask_set_cpu(cpu, enable);
- if (!mask)
- return;
-
- if (enable)
- cpumask_set_cpu(cpu, mask);
- else
- cpumask_clear_cpu(cpu, mask);
+ debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}
void __cpuinit numa_add_cpu(int cpu)
{
- numa_set_cpumask(cpu, 1);
+ numa_set_cpumask(cpu, true);
}
void __cpuinit numa_remove_cpu(int cpu)
{
- numa_set_cpumask(cpu, 0);
+ numa_set_cpumask(cpu, false);
}
# endif /* !CONFIG_NUMA_EMU */
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index e8c00cc72033..85b52fc03084 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -306,7 +306,7 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
bi->end = min(bi->end, high);
/* and there's no empty block */
- if (bi->start == bi->end) {
+ if (bi->start >= bi->end) {
numa_remove_memblk_from(i--, mi);
continue;
}
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index ad091e4cff17..de84cc140379 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -454,10 +454,9 @@ void __cpuinit numa_remove_cpu(int cpu)
cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
}
#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
-static void __cpuinit numa_set_cpumask(int cpu, int enable)
+static void __cpuinit numa_set_cpumask(int cpu, bool enable)
{
- struct cpumask *mask;
- int nid, physnid, i;
+ int nid, physnid;
nid = early_cpu_to_node(cpu);
if (nid == NUMA_NO_NODE) {
@@ -467,28 +466,21 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable)
physnid = emu_nid_to_phys[nid];
- for_each_online_node(i) {
+ for_each_online_node(nid) {
if (emu_nid_to_phys[nid] != physnid)
continue;
- mask = debug_cpumask_set_cpu(cpu, enable);
- if (!mask)
- return;
-
- if (enable)
- cpumask_set_cpu(cpu, mask);
- else
- cpumask_clear_cpu(cpu, mask);
+ debug_cpumask_set_cpu(cpu, nid, enable);
}
}
void __cpuinit numa_add_cpu(int cpu)
{
- numa_set_cpumask(cpu, 1);
+ numa_set_cpumask(cpu, true);
}
void __cpuinit numa_remove_cpu(int cpu)
{
- numa_set_cpumask(cpu, 0);
+ numa_set_cpumask(cpu, false);
}
#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index 48651c6f657d..364f36bdfad8 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -211,10 +211,12 @@ int __init get_memcfg_from_srat(void)
{
int i, j, nid;
-
if (srat_disabled())
goto out_fail;
+ if (acpi_numa_init() < 0)
+ goto out_fail;
+
if (num_memory_chunks == 0) {
printk(KERN_DEBUG
"could not find any ACPI SRAT memory areas.\n");
diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts
index dc701ea58546..e70be38ce039 100644
--- a/arch/x86/platform/ce4100/falconfalls.dts
+++ b/arch/x86/platform/ce4100/falconfalls.dts
@@ -74,6 +74,7 @@
compatible = "intel,ce4100-pci", "pci";
device_type = "pci";
bus-range = <1 1>;
+ reg = <0x0800 0x0 0x0 0x0 0x0>;
ranges = <0x2000000 0 0xdffe0000 0x2000000 0 0xdffe0000 0 0x1000>;
interrupt-parent = <&ioapic2>;
@@ -346,7 +347,7 @@
"pciclass0c03";
reg = <0x16800 0x0 0x0 0x0 0x0>;
- interrupts = <22 3>;
+ interrupts = <22 1>;
};
usb@d,1 {
@@ -356,7 +357,7 @@
"pciclass0c03";
reg = <0x16900 0x0 0x0 0x0 0x0>;
- interrupts = <22 3>;
+ interrupts = <22 1>;
};
sata@e,0 {
@@ -366,7 +367,7 @@
"pciclass0106";
reg = <0x17000 0x0 0x0 0x0 0x0>;
- interrupts = <23 3>;
+ interrupts = <23 1>;
};
flash@f,0 {
@@ -412,6 +413,7 @@
#address-cells = <2>;
#size-cells = <1>;
compatible = "isa";
+ reg = <0xf800 0x0 0x0 0x0 0x0>;
ranges = <1 0 0 0 0 0x100>;
rtc@70 {
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 5c0207bf959b..275dbc19e2cf 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -97,11 +97,11 @@ static int __init sfi_parse_mtmr(struct sfi_table_header *table)
pentry->freq_hz, pentry->irq);
if (!pentry->irq)
continue;
- mp_irq.type = MP_IOAPIC;
+ mp_irq.type = MP_INTSRC;
mp_irq.irqtype = mp_INT;
/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
mp_irq.irqflag = 5;
- mp_irq.srcbus = 0;
+ mp_irq.srcbus = MP_BUS_ISA;
mp_irq.srcbusirq = pentry->irq; /* IRQ */
mp_irq.dstapic = MP_APIC_ALL;
mp_irq.dstirq = pentry->irq;
@@ -168,10 +168,10 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n",
totallen, (u32)pentry->phys_addr, pentry->irq);
- mp_irq.type = MP_IOAPIC;
+ mp_irq.type = MP_INTSRC;
mp_irq.irqtype = mp_INT;
mp_irq.irqflag = 0xf; /* level trigger and active low */
- mp_irq.srcbus = 0;
+ mp_irq.srcbus = MP_BUS_ISA;
mp_irq.srcbusirq = pentry->irq; /* IRQ */
mp_irq.dstapic = MP_APIC_ALL;
mp_irq.dstirq = pentry->irq;
@@ -282,7 +282,7 @@ void __init x86_mrst_early_setup(void)
/* Avoid searching for BIOS MP tables */
x86_init.mpparse.find_smp_config = x86_init_noop;
x86_init.mpparse.get_smp_config = x86_init_uint_noop;
-
+ set_bit(MP_BUS_ISA, mp_bus_not_pci);
}
/*
diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c
index 04cf645feb92..73d70d65e76e 100644
--- a/arch/x86/platform/mrst/vrtc.c
+++ b/arch/x86/platform/mrst/vrtc.c
@@ -100,9 +100,11 @@ int vrtc_set_mmss(unsigned long nowtime)
void __init mrst_rtc_init(void)
{
- unsigned long vrtc_paddr = sfi_mrtc_array[0].phys_addr;
+ unsigned long vrtc_paddr;
sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
+
+ vrtc_paddr = sfi_mrtc_array[0].phys_addr;
if (!sfi_mrtc_num || !vrtc_paddr)
return;
diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c
index fe4cf8294878..c7abf13a213f 100644
--- a/arch/x86/platform/visws/visws_quirks.c
+++ b/arch/x86/platform/visws/visws_quirks.c
@@ -471,15 +471,7 @@ static unsigned int startup_piix4_master_irq(struct irq_data *data)
{
legacy_pic->init(0);
enable_cobalt_irq(data);
-}
-
-static void end_piix4_master_irq(struct irq_data *data)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&cobalt_lock, flags);
- enable_cobalt_irq(data);
- spin_unlock_irqrestore(&cobalt_lock, flags);
+ return 0;
}
static struct irq_chip piix4_master_irq_type = {
@@ -492,7 +484,7 @@ static void pii4_mask(struct irq_data *data) { }
static struct irq_chip piix4_virtual_irq_type = {
.name = "PIIX4-virtual",
- .mask = pii4_mask,
+ .irq_mask = pii4_mask,
};
/*
@@ -580,9 +572,9 @@ static struct irqaction cascade_action = {
static inline void set_piix4_virtual_irq_type(void)
{
- piix4_virtual_irq_type.enable = i8259A_chip.unmask;
- piix4_virtual_irq_type.disable = i8259A_chip.mask;
- piix4_virtual_irq_type.unmask = i8259A_chip.unmask;
+ piix4_virtual_irq_type.irq_enable = i8259A_chip.irq_unmask;
+ piix4_virtual_irq_type.irq_disable = i8259A_chip.irq_mask;
+ piix4_virtual_irq_type.irq_unmask = i8259A_chip.irq_unmask;
}
static void __init visws_pre_intr_init(void)
@@ -599,7 +591,7 @@ static void __init visws_pre_intr_init(void)
else if (i == CO_IRQ_IDE0)
chip = &cobalt_irq_type;
else if (i == CO_IRQ_IDE1)
- >chip = &cobalt_irq_type;
+ chip = &cobalt_irq_type;
else if (i == CO_IRQ_8259)
chip = &piix4_master_irq_type;
else if (i < CO_IRQ_APIC0)
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 1c7121ba18ff..5cc821cb2e09 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -39,6 +39,7 @@ config XEN_MAX_DOMAIN_MEMORY
config XEN_SAVE_RESTORE
bool
depends on XEN
+ select HIBERNATE_CALLBACKS
default y
config XEN_DEBUG_FS
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 49dbd78ec3cb..e3c6a06cf725 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -238,6 +238,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
static __init void xen_init_cpuid_mask(void)
{
unsigned int ax, bx, cx, dx;
+ unsigned int xsave_mask;
cpuid_leaf1_edx_mask =
~((1 << X86_FEATURE_MCE) | /* disable MCE */
@@ -249,24 +250,16 @@ static __init void xen_init_cpuid_mask(void)
cpuid_leaf1_edx_mask &=
~((1 << X86_FEATURE_APIC) | /* disable local APIC */
(1 << X86_FEATURE_ACPI)); /* disable ACPI */
-
ax = 1;
- cx = 0;
xen_cpuid(&ax, &bx, &cx, &dx);
- /* cpuid claims we support xsave; try enabling it to see what happens */
- if (cx & (1 << (X86_FEATURE_XSAVE % 32))) {
- unsigned long cr4;
-
- set_in_cr4(X86_CR4_OSXSAVE);
-
- cr4 = read_cr4();
+ xsave_mask =
+ (1 << (X86_FEATURE_XSAVE % 32)) |
+ (1 << (X86_FEATURE_OSXSAVE % 32));
- if ((cr4 & X86_CR4_OSXSAVE) == 0)
- cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32));
-
- clear_in_cr4(X86_CR4_OSXSAVE);
- }
+ /* Xen will set CR4.OSXSAVE if supported and not disabled by force */
+ if ((cx & xsave_mask) != xsave_mask)
+ cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
}
static void xen_set_debugreg(int reg, unsigned long val)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index c82df6c9c0f0..55c965b38c27 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -565,13 +565,13 @@ pte_t xen_make_pte_debug(pteval_t pte)
if (io_page &&
(xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
- WARN(addr != other_addr,
+ WARN_ONCE(addr != other_addr,
"0x%lx is using VM_IO, but it is 0x%lx!\n",
(unsigned long)addr, (unsigned long)other_addr);
} else {
pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
other_addr = (_pte.pte & PTE_PFN_MASK);
- WARN((addr == other_addr) && (!io_page) && (!iomap_set),
+ WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set),
"0x%lx is missing VM_IO (and wasn't fixed)!\n",
(unsigned long)addr);
}
@@ -1463,6 +1463,119 @@ static int xen_pgd_alloc(struct mm_struct *mm)
return ret;
}
+#ifdef CONFIG_X86_64
+static __initdata u64 __last_pgt_set_rw = 0;
+static __initdata u64 __pgt_buf_start = 0;
+static __initdata u64 __pgt_buf_end = 0;
+static __initdata u64 __pgt_buf_top = 0;
+/*
+ * As a consequence of the commit:
+ *
+ * commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e
+ * Author: Yinghai Lu <yinghai@kernel.org>
+ * Date: Fri Dec 17 16:58:28 2010 -0800
+ *
+ * x86-64, mm: Put early page table high
+ *
+ * at some point init_memory_mapping is going to reach the pagetable pages
+ * area and map those pages too (mapping them as normal memory that falls
+ * in the range of addresses passed to init_memory_mapping as argument).
+ * Some of those pages are already pagetable pages (they are in the range
+ * pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and
+ * everything is fine.
+ * Some of these pages are not pagetable pages yet (they fall in the range
+ * pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they
+ * are going to be mapped RW. When these pages become pagetable pages and
+ * are hooked into the pagetable, xen will find that the guest has already
+ * a RW mapping of them somewhere and fail the operation.
+ * The reason Xen requires pagetables to be RO is that the hypervisor needs
+ * to verify that the pagetables are valid before using them. The validation
+ * operations are called "pinning".
+ *
+ * In order to fix the issue we mark all the pages in the entire range
+ * pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation
+ * is completed only the range pgt_buf_start-pgt_buf_end is reserved by
+ * init_memory_mapping. Hence the kernel is going to crash as soon as one
+ * of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those
+ * ranges are RO).
+ *
+ * For this reason, 'mark_rw_past_pgt' is introduced which is called _after_
+ * the init_memory_mapping has completed (in a perfect world we would
+ * call this function from init_memory_mapping, but lets ignore that).
+ *
+ * Because we are called _after_ init_memory_mapping the pgt_buf_[start,
+ * end,top] have all changed to new values (b/c init_memory_mapping
+ * is called and setting up another new page-table). Hence, the first time
+ * we enter this function, we save away the pgt_buf_start value and update
+ * the pgt_buf_[end,top].
+ *
+ * When we detect that the "old" pgt_buf_start through pgt_buf_end
+ * PFNs have been reserved (so memblock_x86_reserve_range has been called),
+ * we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top.
+ *
+ * And then we update those "old" pgt_buf_[end|top] with the new ones
+ * so that we can redo this on the next pagetable.
+ */
+static __init void mark_rw_past_pgt(void) {
+
+ if (pgt_buf_end > pgt_buf_start) {
+ u64 addr, size;
+
+ /* Save it away. */
+ if (!__pgt_buf_start) {
+ __pgt_buf_start = pgt_buf_start;
+ __pgt_buf_end = pgt_buf_end;
+ __pgt_buf_top = pgt_buf_top;
+ return;
+ }
+ /* If we get the range that starts at __pgt_buf_end that means
+ * the range is reserved, and that in 'init_memory_mapping'
+ * the 'memblock_x86_reserve_range' has been called with the
+ * outdated __pgt_buf_start, __pgt_buf_end (the "new"
+ * pgt_buf_[start|end|top] refer now to a new pagetable.
+ * Note: we are called _after_ the pgt_buf_[..] have been
+ * updated.*/
+
+ addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start),
+ &size, PAGE_SIZE);
+
+ /* Still not reserved, meaning 'memblock_x86_reserve_range'
+ * hasn't been called yet. Update the _end and _top.*/
+ if (addr == PFN_PHYS(__pgt_buf_start)) {
+ __pgt_buf_end = pgt_buf_end;
+ __pgt_buf_top = pgt_buf_top;
+ return;
+ }
+
+ /* OK, the area is reserved, meaning it is time for us to
+ * set RW for the old end->top PFNs. */
+
+ /* ..unless we had already done this. */
+ if (__pgt_buf_end == __last_pgt_set_rw)
+ return;
+
+ addr = PFN_PHYS(__pgt_buf_end);
+
+ /* set as RW the rest */
+ printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n",
+ PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top));
+
+ while (addr < PFN_PHYS(__pgt_buf_top)) {
+ make_lowmem_page_readwrite(__va(addr));
+ addr += PAGE_SIZE;
+ }
+ /* And update everything so that we are ready for the next
+ * pagetable (the one created for regions past 4GB) */
+ __last_pgt_set_rw = __pgt_buf_end;
+ __pgt_buf_start = pgt_buf_start;
+ __pgt_buf_end = pgt_buf_end;
+ __pgt_buf_top = pgt_buf_top;
+ }
+ return;
+}
+#else
+static __init void mark_rw_past_pgt(void) { }
+#endif
static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
#ifdef CONFIG_X86_64
@@ -1473,30 +1586,43 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
#endif
}
+#ifdef CONFIG_X86_32
static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
{
- unsigned long pfn = pte_pfn(pte);
-
-#ifdef CONFIG_X86_32
/* If there's an existing pte, then don't allow _PAGE_RW to be set */
if (pte_val_ma(*ptep) & _PAGE_PRESENT)
pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
pte_val_ma(pte));
-#endif
+
+ return pte;
+}
+#else /* CONFIG_X86_64 */
+static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+ unsigned long pfn = pte_pfn(pte);
/*
+ * A bit of optimization. We do not need to call the workaround
+ * when xen_set_pte_init is called with a PTE with 0 as PFN.
+ * That is b/c the pagetable at that point are just being populated
+ * with empty values and we can save some cycles by not calling
+ * the 'memblock' code.*/
+ if (pfn)
+ mark_rw_past_pgt();
+ /*
* If the new pfn is within the range of the newly allocated
* kernel pagetable, and it isn't being mapped into an
* early_ioremap fixmap slot as a freshly allocated page, make sure
* it is RO.
*/
if (((!is_early_ioremap_ptep(ptep) &&
- pfn >= pgt_buf_start && pfn < pgt_buf_end)) ||
+ pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
(is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
pte = pte_wrprotect(pte);
return pte;
}
+#endif /* CONFIG_X86_64 */
/* Init-time set_pte while constructing initial pagetables, which
doesn't allow RO pagetable pages to be remapped RW */
@@ -1992,6 +2118,8 @@ __init void xen_ident_map_ISA(void)
static __init void xen_post_allocator_init(void)
{
+ mark_rw_past_pgt();
+
#ifdef CONFIG_XEN_DEBUG
pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
#endif
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index fa0269a99377..90bac0aac3a5 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -227,7 +227,7 @@ char * __init xen_memory_setup(void)
memcpy(map_raw, map, sizeof(map));
e820.nr_map = 0;
- xen_extra_mem_start = mem_end;
+ xen_extra_mem_start = max((1ULL << 32), mem_end);
for (i = 0; i < memmap.nr_entries; i++) {
unsigned long long end;