Diffstat (limited to 'drivers/xen')
-rw-r--r--  drivers/xen/Kconfig  27
-rw-r--r--  drivers/xen/Makefile  9
-rw-r--r--  drivers/xen/balloon.c  62
-rw-r--r--  drivers/xen/biomerge.c  13
-rw-r--r--  drivers/xen/cpu_hotplug.c  3
-rw-r--r--  drivers/xen/events.c  784
-rw-r--r--  drivers/xen/evtchn.c  7
-rw-r--r--  drivers/xen/grant-table.c  79
-rw-r--r--  drivers/xen/manage.c  84
-rw-r--r--  drivers/xen/pci.c  117
-rw-r--r--  drivers/xen/platform-pci.c  207
-rw-r--r--  drivers/xen/swiotlb-xen.c  515
-rw-r--r--  drivers/xen/sys-hypervisor.c  4
-rw-r--r--  drivers/xen/xenbus/xenbus_client.c  93
-rw-r--r--  drivers/xen/xenbus/xenbus_probe.c  135
-rw-r--r--  drivers/xen/xenbus/xenbus_xs.c  59
-rw-r--r--  drivers/xen/xencomm.c  2
-rw-r--r--  drivers/xen/xenfs/Makefile  3
-rw-r--r--  drivers/xen/xenfs/privcmd.c  404
-rw-r--r--  drivers/xen/xenfs/super.c  108
-rw-r--r--  drivers/xen/xenfs/xenbus.c  6
-rw-r--r--  drivers/xen/xenfs/xenfs.h  3
-rw-r--r--  drivers/xen/xenfs/xenstored.c  68
23 files changed, 2543 insertions, 249 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index cab100acf983..6e6180ccd726 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -1,6 +1,8 @@
+menu "Xen driver support"
+ depends on XEN
+
config XEN_BALLOON
bool "Xen memory balloon driver"
- depends on XEN
default y
help
The balloon driver allows the Xen domain to request more memory from
@@ -20,7 +22,6 @@ config XEN_SCRUB_PAGES
config XEN_DEV_EVTCHN
tristate "Xen /dev/xen/evtchn device"
- depends on XEN
default y
help
The evtchn driver allows a userspace process to trigger event
@@ -30,7 +31,6 @@ config XEN_DEV_EVTCHN
config XENFS
tristate "Xen filesystem"
- depends on XEN
default y
help
The xen filesystem provides a way for domains to share
@@ -53,11 +53,28 @@ config XEN_COMPAT_XENFS
config XEN_SYS_HYPERVISOR
bool "Create xen entries under /sys/hypervisor"
- depends on XEN && SYSFS
+ depends on SYSFS
select SYS_HYPERVISOR
default y
help
Create entries under /sys/hypervisor describing the Xen
hypervisor environment. When running native or in another
virtual environment, /sys/hypervisor will still be present,
- but will have no xen contents.
\ No newline at end of file
+ but will have no xen contents.
+
+config XEN_PLATFORM_PCI
+ tristate "xen platform pci device driver"
+ depends on XEN_PVHVM
+ default m
+ help
+ Driver for the Xen PCI Platform device: it is responsible for
+ initializing xenbus and grant_table when running in a Xen HVM
+ domain. As a consequence this driver is required to run any Xen PV
+ frontend on Xen HVM.
+
+config SWIOTLB_XEN
+ def_bool y
+ depends on PCI
+ select SWIOTLB
+
+endmenu
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index ec2a39b1e26f..eb8a78d77d9d 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -1,9 +1,16 @@
obj-y += grant-table.o features.o events.o manage.o
obj-y += xenbus/
+nostackp := $(call cc-option, -fno-stack-protector)
+CFLAGS_features.o := $(nostackp)
+
+obj-$(CONFIG_BLOCK) += biomerge.o
obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o
obj-$(CONFIG_XEN_XENCOMM) += xencomm.o
obj-$(CONFIG_XEN_BALLOON) += balloon.o
obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o
obj-$(CONFIG_XENFS) += xenfs/
-obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
\ No newline at end of file
+obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
+obj-$(CONFIG_XEN_PLATFORM_PCI) += platform-pci.o
+obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o
+obj-$(CONFIG_XEN_DOM0) += pci.o
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index f5bbd9e83416..500290b150bb 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -43,6 +43,7 @@
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/sysdev.h>
+#include <linux/gfp.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
@@ -52,6 +53,8 @@
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
#include <xen/interface/xen.h>
#include <xen/interface/memory.h>
#include <xen/xenbus.h>
@@ -66,8 +69,6 @@ struct balloon_stats {
/* We aim for 'current allocation' == 'target allocation'. */
unsigned long current_pages;
unsigned long target_pages;
- /* We may hit the hard limit in Xen. If we do then we remember it. */
- unsigned long hard_limit;
/*
* Drivers may alter the memory reservation independently, but they
* must inform the balloon driver so we avoid hitting the hard limit.
@@ -84,23 +85,12 @@ static struct sys_device balloon_sysdev;
static int register_balloon(struct sys_device *sysdev);
-/*
- * Protects atomic reservation decrease/increase against concurrent increases.
- * Also protects non-atomic updates of current_pages and driver_pages, and
- * balloon lists.
- */
-static DEFINE_SPINLOCK(balloon_lock);
-
static struct balloon_stats balloon_stats;
/* We increase/decrease in batches which fit in a page */
static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)];
-/* VM /proc information for memory */
-extern unsigned long totalram_pages;
-
#ifdef CONFIG_HIGHMEM
-extern unsigned long totalhigh_pages;
#define inc_totalhigh_pages() (totalhigh_pages++)
#define dec_totalhigh_pages() (totalhigh_pages--)
#else
@@ -140,6 +130,8 @@ static void balloon_append(struct page *page)
list_add(&page->lru, &ballooned_pages);
balloon_stats.balloon_low++;
}
+
+ totalram_pages--;
}
/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
@@ -160,6 +152,8 @@ static struct page *balloon_retrieve(void)
else
balloon_stats.balloon_low--;
+ totalram_pages++;
+
return page;
}
@@ -185,7 +179,7 @@ static void balloon_alarm(unsigned long unused)
static unsigned long current_target(void)
{
- unsigned long target = min(balloon_stats.target_pages, balloon_stats.hard_limit);
+ unsigned long target = balloon_stats.target_pages;
target = min(target,
balloon_stats.current_pages +
@@ -209,35 +203,22 @@ static int increase_reservation(unsigned long nr_pages)
if (nr_pages > ARRAY_SIZE(frame_list))
nr_pages = ARRAY_SIZE(frame_list);
- spin_lock_irqsave(&balloon_lock, flags);
+ spin_lock_irqsave(&xen_reservation_lock, flags);
page = balloon_first_page();
for (i = 0; i < nr_pages; i++) {
BUG_ON(page == NULL);
- frame_list[i] = page_to_pfn(page);;
+ frame_list[i] = page_to_pfn(page);
page = balloon_next_page(page);
}
set_xen_guest_handle(reservation.extent_start, frame_list);
reservation.nr_extents = nr_pages;
rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
- if (rc < nr_pages) {
- if (rc > 0) {
- int ret;
-
- /* We hit the Xen hard limit: reprobe. */
- reservation.nr_extents = rc;
- ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
- &reservation);
- BUG_ON(ret != rc);
- }
- if (rc >= 0)
- balloon_stats.hard_limit = (balloon_stats.current_pages + rc -
- balloon_stats.driver_pages);
+ if (rc < 0)
goto out;
- }
- for (i = 0; i < nr_pages; i++) {
+ for (i = 0; i < rc; i++) {
page = balloon_retrieve();
BUG_ON(page == NULL);
@@ -263,13 +244,12 @@ static int increase_reservation(unsigned long nr_pages)
__free_page(page);
}
- balloon_stats.current_pages += nr_pages;
- totalram_pages = balloon_stats.current_pages;
+ balloon_stats.current_pages += rc;
out:
- spin_unlock_irqrestore(&balloon_lock, flags);
+ spin_unlock_irqrestore(&xen_reservation_lock, flags);
- return 0;
+ return rc < 0 ? rc : rc != nr_pages;
}
static int decrease_reservation(unsigned long nr_pages)
@@ -312,7 +292,7 @@ static int decrease_reservation(unsigned long nr_pages)
kmap_flush_unused();
flush_tlb_all();
- spin_lock_irqsave(&balloon_lock, flags);
+ spin_lock_irqsave(&xen_reservation_lock, flags);
/* No more mappings: invalidate P2M and add to balloon. */
for (i = 0; i < nr_pages; i++) {
@@ -327,9 +307,8 @@ static int decrease_reservation(unsigned long nr_pages)
BUG_ON(ret != nr_pages);
balloon_stats.current_pages -= nr_pages;
- totalram_pages = balloon_stats.current_pages;
- spin_unlock_irqrestore(&balloon_lock, flags);
+ spin_unlock_irqrestore(&xen_reservation_lock, flags);
return need_sleep;
}
@@ -371,7 +350,6 @@ static void balloon_process(struct work_struct *work)
static void balloon_set_new_target(unsigned long target)
{
/* No need for lock. Not read-modify-write updates. */
- balloon_stats.hard_limit = ~0UL;
balloon_stats.target_pages = target;
schedule_work(&balloon_worker);
}
@@ -426,12 +404,10 @@ static int __init balloon_init(void)
pr_info("xen_balloon: Initialising balloon driver.\n");
balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn);
- totalram_pages = balloon_stats.current_pages;
balloon_stats.target_pages = balloon_stats.current_pages;
balloon_stats.balloon_low = 0;
balloon_stats.balloon_high = 0;
balloon_stats.driver_pages = 0UL;
- balloon_stats.hard_limit = ~0UL;
init_timer(&balloon_timer);
balloon_timer.data = 0;
@@ -476,9 +452,6 @@ module_exit(balloon_exit);
BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages));
BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low));
BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high));
-BALLOON_SHOW(hard_limit_kb,
- (balloon_stats.hard_limit!=~0UL) ? "%lu\n" : "???\n",
- (balloon_stats.hard_limit!=~0UL) ? PAGES2KB(balloon_stats.hard_limit) : 0);
BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(balloon_stats.driver_pages));
static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr,
@@ -548,7 +521,6 @@ static struct attribute *balloon_info_attrs[] = {
&attr_current_kb.attr,
&attr_low_kb.attr,
&attr_high_kb.attr,
- &attr_hard_limit_kb.attr,
&attr_driver_kb.attr,
NULL
};
diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c
new file mode 100644
index 000000000000..ba6eda4b5143
--- /dev/null
+++ b/drivers/xen/biomerge.c
@@ -0,0 +1,13 @@
+#include <linux/bio.h>
+#include <linux/io.h>
+#include <xen/page.h>
+
+bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
+ const struct bio_vec *vec2)
+{
+ unsigned long mfn1 = pfn_to_mfn(page_to_pfn(vec1->bv_page));
+ unsigned long mfn2 = pfn_to_mfn(page_to_pfn(vec2->bv_page));
+
+ return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&
+ ((mfn1 == mfn2) || ((mfn1+1) == mfn2));
+}
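The helper above only supplies the Xen-specific predicate; the block layer consults it through the per-architecture BIOVEC_PHYS_MERGEABLE() macro, which is wired up outside this diff. A minimal sketch of the expected hook, assuming the override sits in the x86 io.h header as elsewhere in this series:

#ifdef CONFIG_XEN
/* Merge two bio_vecs only if they are contiguous both in pseudo-physical
 * (PFN) and machine (MFN) address space; outside Xen, fall back to the
 * plain physical check. */
#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)				\
	(__BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&				\
	 (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
#endif

The repeated __BIOVEC_PHYS_MERGEABLE() test is harmless: the generic check is cheap and keeps the non-Xen path unchanged.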
diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
index bdfd584ad853..14e2d995e958 100644
--- a/drivers/xen/cpu_hotplug.c
+++ b/drivers/xen/cpu_hotplug.c
@@ -1,5 +1,6 @@
#include <linux/notifier.h>
+#include <xen/xen.h>
#include <xen/xenbus.h>
#include <asm/xen/hypervisor.h>
@@ -86,7 +87,7 @@ static int setup_cpu_watcher(struct notifier_block *notifier,
for_each_possible_cpu(cpu) {
if (vcpu_online(cpu) == 0) {
(void)cpu_down(cpu);
- cpu_clear(cpu, cpu_present_map);
+ set_cpu_present(cpu, false);
}
}
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index abad71b1632b..321a0c8346e5 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -16,7 +16,7 @@
* (typically dom0).
* 2. VIRQs, typically used for timers. These are per-cpu events.
* 3. IPIs.
- * 4. Hardware interrupts. Not supported at present.
+ * 4. PIRQs - Hardware interrupts.
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
@@ -27,18 +27,28 @@
#include <linux/module.h>
#include <linux/string.h>
#include <linux/bootmem.h>
+#include <linux/slab.h>
+#include <linux/irqnr.h>
+#include <linux/pci.h>
+#include <asm/desc.h>
#include <asm/ptrace.h>
#include <asm/irq.h>
#include <asm/idle.h>
+#include <asm/io_apic.h>
#include <asm/sync_bitops.h>
+#include <asm/xen/pci.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
+#include <xen/xen.h>
+#include <xen/hvm.h>
#include <xen/xen-ops.h>
#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/event_channel.h>
+#include <xen/interface/hvm/hvm_op.h>
+#include <xen/interface/hvm/params.h>
/*
* This lock protects updates to the following mapping and reference-count
@@ -47,10 +57,10 @@
static DEFINE_SPINLOCK(irq_mapping_update_lock);
/* IRQ <-> VIRQ mapping. */
-static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
+static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1};
/* IRQ <-> IPI mapping */
-static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
+static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1};
/* Interrupt types. */
enum xen_irq_type {
@@ -67,7 +77,8 @@ enum xen_irq_type {
* event channel - irq->event channel mapping
* cpu - cpu this event channel is bound to
* index - type-specific information:
- * PIRQ - vector, with MSB being "needs EIO"
+ * PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM
+ * guest, or GSI (real passthrough IRQ) of the device.
* VIRQ - virq number
* IPI - IPI vector
* EVTCHN -
@@ -82,21 +93,30 @@ struct irq_info
unsigned short virq;
enum ipi_vector ipi;
struct {
+ unsigned short pirq;
unsigned short gsi;
- unsigned short vector;
+ unsigned char vector;
+ unsigned char flags;
} pirq;
} u;
};
+#define PIRQ_NEEDS_EOI (1 << 0)
+#define PIRQ_SHAREABLE (1 << 1)
-static struct irq_info irq_info[NR_IRQS];
+static struct irq_info *irq_info;
+static int *pirq_to_irq;
+static int nr_pirqs;
-static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
- [0 ... NR_EVENT_CHANNELS-1] = -1
-};
+static int *evtchn_to_irq;
struct cpu_evtchn_s {
unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG];
};
-static struct cpu_evtchn_s *cpu_evtchn_mask_p;
+
+static __initdata struct cpu_evtchn_s init_evtchn_mask = {
+ .bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul,
+};
+static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask;
+
static inline unsigned long *cpu_evtchn_mask(int cpu)
{
return cpu_evtchn_mask_p[cpu].bits;
@@ -106,6 +126,8 @@ static inline unsigned long *cpu_evtchn_mask(int cpu)
#define VALID_EVTCHN(chn) ((chn) != 0)
static struct irq_chip xen_dynamic_chip;
+static struct irq_chip xen_percpu_chip;
+static struct irq_chip xen_pirq_chip;
/* Constructor for packed IRQ information. */
static struct irq_info mk_unbound_info(void)
@@ -131,11 +153,12 @@ static struct irq_info mk_virq_info(unsigned short evtchn, unsigned short virq)
.cpu = 0, .u.virq = virq };
}
-static struct irq_info mk_pirq_info(unsigned short evtchn,
+static struct irq_info mk_pirq_info(unsigned short evtchn, unsigned short pirq,
unsigned short gsi, unsigned short vector)
{
return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn,
- .cpu = 0, .u.pirq = { .gsi = gsi, .vector = vector } };
+ .cpu = 0,
+ .u.pirq = { .pirq = pirq, .gsi = gsi, .vector = vector } };
}
/*
@@ -177,6 +200,16 @@ static unsigned virq_from_irq(unsigned irq)
return info->u.virq;
}
+static unsigned pirq_from_irq(unsigned irq)
+{
+ struct irq_info *info = info_for_irq(irq);
+
+ BUG_ON(info == NULL);
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ return info->u.pirq.pirq;
+}
+
static unsigned gsi_from_irq(unsigned irq)
{
struct irq_info *info = info_for_irq(irq);
@@ -218,6 +251,15 @@ static unsigned int cpu_from_evtchn(unsigned int evtchn)
return ret;
}
+static bool pirq_needs_eoi(unsigned irq)
+{
+ struct irq_info *info = info_for_irq(irq);
+
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ return info->u.pirq.flags & PIRQ_NEEDS_EOI;
+}
+
static inline unsigned long active_evtchns(unsigned int cpu,
struct shared_info *sh,
unsigned int idx)
@@ -254,7 +296,7 @@ static void init_evtchn_cpu_bindings(void)
}
#endif
- memset(cpu_evtchn_mask(0), ~0, sizeof(cpu_evtchn_mask(0)));
+ memset(cpu_evtchn_mask(0), ~0, sizeof(struct cpu_evtchn_s));
}
static inline void clear_evtchn(int port)
@@ -329,26 +371,413 @@ static void unmask_evtchn(int port)
put_cpu();
}
-static int find_unbound_irq(void)
+static int get_nr_hw_irqs(void)
{
- int irq;
- struct irq_desc *desc;
+ int ret = 1;
- for (irq = 0; irq < nr_irqs; irq++)
- if (irq_info[irq].type == IRQT_UNBOUND)
+#ifdef CONFIG_X86_IO_APIC
+ ret = get_nr_irqs_gsi();
+#endif
+
+ return ret;
+}
+
+/* callers of this function should make sure that PHYSDEVOP_get_nr_pirqs
+ * succeeded otherwise nr_pirqs won't hold the right value */
+static int find_unbound_pirq(void)
+{
+ int i;
+ for (i = nr_pirqs-1; i >= 0; i--) {
+ if (pirq_to_irq[i] < 0)
+ return i;
+ }
+ return -1;
+}
+
+static int find_unbound_irq(void)
+{
+ struct irq_data *data;
+ int irq, res;
+ int start = get_nr_hw_irqs();
+
+ if (start == nr_irqs)
+ goto no_irqs;
+
+ /* nr_irqs is a magic value. Must not use it.*/
+ for (irq = nr_irqs-1; irq > start; irq--) {
+ data = irq_get_irq_data(irq);
+ /* only 0->15 have init'd desc; handle irq > 16 */
+ if (!data)
break;
+ if (data->chip == &no_irq_chip)
+ break;
+ if (data->chip != &xen_dynamic_chip)
+ continue;
+ if (irq_info[irq].type == IRQT_UNBOUND)
+ return irq;
+ }
+
+ if (irq == start)
+ goto no_irqs;
- if (irq == nr_irqs)
- panic("No available IRQ to bind to: increase nr_irqs!\n");
+ res = irq_alloc_desc_at(irq, 0);
- desc = irq_to_desc_alloc_node(irq, 0);
- if (WARN_ON(desc == NULL))
+ if (WARN_ON(res != irq))
return -1;
- dynamic_irq_init(irq);
+ return irq;
+
+no_irqs:
+ panic("No available IRQ to bind to: increase nr_irqs!\n");
+}
+
+static bool identity_mapped_irq(unsigned irq)
+{
+ /* identity map all the hardware irqs */
+ return irq < get_nr_hw_irqs();
+}
+
+static void pirq_unmask_notify(int irq)
+{
+ struct physdev_eoi eoi = { .irq = pirq_from_irq(irq) };
+
+ if (unlikely(pirq_needs_eoi(irq))) {
+ int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
+ WARN_ON(rc);
+ }
+}
+
+static void pirq_query_unmask(int irq)
+{
+ struct physdev_irq_status_query irq_status;
+ struct irq_info *info = info_for_irq(irq);
+
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ irq_status.irq = pirq_from_irq(irq);
+ if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status))
+ irq_status.flags = 0;
+
+ info->u.pirq.flags &= ~PIRQ_NEEDS_EOI;
+ if (irq_status.flags & XENIRQSTAT_needs_eoi)
+ info->u.pirq.flags |= PIRQ_NEEDS_EOI;
+}
+
+static bool probing_irq(int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ return desc && desc->action == NULL;
+}
+
+static unsigned int startup_pirq(unsigned int irq)
+{
+ struct evtchn_bind_pirq bind_pirq;
+ struct irq_info *info = info_for_irq(irq);
+ int evtchn = evtchn_from_irq(irq);
+ int rc;
+
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ if (VALID_EVTCHN(evtchn))
+ goto out;
+
+ bind_pirq.pirq = pirq_from_irq(irq);
+ /* NB. We are happy to share unless we are probing. */
+ bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ?
+ BIND_PIRQ__WILL_SHARE : 0;
+ rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
+ if (rc != 0) {
+ if (!probing_irq(irq))
+ printk(KERN_INFO "Failed to obtain physical IRQ %d\n",
+ irq);
+ return 0;
+ }
+ evtchn = bind_pirq.port;
+
+ pirq_query_unmask(irq);
+
+ evtchn_to_irq[evtchn] = irq;
+ bind_evtchn_to_cpu(evtchn, 0);
+ info->evtchn = evtchn;
+
+out:
+ unmask_evtchn(evtchn);
+ pirq_unmask_notify(irq);
+
+ return 0;
+}
+
+static void shutdown_pirq(unsigned int irq)
+{
+ struct evtchn_close close;
+ struct irq_info *info = info_for_irq(irq);
+ int evtchn = evtchn_from_irq(irq);
+
+ BUG_ON(info->type != IRQT_PIRQ);
+
+ if (!VALID_EVTCHN(evtchn))
+ return;
+
+ mask_evtchn(evtchn);
+
+ close.port = evtchn;
+ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
+ BUG();
+
+ bind_evtchn_to_cpu(evtchn, 0);
+ evtchn_to_irq[evtchn] = -1;
+ info->evtchn = 0;
+}
+
+static void enable_pirq(unsigned int irq)
+{
+ startup_pirq(irq);
+}
+
+static void disable_pirq(unsigned int irq)
+{
+}
+
+static void ack_pirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+
+ move_native_irq(irq);
+
+ if (VALID_EVTCHN(evtchn)) {
+ mask_evtchn(evtchn);
+ clear_evtchn(evtchn);
+ }
+}
+
+static void end_pirq(unsigned int irq)
+{
+ int evtchn = evtchn_from_irq(irq);
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (WARN_ON(!desc))
+ return;
+
+ if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) ==
+ (IRQ_DISABLED|IRQ_PENDING)) {
+ shutdown_pirq(irq);
+ } else if (VALID_EVTCHN(evtchn)) {
+ unmask_evtchn(evtchn);
+ pirq_unmask_notify(irq);
+ }
+}
+
+static int find_irq_by_gsi(unsigned gsi)
+{
+ int irq;
+
+ for (irq = 0; irq < nr_irqs; irq++) {
+ struct irq_info *info = info_for_irq(irq);
+
+ if (info == NULL || info->type != IRQT_PIRQ)
+ continue;
+
+ if (gsi_from_irq(irq) == gsi)
+ return irq;
+ }
+
+ return -1;
+}
+
+int xen_allocate_pirq(unsigned gsi, int shareable, char *name)
+{
+ return xen_map_pirq_gsi(gsi, gsi, shareable, name);
+}
+
+/* xen_map_pirq_gsi might allocate irqs from the top down, as a
+ * consequence don't assume that the irq number returned has a low value
+ * or can be used as a pirq number unless you know otherwise.
+ *
+ * One notable exception is when xen_map_pirq_gsi is called passing a
+ * hardware gsi as argument, in that case the irq number returned
+ * matches the gsi number passed as second argument.
+ *
+ * Note: We don't assign an event channel until the irq actually started
+ * up. Return an existing irq if we've already got one for the gsi.
+ */
+int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name)
+{
+ int irq = 0;
+ struct physdev_irq irq_op;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ if ((pirq > nr_pirqs) || (gsi > nr_irqs)) {
+ printk(KERN_WARNING "xen_map_pirq_gsi: %s %s is incorrect!\n",
+ pirq > nr_pirqs ? "nr_pirqs" :"",
+ gsi > nr_irqs ? "nr_irqs" : "");
+ goto out;
+ }
+
+ irq = find_irq_by_gsi(gsi);
+ if (irq != -1) {
+ printk(KERN_INFO "xen_map_pirq_gsi: returning irq %d for gsi %u\n",
+ irq, gsi);
+ goto out; /* XXX need refcount? */
+ }
+ /* If we are a PV guest, we don't have GSIs (no ACPI passed). Therefore
+ * we are using the !xen_initial_domain() to drop in the function.*/
+ if (identity_mapped_irq(gsi) || (!xen_initial_domain() &&
+ xen_pv_domain())) {
+ irq = gsi;
+ irq_alloc_desc_at(irq, 0);
+ } else
+ irq = find_unbound_irq();
+
+ set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
+ handle_level_irq, name);
+
+ irq_op.irq = irq;
+ irq_op.vector = 0;
+
+ /* Only the privileged domain can do this. For non-priv, the pcifront
+ * driver provides a PCI bus that does the call to do exactly
+ * this in the priv domain. */
+ if (xen_initial_domain() &&
+ HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) {
+ irq_free_desc(irq);
+ irq = -ENOSPC;
+ goto out;
+ }
+
+ irq_info[irq] = mk_pirq_info(0, pirq, gsi, irq_op.vector);
+ irq_info[irq].u.pirq.flags |= shareable ? PIRQ_SHAREABLE : 0;
+ pirq_to_irq[pirq] = irq;
+
+out:
+ spin_unlock(&irq_mapping_update_lock);
+
+ return irq;
+}
+
+#ifdef CONFIG_PCI_MSI
+#include <linux/msi.h>
+#include "../pci/msi.h"
+
+void xen_allocate_pirq_msi(char *name, int *irq, int *pirq)
+{
+ spin_lock(&irq_mapping_update_lock);
+
+ *irq = find_unbound_irq();
+ if (*irq == -1)
+ goto out;
+
+ *pirq = find_unbound_pirq();
+ if (*pirq == -1)
+ goto out;
+
+ set_irq_chip_and_handler_name(*irq, &xen_pirq_chip,
+ handle_level_irq, name);
+
+ irq_info[*irq] = mk_pirq_info(0, *pirq, 0, 0);
+ pirq_to_irq[*pirq] = *irq;
+
+out:
+ spin_unlock(&irq_mapping_update_lock);
+}
+
+int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type)
+{
+ int irq = -1;
+ struct physdev_map_pirq map_irq;
+ int rc;
+ int pos;
+ u32 table_offset, bir;
+
+ memset(&map_irq, 0, sizeof(map_irq));
+ map_irq.domid = DOMID_SELF;
+ map_irq.type = MAP_PIRQ_TYPE_MSI;
+ map_irq.index = -1;
+ map_irq.pirq = -1;
+ map_irq.bus = dev->bus->number;
+ map_irq.devfn = dev->devfn;
+
+ if (type == PCI_CAP_ID_MSIX) {
+ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
+
+ pci_read_config_dword(dev, msix_table_offset_reg(pos),
+ &table_offset);
+ bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
+
+ map_irq.table_base = pci_resource_start(dev, bir);
+ map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
+ }
+
+ spin_lock(&irq_mapping_update_lock);
+
+ irq = find_unbound_irq();
+
+ if (irq == -1)
+ goto out;
+
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
+ if (rc) {
+ printk(KERN_WARNING "xen map irq failed %d\n", rc);
+
+ irq_free_desc(irq);
+
+ irq = -1;
+ goto out;
+ }
+ irq_info[irq] = mk_pirq_info(0, map_irq.pirq, 0, map_irq.index);
+
+ set_irq_chip_and_handler_name(irq, &xen_pirq_chip,
+ handle_level_irq,
+ (type == PCI_CAP_ID_MSIX) ? "msi-x":"msi");
+
+out:
+ spin_unlock(&irq_mapping_update_lock);
return irq;
}
+#endif
+
+int xen_destroy_irq(int irq)
+{
+ struct irq_desc *desc;
+ struct physdev_unmap_pirq unmap_irq;
+ struct irq_info *info = info_for_irq(irq);
+ int rc = -ENOENT;
+
+ spin_lock(&irq_mapping_update_lock);
+
+ desc = irq_to_desc(irq);
+ if (!desc)
+ goto out;
+
+ if (xen_initial_domain()) {
+ unmap_irq.pirq = info->u.pirq.gsi;
+ unmap_irq.domid = DOMID_SELF;
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq);
+ if (rc) {
+ printk(KERN_WARNING "unmap irq failed %d\n", rc);
+ goto out;
+ }
+ }
+ irq_info[irq] = mk_unbound_info();
+
+ irq_free_desc(irq);
+
+out:
+ spin_unlock(&irq_mapping_update_lock);
+ return rc;
+}
+
+int xen_vector_from_irq(unsigned irq)
+{
+ return vector_from_irq(irq);
+}
+
+int xen_gsi_from_irq(unsigned irq)
+{
+ return gsi_from_irq(irq);
+}
int bind_evtchn_to_irq(unsigned int evtchn)
{
@@ -362,7 +791,7 @@ int bind_evtchn_to_irq(unsigned int evtchn)
irq = find_unbound_irq();
set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
- handle_level_irq, "event");
+ handle_fasteoi_irq, "event");
evtchn_to_irq[evtchn] = irq;
irq_info[irq] = mk_evtchn_info(evtchn);
@@ -388,8 +817,8 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
if (irq < 0)
goto out;
- set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
- handle_level_irq, "ipi");
+ set_irq_chip_and_handler_name(irq, &xen_percpu_chip,
+ handle_percpu_irq, "ipi");
bind_ipi.vcpu = cpu;
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
@@ -410,7 +839,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
}
-static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
{
struct evtchn_bind_virq bind_virq;
int evtchn, irq;
@@ -420,6 +849,11 @@ static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
irq = per_cpu(virq_to_irq, cpu)[virq];
if (irq == -1) {
+ irq = find_unbound_irq();
+
+ set_irq_chip_and_handler_name(irq, &xen_percpu_chip,
+ handle_percpu_irq, "virq");
+
bind_virq.virq = virq;
bind_virq.vcpu = cpu;
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
@@ -427,11 +861,6 @@ static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
BUG();
evtchn = bind_virq.port;
- irq = find_unbound_irq();
-
- set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
- handle_level_irq, "virq");
-
evtchn_to_irq[evtchn] = irq;
irq_info[irq] = mk_virq_info(evtchn, virq);
@@ -474,9 +903,12 @@ static void unbind_from_irq(unsigned int irq)
bind_evtchn_to_cpu(evtchn, 0);
evtchn_to_irq[evtchn] = -1;
+ }
+
+ if (irq_info[irq].type != IRQT_UNBOUND) {
irq_info[irq] = mk_unbound_info();
- dynamic_irq_cleanup(irq);
+ irq_free_desc(irq);
}
spin_unlock(&irq_mapping_update_lock);
@@ -532,6 +964,7 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi,
if (irq < 0)
return irq;
+ irqflags |= IRQF_NO_SUSPEND;
retval = request_irq(irq, handler, irqflags, devname, dev_id);
if (retval != 0) {
unbind_from_irq(irq);
@@ -559,41 +992,75 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
{
struct shared_info *sh = HYPERVISOR_shared_info;
int cpu = smp_processor_id();
+ unsigned long *cpu_evtchn = cpu_evtchn_mask(cpu);
int i;
unsigned long flags;
static DEFINE_SPINLOCK(debug_lock);
+ struct vcpu_info *v;
spin_lock_irqsave(&debug_lock, flags);
- printk("vcpu %d\n ", cpu);
+ printk("\nvcpu %d\n ", cpu);
for_each_online_cpu(i) {
- struct vcpu_info *v = per_cpu(xen_vcpu, i);
- printk("%d: masked=%d pending=%d event_sel %08lx\n ", i,
- (get_irq_regs() && i == cpu) ? xen_irqs_disabled(get_irq_regs()) : v->evtchn_upcall_mask,
- v->evtchn_upcall_pending,
- v->evtchn_pending_sel);
- }
- printk("pending:\n ");
- for(i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
- printk("%08lx%s", sh->evtchn_pending[i],
- i % 8 == 0 ? "\n " : " ");
- printk("\nmasks:\n ");
- for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
- printk("%08lx%s", sh->evtchn_mask[i],
- i % 8 == 0 ? "\n " : " ");
-
- printk("\nunmasked:\n ");
- for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
- printk("%08lx%s", sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
- i % 8 == 0 ? "\n " : " ");
+ int pending;
+ v = per_cpu(xen_vcpu, i);
+ pending = (get_irq_regs() && i == cpu)
+ ? xen_irqs_disabled(get_irq_regs())
+ : v->evtchn_upcall_mask;
+ printk("%d: masked=%d pending=%d event_sel %0*lx\n ", i,
+ pending, v->evtchn_upcall_pending,
+ (int)(sizeof(v->evtchn_pending_sel)*2),
+ v->evtchn_pending_sel);
+ }
+ v = per_cpu(xen_vcpu, cpu);
+
+ printk("\npending:\n ");
+ for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
+ printk("%0*lx%s", (int)sizeof(sh->evtchn_pending[0])*2,
+ sh->evtchn_pending[i],
+ i % 8 == 0 ? "\n " : " ");
+ printk("\nglobal mask:\n ");
+ for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+ printk("%0*lx%s",
+ (int)(sizeof(sh->evtchn_mask[0])*2),
+ sh->evtchn_mask[i],
+ i % 8 == 0 ? "\n " : " ");
+
+ printk("\nglobally unmasked:\n ");
+ for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+ printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2),
+ sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
+ i % 8 == 0 ? "\n " : " ");
+
+ printk("\nlocal cpu%d mask:\n ", cpu);
+ for (i = (NR_EVENT_CHANNELS/BITS_PER_LONG)-1; i >= 0; i--)
+ printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2),
+ cpu_evtchn[i],
+ i % 8 == 0 ? "\n " : " ");
+
+ printk("\nlocally unmasked:\n ");
+ for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) {
+ unsigned long pending = sh->evtchn_pending[i]
+ & ~sh->evtchn_mask[i]
+ & cpu_evtchn[i];
+ printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2),
+ pending, i % 8 == 0 ? "\n " : " ");
+ }
printk("\npending list:\n");
- for(i = 0; i < NR_EVENT_CHANNELS; i++) {
+ for (i = 0; i < NR_EVENT_CHANNELS; i++) {
if (sync_test_bit(i, sh->evtchn_pending)) {
- printk(" %d: event %d -> irq %d\n",
+ int word_idx = i / BITS_PER_LONG;
+ printk(" %d: event %d -> irq %d%s%s%s\n",
cpu_from_evtchn(i), i,
- evtchn_to_irq[i]);
+ evtchn_to_irq[i],
+ sync_test_bit(word_idx, &v->evtchn_pending_sel)
+ ? "" : " l2-clear",
+ !sync_test_bit(i, sh->evtchn_mask)
+ ? "" : " globally-masked",
+ sync_test_bit(i, cpu_evtchn)
+ ? "" : " locally-masked");
}
}
@@ -602,6 +1069,8 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
return IRQ_HANDLED;
}
+static DEFINE_PER_CPU(unsigned, xed_nesting_count);
+
/*
* Search the CPUs pending events bitmasks. For each one found, map
* the event number to an irq, and feed it into do_IRQ() for
@@ -611,24 +1080,19 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
* a bitset of words which contain pending event bits. The second
* level is a bitset of pending events themselves.
*/
-void xen_evtchn_do_upcall(struct pt_regs *regs)
+static void __xen_evtchn_do_upcall(void)
{
int cpu = get_cpu();
- struct pt_regs *old_regs = set_irq_regs(regs);
struct shared_info *s = HYPERVISOR_shared_info;
struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
- static DEFINE_PER_CPU(unsigned, nesting_count);
unsigned count;
- exit_idle();
- irq_enter();
-
do {
unsigned long pending_words;
vcpu_info->evtchn_upcall_pending = 0;
- if (__get_cpu_var(nesting_count)++)
+ if (__get_cpu_var(xed_nesting_count)++)
goto out;
#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
@@ -645,24 +1109,48 @@ void xen_evtchn_do_upcall(struct pt_regs *regs)
int bit_idx = __ffs(pending_bits);
int port = (word_idx * BITS_PER_LONG) + bit_idx;
int irq = evtchn_to_irq[port];
+ struct irq_desc *desc;
+
+ mask_evtchn(port);
+ clear_evtchn(port);
- if (irq != -1)
- handle_irq(irq, regs);
+ if (irq != -1) {
+ desc = irq_to_desc(irq);
+ if (desc)
+ generic_handle_irq_desc(irq, desc);
+ }
}
}
BUG_ON(!irqs_disabled());
- count = __get_cpu_var(nesting_count);
- __get_cpu_var(nesting_count) = 0;
- } while(count != 1);
+ count = __get_cpu_var(xed_nesting_count);
+ __get_cpu_var(xed_nesting_count) = 0;
+ } while (count != 1 || vcpu_info->evtchn_upcall_pending);
out:
+
+ put_cpu();
+}
+
+void xen_evtchn_do_upcall(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ exit_idle();
+ irq_enter();
+
+ __xen_evtchn_do_upcall();
+
irq_exit();
set_irq_regs(old_regs);
+}
- put_cpu();
+void xen_hvm_evtchn_do_upcall(void)
+{
+ __xen_evtchn_do_upcall();
}
+EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall);
/* Rebind a new event channel to an existing irq. */
void rebind_evtchn_irq(int evtchn, int irq)
@@ -699,7 +1187,10 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
struct evtchn_bind_vcpu bind_vcpu;
int evtchn = evtchn_from_irq(irq);
- if (!VALID_EVTCHN(evtchn))
+ /* events delivered via platform PCI interrupts are always
+ * routed to vcpu 0 */
+ if (!VALID_EVTCHN(evtchn) ||
+ (xen_hvm_domain() && !xen_have_vector_callback))
return -1;
/* Send future instances of this interrupt to other vcpu. */
@@ -760,10 +1251,10 @@ static void ack_dynirq(unsigned int irq)
{
int evtchn = evtchn_from_irq(irq);
- move_native_irq(irq);
+ move_masked_irq(irq);
if (VALID_EVTCHN(evtchn))
- clear_evtchn(evtchn);
+ unmask_evtchn(evtchn);
}
static int retrigger_dynirq(unsigned int irq)
@@ -808,9 +1299,6 @@ static void restore_cpu_virqs(unsigned int cpu)
evtchn_to_irq[evtchn] = irq;
irq_info[irq] = mk_virq_info(evtchn, virq);
bind_evtchn_to_cpu(evtchn, cpu);
-
- /* Ready for use. */
- unmask_evtchn(evtchn);
}
}
@@ -836,10 +1324,6 @@ static void restore_cpu_ipis(unsigned int cpu)
evtchn_to_irq[evtchn] = irq;
irq_info[irq] = mk_ipi_info(evtchn, ipi);
bind_evtchn_to_cpu(evtchn, cpu);
-
- /* Ready for use. */
- unmask_evtchn(evtchn);
-
}
}
@@ -851,7 +1335,7 @@ void xen_clear_irq_pending(int irq)
if (VALID_EVTCHN(evtchn))
clear_evtchn(evtchn);
}
-
+EXPORT_SYMBOL(xen_clear_irq_pending);
void xen_set_irq_pending(int irq)
{
int evtchn = evtchn_from_irq(irq);
@@ -871,9 +1355,9 @@ bool xen_test_irq_pending(int irq)
return ret;
}
-/* Poll waiting for an irq to become pending. In the usual case, the
- irq will be disabled so it won't deliver an interrupt. */
-void xen_poll_irq(int irq)
+/* Poll waiting for an irq to become pending with timeout. In the usual case,
+ * the irq will be disabled so it won't deliver an interrupt. */
+void xen_poll_irq_timeout(int irq, u64 timeout)
{
evtchn_port_t evtchn = evtchn_from_irq(irq);
@@ -881,17 +1365,25 @@ void xen_poll_irq(int irq)
struct sched_poll poll;
poll.nr_ports = 1;
- poll.timeout = 0;
+ poll.timeout = timeout;
set_xen_guest_handle(poll.ports, &evtchn);
if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0)
BUG();
}
}
+EXPORT_SYMBOL(xen_poll_irq_timeout);
+/* Poll waiting for an irq to become pending. In the usual case, the
+ * irq will be disabled so it won't deliver an interrupt. */
+void xen_poll_irq(int irq)
+{
+ xen_poll_irq_timeout(irq, 0 /* no timeout */);
+}
void xen_irq_resume(void)
{
unsigned int cpu, irq, evtchn;
+ struct irq_desc *desc;
init_evtchn_cpu_bindings();
@@ -910,6 +1402,23 @@ void xen_irq_resume(void)
restore_cpu_virqs(cpu);
restore_cpu_ipis(cpu);
}
+
+ /*
+ * Unmask any IRQF_NO_SUSPEND IRQs which are enabled. These
+ * are not handled by the IRQ core.
+ */
+ for_each_irq_desc(irq, desc) {
+ if (!desc->action || !(desc->action->flags & IRQF_NO_SUSPEND))
+ continue;
+ if (desc->status & IRQ_DISABLED)
+ continue;
+
+ evtchn = evtchn_from_irq(irq);
+ if (evtchn == -1)
+ continue;
+
+ unmask_evtchn(evtchn);
+ }
}
static struct irq_chip xen_dynamic_chip __read_mostly = {
@@ -919,18 +1428,107 @@ static struct irq_chip xen_dynamic_chip __read_mostly = {
.mask = disable_dynirq,
.unmask = enable_dynirq,
- .ack = ack_dynirq,
+ .eoi = ack_dynirq,
.set_affinity = set_affinity_irq,
.retrigger = retrigger_dynirq,
};
+static struct irq_chip xen_pirq_chip __read_mostly = {
+ .name = "xen-pirq",
+
+ .startup = startup_pirq,
+ .shutdown = shutdown_pirq,
+
+ .enable = enable_pirq,
+ .unmask = enable_pirq,
+
+ .disable = disable_pirq,
+ .mask = disable_pirq,
+
+ .ack = ack_pirq,
+ .end = end_pirq,
+
+ .set_affinity = set_affinity_irq,
+
+ .retrigger = retrigger_dynirq,
+};
+
+static struct irq_chip xen_percpu_chip __read_mostly = {
+ .name = "xen-percpu",
+
+ .disable = disable_dynirq,
+ .mask = disable_dynirq,
+ .unmask = enable_dynirq,
+
+ .ack = ack_dynirq,
+};
+
+int xen_set_callback_via(uint64_t via)
+{
+ struct xen_hvm_param a;
+ a.domid = DOMID_SELF;
+ a.index = HVM_PARAM_CALLBACK_IRQ;
+ a.value = via;
+ return HYPERVISOR_hvm_op(HVMOP_set_param, &a);
+}
+EXPORT_SYMBOL_GPL(xen_set_callback_via);
+
+#ifdef CONFIG_XEN_PVHVM
+/* Vector callbacks are better than PCI interrupts to receive event
+ * channel notifications because we can receive vector callbacks on any
+ * vcpu and we don't need PCI support or APIC interactions. */
+void xen_callback_vector(void)
+{
+ int rc;
+ uint64_t callback_via;
+ if (xen_have_vector_callback) {
+ callback_via = HVM_CALLBACK_VECTOR(XEN_HVM_EVTCHN_CALLBACK);
+ rc = xen_set_callback_via(callback_via);
+ if (rc) {
+ printk(KERN_ERR "Request for Xen HVM callback vector"
+ " failed.\n");
+ xen_have_vector_callback = 0;
+ return;
+ }
+ printk(KERN_INFO "Xen HVM callback vector for event delivery is "
+ "enabled\n");
+ /* in the restore case the vector has already been allocated */
+ if (!test_bit(XEN_HVM_EVTCHN_CALLBACK, used_vectors))
+ alloc_intr_gate(XEN_HVM_EVTCHN_CALLBACK, xen_hvm_callback_vector);
+ }
+}
+#else
+void xen_callback_vector(void) {}
+#endif
+
void __init xen_init_IRQ(void)
{
- int i;
+ int i, rc;
+ struct physdev_nr_pirqs op_nr_pirqs;
cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s),
GFP_KERNEL);
- BUG_ON(cpu_evtchn_mask_p == NULL);
+ irq_info = kcalloc(nr_irqs, sizeof(*irq_info), GFP_KERNEL);
+
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_nr_pirqs, &op_nr_pirqs);
+ if (rc < 0) {
+ nr_pirqs = nr_irqs;
+ if (rc != -ENOSYS)
+ printk(KERN_WARNING "PHYSDEVOP_get_nr_pirqs returned rc=%d\n", rc);
+ } else {
+ if (xen_pv_domain() && !xen_initial_domain())
+ nr_pirqs = max((int)op_nr_pirqs.nr_pirqs, nr_irqs);
+ else
+ nr_pirqs = op_nr_pirqs.nr_pirqs;
+ }
+ pirq_to_irq = kcalloc(nr_pirqs, sizeof(*pirq_to_irq), GFP_KERNEL);
+ for (i = 0; i < nr_pirqs; i++)
+ pirq_to_irq[i] = -1;
+
+ evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq),
+ GFP_KERNEL);
+ for (i = 0; i < NR_EVENT_CHANNELS; i++)
+ evtchn_to_irq[i] = -1;
init_evtchn_cpu_bindings();
@@ -938,5 +1536,15 @@ void __init xen_init_IRQ(void)
for (i = 0; i < NR_EVENT_CHANNELS; i++)
mask_evtchn(i);
- irq_ctx_init(smp_processor_id());
+ if (xen_hvm_domain()) {
+ xen_callback_vector();
+ native_init_IRQ();
+ /* pci_xen_hvm_init must be called after native_init_IRQ so that
+ * __acpi_register_gsi can point at the right function */
+ pci_xen_hvm_init();
+ } else {
+ irq_ctx_init(smp_processor_id());
+ if (xen_initial_domain())
+ xen_setup_pirqs();
+ }
}
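For context, the binding helpers this file implements (and which the rest of the series relies on) are typically consumed as below. A hedged usage sketch: only bind_virq_to_irqhandler(), unbind_from_irqhandler() and VIRQ_DEBUG come from the existing <xen/events.h> interface; the handler and function names are illustrative.

#include <linux/interrupt.h>
#include <xen/events.h>

static irqreturn_t example_virq_handler(int irq, void *dev_id)
{
	/* Runs in hard-irq context once per pending VIRQ event. */
	return IRQ_HANDLED;
}

static int example_bind(void)
{
	int irq;

	/* Bind a per-cpu VIRQ source on CPU 0; on success the returned
	 * value is the Linux irq number backing the event channel. */
	irq = bind_virq_to_irqhandler(VIRQ_DEBUG, 0, example_virq_handler,
				      0, "example", NULL);
	if (irq < 0)
		return irq;

	/* ... later, tear the binding down again ... */
	unbind_from_irqhandler(irq, NULL);
	return 0;
}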
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index f3594ec2ee33..ef11daf0cafe 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -38,7 +38,6 @@
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/fs.h>
-#include <linux/errno.h>
#include <linux/miscdevice.h>
#include <linux/major.h>
#include <linux/proc_fs.h>
@@ -46,9 +45,10 @@
#include <linux/poll.h>
#include <linux/irq.h>
#include <linux/init.h>
-#include <linux/gfp.h>
#include <linux/mutex.h>
#include <linux/cpu.h>
+
+#include <xen/xen.h>
#include <xen/events.h>
#include <xen/evtchn.h>
#include <asm/xen/hypervisor.h>
@@ -470,7 +470,7 @@ static int evtchn_open(struct inode *inode, struct file *filp)
filp->private_data = u;
- return 0;
+ return nonseekable_open(inode, filp);
}
static int evtchn_release(struct inode *inode, struct file *filp)
@@ -513,6 +513,7 @@ static const struct file_operations evtchn_fops = {
.fasync = evtchn_fasync,
.open = evtchn_open,
.release = evtchn_release,
+ .llseek = no_llseek,
};
static struct miscdevice evtchn_miscdev = {
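The Kconfig help earlier in this diff describes /dev/xen/evtchn as letting a userspace process trigger event channels and receive notification of them firing; the usual open/bind/read/write cycle against this device looks roughly as follows. A hedged userspace sketch: the ioctl names and structures mirror the kernel's evtchn interface, while the header path and the error handling are assumptions.

#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xen/sys/evtchn.h>	/* assumed install path of the ioctl definitions */

static int example_wait_for_event(void)
{
	struct ioctl_evtchn_bind_unbound_port bind = { .remote_domain = 0 };
	uint32_t port;
	int fd, rc;

	fd = open("/dev/xen/evtchn", O_RDWR);
	if (fd < 0)
		return -1;

	/* Allocate a fresh local port that the remote domain (dom0 here)
	 * may bind to; on success the ioctl returns the port number. */
	rc = ioctl(fd, IOCTL_EVTCHN_BIND_UNBOUND_PORT, &bind);
	if (rc < 0)
		goto out;

	/* A read blocks until a bound port fires, then returns the
	 * pending port numbers as 32-bit values. */
	if (read(fd, &port, sizeof(port)) != sizeof(port)) {
		rc = -1;
		goto out;
	}

	/* Writing the port back re-enables (unmasks) further delivery. */
	if (write(fd, &port, sizeof(port)) != sizeof(port))
		rc = -1;
out:
	close(fd);
	return rc;
}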
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 7d8f531fb8e8..6c4531816496 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -34,12 +34,16 @@
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <xen/xen.h>
#include <xen/interface/xen.h>
#include <xen/page.h>
#include <xen/grant_table.h>
+#include <xen/interface/memory.h>
#include <asm/xen/hypercall.h>
#include <asm/pgtable.h>
@@ -57,6 +61,8 @@ static unsigned int boot_max_nr_grant_frames;
static int gnttab_free_count;
static grant_ref_t gnttab_free_head;
static DEFINE_SPINLOCK(gnttab_list_lock);
+unsigned long xen_hvm_resume_frames;
+EXPORT_SYMBOL_GPL(xen_hvm_resume_frames);
static struct grant_entry *shared;
@@ -431,7 +437,7 @@ static unsigned int __max_nr_grant_frames(void)
return query.max_nr_frames;
}
-static inline unsigned int max_nr_grant_frames(void)
+unsigned int gnttab_max_grant_frames(void)
{
unsigned int xen_max = __max_nr_grant_frames();
@@ -439,6 +445,7 @@ static inline unsigned int max_nr_grant_frames(void)
return boot_max_nr_grant_frames;
return xen_max;
}
+EXPORT_SYMBOL_GPL(gnttab_max_grant_frames);
static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
{
@@ -447,6 +454,30 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
unsigned int nr_gframes = end_idx + 1;
int rc;
+ if (xen_hvm_domain()) {
+ struct xen_add_to_physmap xatp;
+ unsigned int i = end_idx;
+ rc = 0;
+ /*
+ * Loop backwards, so that the first hypercall has the largest
+ * index, ensuring that the table will grow only once.
+ */
+ do {
+ xatp.domid = DOMID_SELF;
+ xatp.idx = i;
+ xatp.space = XENMAPSPACE_grant_table;
+ xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i;
+ rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp);
+ if (rc != 0) {
+ printk(KERN_WARNING
+ "grant table add_to_physmap failed, err=%d\n", rc);
+ break;
+ }
+ } while (i-- > start_idx);
+
+ return rc;
+ }
+
frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC);
if (!frames)
return -ENOMEM;
@@ -463,7 +494,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
BUG_ON(rc || setup.status);
- rc = arch_gnttab_map_shared(frames, nr_gframes, max_nr_grant_frames(),
+ rc = arch_gnttab_map_shared(frames, nr_gframes, gnttab_max_grant_frames(),
&shared);
BUG_ON(rc);
@@ -474,9 +505,27 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx)
int gnttab_resume(void)
{
- if (max_nr_grant_frames() < nr_grant_frames)
+ unsigned int max_nr_gframes;
+
+ max_nr_gframes = gnttab_max_grant_frames();
+ if (max_nr_gframes < nr_grant_frames)
return -ENOSYS;
- return gnttab_map(0, nr_grant_frames - 1);
+
+ if (xen_pv_domain())
+ return gnttab_map(0, nr_grant_frames - 1);
+
+ if (!shared) {
+ shared = ioremap(xen_hvm_resume_frames, PAGE_SIZE * max_nr_gframes);
+ if (shared == NULL) {
+ printk(KERN_WARNING
+ "Failed to ioremap gnttab share frames!");
+ return -ENOMEM;
+ }
+ }
+
+ gnttab_map(0, nr_grant_frames - 1);
+
+ return 0;
}
int gnttab_suspend(void)
@@ -493,7 +542,7 @@ static int gnttab_expand(unsigned int req_entries)
cur = nr_grant_frames;
extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) /
GREFS_PER_GRANT_FRAME);
- if (cur + extra > max_nr_grant_frames())
+ if (cur + extra > gnttab_max_grant_frames())
return -ENOSPC;
rc = gnttab_map(cur, cur + extra - 1);
@@ -503,15 +552,12 @@ static int gnttab_expand(unsigned int req_entries)
return rc;
}
-static int __devinit gnttab_init(void)
+int gnttab_init(void)
{
int i;
unsigned int max_nr_glist_frames, nr_glist_frames;
unsigned int nr_init_grefs;
- if (!xen_domain())
- return -ENODEV;
-
nr_grant_frames = 1;
boot_max_nr_grant_frames = __max_nr_grant_frames();
@@ -554,5 +600,18 @@ static int __devinit gnttab_init(void)
kfree(gnttab_list);
return -ENOMEM;
}
+EXPORT_SYMBOL_GPL(gnttab_init);
+
+static int __devinit __gnttab_init(void)
+{
+ /* Delay grant-table initialization in the PV on HVM case */
+ if (xen_hvm_domain())
+ return 0;
+
+ if (!xen_pv_domain())
+ return -ENODEV;
+
+ return gnttab_init();
+}
-core_initcall(gnttab_init);
+core_initcall(__gnttab_init);
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 10d03d7931c4..ef9c7db52077 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -3,11 +3,13 @@
*/
#include <linux/kernel.h>
#include <linux/err.h>
+#include <linux/slab.h>
#include <linux/reboot.h>
#include <linux/sysrq.h>
#include <linux/stop_machine.h>
#include <linux/freezer.h>
+#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/grant_table.h>
#include <xen/events.h>
@@ -16,6 +18,7 @@
#include <asm/xen/hypercall.h>
#include <asm/xen/page.h>
+#include <asm/xen/hypervisor.h>
enum shutdown_state {
SHUTDOWN_INVALID = -1,
@@ -32,10 +35,30 @@ enum shutdown_state {
static enum shutdown_state shutting_down = SHUTDOWN_INVALID;
#ifdef CONFIG_PM_SLEEP
-static int xen_suspend(void *data)
+static int xen_hvm_suspend(void *data)
{
+ struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
int *cancelled = data;
+
+ BUG_ON(!irqs_disabled());
+
+ *cancelled = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r);
+
+ xen_hvm_post_suspend(*cancelled);
+ gnttab_resume();
+
+ if (!*cancelled) {
+ xen_irq_resume();
+ xen_timer_resume();
+ }
+
+ return 0;
+}
+
+static int xen_suspend(void *data)
+{
int err;
+ int *cancelled = data;
BUG_ON(!irqs_disabled());
@@ -43,7 +66,6 @@ static int xen_suspend(void *data)
if (err) {
printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n",
err);
- dpm_resume_noirq(PMSG_RESUME);
return err;
}
@@ -69,7 +91,6 @@ static int xen_suspend(void *data)
}
sysdev_resume();
- dpm_resume_noirq(PMSG_RESUME);
return 0;
}
@@ -88,14 +109,14 @@ static void do_suspend(void)
err = freeze_processes();
if (err) {
printk(KERN_ERR "xen suspend: freeze failed %d\n", err);
- return;
+ goto out;
}
#endif
err = dpm_suspend_start(PMSG_SUSPEND);
if (err) {
printk(KERN_ERR "xen suspend: dpm_suspend_start %d\n", err);
- goto out;
+ goto out_thaw;
}
printk(KERN_DEBUG "suspending xenstore...\n");
@@ -104,31 +125,37 @@ static void do_suspend(void)
err = dpm_suspend_noirq(PMSG_SUSPEND);
if (err) {
printk(KERN_ERR "dpm_suspend_noirq failed: %d\n", err);
- goto resume_devices;
+ goto out_resume;
}
- err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
+ if (xen_hvm_domain())
+ err = stop_machine(xen_hvm_suspend, &cancelled, cpumask_of(0));
+ else
+ err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
+
+ dpm_resume_noirq(PMSG_RESUME);
+
if (err) {
printk(KERN_ERR "failed to start xen_suspend: %d\n", err);
- goto out;
+ cancelled = 1;
}
+out_resume:
if (!cancelled) {
xen_arch_resume();
xs_resume();
} else
xs_suspend_cancel();
- dpm_resume_noirq(PMSG_RESUME);
-
-resume_devices:
dpm_resume_end(PMSG_RESUME);
/* Make sure timer events get retriggered on all CPUs */
clock_was_set();
-out:
+
+out_thaw:
#ifdef CONFIG_PREEMPT
thaw_processes();
+out:
#endif
shutting_down = SHUTDOWN_INVALID;
}
@@ -183,6 +210,7 @@ static void shutdown_handler(struct xenbus_watch *watch,
kfree(str);
}
+#ifdef CONFIG_MAGIC_SYSRQ
static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
unsigned int len)
{
@@ -209,18 +237,19 @@ static void sysrq_handler(struct xenbus_watch *watch, const char **vec,
goto again;
if (sysrq_key != '\0')
- handle_sysrq(sysrq_key, NULL);
+ handle_sysrq(sysrq_key);
}
-static struct xenbus_watch shutdown_watch = {
- .node = "control/shutdown",
- .callback = shutdown_handler
-};
-
static struct xenbus_watch sysrq_watch = {
.node = "control/sysrq",
.callback = sysrq_handler
};
+#endif
+
+static struct xenbus_watch shutdown_watch = {
+ .node = "control/shutdown",
+ .callback = shutdown_handler
+};
static int setup_shutdown_watcher(void)
{
@@ -232,11 +261,13 @@ static int setup_shutdown_watcher(void)
return err;
}
+#ifdef CONFIG_MAGIC_SYSRQ
err = register_xenbus_watch(&sysrq_watch);
if (err) {
printk(KERN_ERR "Failed to set sysrq watcher\n");
return err;
}
+#endif
return 0;
}
@@ -249,7 +280,19 @@ static int shutdown_event(struct notifier_block *notifier,
return NOTIFY_DONE;
}
-static int __init setup_shutdown_event(void)
+static int __init __setup_shutdown_event(void)
+{
+ /* Delay initialization in the PV on HVM case */
+ if (xen_hvm_domain())
+ return 0;
+
+ if (!xen_pv_domain())
+ return -ENODEV;
+
+ return xen_setup_shutdown_event();
+}
+
+int xen_setup_shutdown_event(void)
{
static struct notifier_block xenstore_notifier = {
.notifier_call = shutdown_event
@@ -258,5 +301,6 @@ static int __init setup_shutdown_event(void)
return 0;
}
+EXPORT_SYMBOL_GPL(xen_setup_shutdown_event);
-subsys_initcall(setup_shutdown_event);
+subsys_initcall(__setup_shutdown_event);
diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c
new file mode 100644
index 000000000000..cef4bafc07dc
--- /dev/null
+++ b/drivers/xen/pci.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2009, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Author: Weidong Han <weidong.han@intel.com>
+ */
+
+#include <linux/pci.h>
+#include <xen/xen.h>
+#include <xen/interface/physdev.h>
+#include <xen/interface/xen.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include "../pci/pci.h"
+
+static int xen_add_device(struct device *dev)
+{
+ int r;
+ struct pci_dev *pci_dev = to_pci_dev(dev);
+
+#ifdef CONFIG_PCI_IOV
+ if (pci_dev->is_virtfn) {
+ struct physdev_manage_pci_ext manage_pci_ext = {
+ .bus = pci_dev->bus->number,
+ .devfn = pci_dev->devfn,
+ .is_virtfn = 1,
+ .physfn.bus = pci_dev->physfn->bus->number,
+ .physfn.devfn = pci_dev->physfn->devfn,
+ };
+
+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
+ &manage_pci_ext);
+ } else
+#endif
+ if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) {
+ struct physdev_manage_pci_ext manage_pci_ext = {
+ .bus = pci_dev->bus->number,
+ .devfn = pci_dev->devfn,
+ .is_extfn = 1,
+ };
+
+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext,
+ &manage_pci_ext);
+ } else {
+ struct physdev_manage_pci manage_pci = {
+ .bus = pci_dev->bus->number,
+ .devfn = pci_dev->devfn,
+ };
+
+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add,
+ &manage_pci);
+ }
+
+ return r;
+}
+
+static int xen_remove_device(struct device *dev)
+{
+ int r;
+ struct pci_dev *pci_dev = to_pci_dev(dev);
+ struct physdev_manage_pci manage_pci;
+
+ manage_pci.bus = pci_dev->bus->number;
+ manage_pci.devfn = pci_dev->devfn;
+
+ r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove,
+ &manage_pci);
+
+ return r;
+}
+
+static int xen_pci_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct device *dev = data;
+ int r = 0;
+
+ switch (action) {
+ case BUS_NOTIFY_ADD_DEVICE:
+ r = xen_add_device(dev);
+ break;
+ case BUS_NOTIFY_DEL_DEVICE:
+ r = xen_remove_device(dev);
+ break;
+ default:
+ break;
+ }
+
+ return r;
+}
+
+struct notifier_block device_nb = {
+ .notifier_call = xen_pci_notifier,
+};
+
+static int __init register_xen_pci_notifier(void)
+{
+ if (!xen_initial_domain())
+ return 0;
+
+ return bus_register_notifier(&pci_bus_type, &device_nb);
+}
+
+arch_initcall(register_xen_pci_notifier);
diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c
new file mode 100644
index 000000000000..c01b5ddce529
--- /dev/null
+++ b/drivers/xen/platform-pci.c
@@ -0,0 +1,207 @@
+/******************************************************************************
+ * platform-pci.c
+ *
+ * Xen platform PCI device driver
+ * Copyright (c) 2005, Intel Corporation.
+ * Copyright (c) 2007, XenSource Inc.
+ * Copyright (c) 2010, Citrix
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#include <xen/platform_pci.h>
+#include <xen/grant_table.h>
+#include <xen/xenbus.h>
+#include <xen/events.h>
+#include <xen/hvm.h>
+#include <xen/xen-ops.h>
+
+#define DRV_NAME "xen-platform-pci"
+
+MODULE_AUTHOR("ssmith@xensource.com and stefano.stabellini@eu.citrix.com");
+MODULE_DESCRIPTION("Xen platform PCI device");
+MODULE_LICENSE("GPL");
+
+static unsigned long platform_mmio;
+static unsigned long platform_mmio_alloc;
+static unsigned long platform_mmiolen;
+static uint64_t callback_via;
+
+unsigned long alloc_xen_mmio(unsigned long len)
+{
+ unsigned long addr;
+
+ addr = platform_mmio + platform_mmio_alloc;
+ platform_mmio_alloc += len;
+ BUG_ON(platform_mmio_alloc > platform_mmiolen);
+
+ return addr;
+}
+
+static uint64_t get_callback_via(struct pci_dev *pdev)
+{
+ u8 pin;
+ int irq;
+
+ irq = pdev->irq;
+ if (irq < 16)
+ return irq; /* ISA IRQ */
+
+ pin = pdev->pin;
+
+ /* We don't know the GSI. Specify the PCI INTx line instead. */
+ return ((uint64_t)0x01 << 56) | /* PCI INTx identifier */
+ ((uint64_t)pci_domain_nr(pdev->bus) << 32) |
+ ((uint64_t)pdev->bus->number << 16) |
+ ((uint64_t)(pdev->devfn & 0xff) << 8) |
+ ((uint64_t)(pin - 1) & 3);
+}
+
+static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id)
+{
+ xen_hvm_evtchn_do_upcall();
+ return IRQ_HANDLED;
+}
+
+static int xen_allocate_irq(struct pci_dev *pdev)
+{
+ return request_irq(pdev->irq, do_hvm_evtchn_intr,
+ IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING,
+ "xen-platform-pci", pdev);
+}
+
+static int platform_pci_resume(struct pci_dev *pdev)
+{
+ int err;
+ if (xen_have_vector_callback)
+ return 0;
+ err = xen_set_callback_via(callback_via);
+ if (err) {
+ dev_err(&pdev->dev, "platform_pci_resume failure!\n");
+ return err;
+ }
+ return 0;
+}
+
+static int __devinit platform_pci_init(struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ int i, ret;
+ long ioaddr, iolen;
+ long mmio_addr, mmio_len;
+ unsigned int max_nr_gframes;
+
+ i = pci_enable_device(pdev);
+ if (i)
+ return i;
+
+ ioaddr = pci_resource_start(pdev, 0);
+ iolen = pci_resource_len(pdev, 0);
+
+ mmio_addr = pci_resource_start(pdev, 1);
+ mmio_len = pci_resource_len(pdev, 1);
+
+ if (mmio_addr == 0 || ioaddr == 0) {
+ dev_err(&pdev->dev, "no resources found\n");
+ ret = -ENOENT;
+ goto pci_out;
+ }
+
+ if (request_mem_region(mmio_addr, mmio_len, DRV_NAME) == NULL) {
+ dev_err(&pdev->dev, "MEM I/O resource 0x%lx @ 0x%lx busy\n",
+ mmio_addr, mmio_len);
+ ret = -EBUSY;
+ goto pci_out;
+ }
+
+ if (request_region(ioaddr, iolen, DRV_NAME) == NULL) {
+ dev_err(&pdev->dev, "I/O resource 0x%lx @ 0x%lx busy\n",
+ iolen, ioaddr);
+ ret = -EBUSY;
+ goto mem_out;
+ }
+
+ platform_mmio = mmio_addr;
+ platform_mmiolen = mmio_len;
+
+ if (!xen_have_vector_callback) {
+ ret = xen_allocate_irq(pdev);
+ if (ret) {
+ dev_warn(&pdev->dev, "request_irq failed err=%d\n", ret);
+ goto out;
+ }
+ callback_via = get_callback_via(pdev);
+ ret = xen_set_callback_via(callback_via);
+ if (ret) {
+ dev_warn(&pdev->dev, "Unable to set the evtchn callback "
+ "err=%d\n", ret);
+ goto out;
+ }
+ }
+
+ max_nr_gframes = gnttab_max_grant_frames();
+ xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
+ ret = gnttab_init();
+ if (ret)
+ goto out;
+ xenbus_probe(NULL);
+ ret = xen_setup_shutdown_event();
+ if (ret)
+ goto out;
+ return 0;
+
+out:
+ release_region(ioaddr, iolen);
+mem_out:
+ release_mem_region(mmio_addr, mmio_len);
+pci_out:
+ pci_disable_device(pdev);
+ return ret;
+}
+
+static struct pci_device_id platform_pci_tbl[] __devinitdata = {
+ {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM,
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+ {0,}
+};
+
+MODULE_DEVICE_TABLE(pci, platform_pci_tbl);
+
+static struct pci_driver platform_driver = {
+ .name = DRV_NAME,
+ .probe = platform_pci_init,
+ .id_table = platform_pci_tbl,
+#ifdef CONFIG_PM
+ .resume_early = platform_pci_resume,
+#endif
+};
+
+static int __init platform_pci_module_init(void)
+{
+ /* no unplug has been done, IGNORE hasn't been specified: just
+ * return now */
+ if (!xen_platform_pci_unplug)
+ return -ENODEV;
+
+ return pci_register_driver(&platform_driver);
+}
+
+module_init(platform_pci_module_init);
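get_callback_via() above encodes the platform device's PCI INTx line into HVM_PARAM_CALLBACK_IRQ when the GSI is not known. A worked example with hypothetical device coordinates (not taken from the patch) makes the bit layout concrete:

#include <stdint.h>

/* Hypothetical device: PCI segment 0, bus 0, slot 3, function 0, pin INTA. */
static uint64_t example_callback_via(void)
{
	uint8_t pin   = 1;		/* INTA */
	uint8_t devfn = (3 << 3) | 0;	/* PCI_DEVFN(3, 0) == 0x18 */

	return ((uint64_t)0x01 << 56) |		/* "PCI INTx" identifier  */
	       ((uint64_t)0 << 32)    |		/* PCI domain (segment) 0 */
	       ((uint64_t)0 << 16)    |		/* bus 0                  */
	       ((uint64_t)devfn << 8) |		/* devfn 0x18 -> 0x1800   */
	       ((uint64_t)(pin - 1) & 3);	/* INTA -> 0              */
	/* = 0x0100000000001800 */
}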
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
new file mode 100644
index 000000000000..54469c3eeacd
--- /dev/null
+++ b/drivers/xen/swiotlb-xen.c
@@ -0,0 +1,515 @@
+/*
+ * Copyright 2010
+ * by Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ *
+ * This code provides an IOMMU for Xen PV guests with PCI passthrough.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License v2.0 as published by
+ * the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * PV guests under Xen run on a non-contiguous memory architecture.
+ *
+ * When PCI pass-through is utilized, this necessitates an IOMMU for
+ * translating bus (DMA) addresses to virtual addresses and vice versa, and
+ * also for providing a mechanism to obtain contiguous pages for device
+ * driver operations (say, DMA operations).
+ *
+ * Specifically, under Xen the Linux idea of pages is an illusion. It
+ * assumes that pages start at zero and go up to the available memory. To
+ * help with that, the Linux Xen MMU provides a lookup mechanism to
+ * translate page frame numbers (PFNs) to machine frame numbers (MFNs)
+ * and vice versa. The MFNs are the "real" frame numbers. Furthermore,
+ * memory is not contiguous: the Xen hypervisor stitches memory for guests
+ * from different pools, which means there is no guarantee that PFN==MFN
+ * and PFN+1==MFN+1. Lastly, with Xen 4.0, pages (in debug mode) are
+ * allocated in descending order (high to low), meaning the guest might
+ * never get any MFNs below the 4GB mark.
+ *
+ */
+
+#include <linux/bootmem.h>
+#include <linux/dma-mapping.h>
+#include <xen/swiotlb-xen.h>
+#include <xen/page.h>
+#include <xen/xen-ops.h>
+/*
+ * Used to do a quick range check in swiotlb_tbl_unmap_single and
+ * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
+ * API.
+ */
+
+static char *xen_io_tlb_start, *xen_io_tlb_end;
+static unsigned long xen_io_tlb_nslabs;
+/*
+ * Quick lookup value of the bus address of the IOTLB.
+ */
+
+u64 start_dma_addr;
+
+static dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
+{
+ return phys_to_machine(XPADDR(paddr)).maddr;
+}
+
+static phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
+{
+ return machine_to_phys(XMADDR(baddr)).paddr;
+}
+
+static dma_addr_t xen_virt_to_bus(void *address)
+{
+ return xen_phys_to_bus(virt_to_phys(address));
+}
+
+static int check_pages_physically_contiguous(unsigned long pfn,
+ unsigned int offset,
+ size_t length)
+{
+ unsigned long next_mfn;
+ int i;
+ int nr_pages;
+
+ next_mfn = pfn_to_mfn(pfn);
+ nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
+
+ for (i = 1; i < nr_pages; i++) {
+ if (pfn_to_mfn(++pfn) != ++next_mfn)
+ return 0;
+ }
+ return 1;
+}
+
+static int range_straddles_page_boundary(phys_addr_t p, size_t size)
+{
+ unsigned long pfn = PFN_DOWN(p);
+ unsigned int offset = p & ~PAGE_MASK;
+
+ if (offset + size <= PAGE_SIZE)
+ return 0;
+ if (check_pages_physically_contiguous(pfn, offset, size))
+ return 0;
+ return 1;
+}
+
+static int is_xen_swiotlb_buffer(dma_addr_t dma_addr)
+{
+ unsigned long mfn = PFN_DOWN(dma_addr);
+ unsigned long pfn = mfn_to_local_pfn(mfn);
+ phys_addr_t paddr;
+
+ /* If the address is outside our domain, it CAN
+ * have the same virtual address as another address
+ * in our domain. Therefore _only_ check address within our domain.
+ */
+ if (pfn_valid(pfn)) {
+ paddr = PFN_PHYS(pfn);
+ return paddr >= virt_to_phys(xen_io_tlb_start) &&
+ paddr < virt_to_phys(xen_io_tlb_end);
+ }
+ return 0;
+}
+
+static int max_dma_bits = 32;
+
+static int
+xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
+{
+ int i, rc;
+ int dma_bits;
+
+ dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;
+
+ i = 0;
+ do {
+ int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE);
+
+ do {
+ rc = xen_create_contiguous_region(
+ (unsigned long)buf + (i << IO_TLB_SHIFT),
+ get_order(slabs << IO_TLB_SHIFT),
+ dma_bits);
+ } while (rc && dma_bits++ < max_dma_bits);
+ if (rc)
+ return rc;
+
+ i += slabs;
+ } while (i < nslabs);
+ return 0;
+}
+
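As a worked example of the chunking above (illustrative, assuming the stock swiotlb constants of this kernel generation, IO_TLB_SHIFT = 11 and IO_TLB_SEGSIZE = 128): each chunk handed to xen_create_contiguous_region() is 128 << 11 = 256 KiB, so xen_swiotlb_fixup() starts with dma_bits = get_order(256 KiB) + PAGE_SHIFT = 6 + 12 = 18 and, whenever the exchange fails, relaxes the address restriction one bit at a time up to max_dma_bits (32) before giving up on that chunk.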
+void __init xen_swiotlb_init(int verbose)
+{
+ unsigned long bytes;
+ int rc;
+
+ xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT);
+ xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE);
+
+ bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT;
+
+ /*
+ * Get IO TLB memory from any location.
+ */
+ xen_io_tlb_start = alloc_bootmem(bytes);
+ if (!xen_io_tlb_start)
+ panic("Cannot allocate SWIOTLB buffer");
+
+ xen_io_tlb_end = xen_io_tlb_start + bytes;
+ /*
+ * And replace that memory with pages under 4GB.
+ */
+ rc = xen_swiotlb_fixup(xen_io_tlb_start,
+ bytes,
+ xen_io_tlb_nslabs);
+ if (rc)
+ goto error;
+
+ start_dma_addr = xen_virt_to_bus(xen_io_tlb_start);
+ swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose);
+
+ return;
+error:
+ panic("DMA(%d): Failed to exchange pages allocated for DMA with Xen! "
+       "We either don't have the permission or you do not have "
+       "enough free memory under 4GB!\n", rc);
+}
+
+void *
+xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags)
+{
+ void *ret;
+ int order = get_order(size);
+ u64 dma_mask = DMA_BIT_MASK(32);
+ unsigned long vstart;
+
+ /*
+ * Ignore region specifiers - the kernel's idea of the
+ * pseudo-physical memory layout has nothing to do with the
+ * machine physical layout. We can't allocate highmem
+ * because we can't return a pointer to it.
+ */
+ flags &= ~(__GFP_DMA | __GFP_HIGHMEM);
+
+ if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret))
+ return ret;
+
+ vstart = __get_free_pages(flags, order);
+ ret = (void *)vstart;
+
+ if (hwdev && hwdev->coherent_dma_mask)
+ dma_mask = dma_alloc_coherent_mask(hwdev, flags);
+
+ if (ret) {
+ if (xen_create_contiguous_region(vstart, order,
+ fls64(dma_mask)) != 0) {
+ free_pages(vstart, order);
+ return NULL;
+ }
+ memset(ret, 0, size);
+ *dma_handle = virt_to_machine(ret).maddr;
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_alloc_coherent);
+
+void
+xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
+ dma_addr_t dev_addr)
+{
+ int order = get_order(size);
+
+ if (dma_release_from_coherent(hwdev, order, vaddr))
+ return;
+
+ xen_destroy_contiguous_region((unsigned long)vaddr, order);
+ free_pages((unsigned long)vaddr, order);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent);
+
+
+/*
+ * Map a single buffer of the indicated size for DMA in streaming mode. The
+ * physical address to use is returned.
+ *
+ * Once the device is given the dma address, the device owns this memory until
+ * either xen_swiotlb_unmap_page or xen_swiotlb_sync_single_for_cpu is performed.
+ */
+dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ phys_addr_t phys = page_to_phys(page) + offset;
+ dma_addr_t dev_addr = xen_phys_to_bus(phys);
+ void *map;
+
+ BUG_ON(dir == DMA_NONE);
+ /*
+ * If the address happens to be in the device's DMA window,
+ * we can safely return the device addr and not worry about bounce
+ * buffering it.
+ */
+ if (dma_capable(dev, dev_addr, size) &&
+ !range_straddles_page_boundary(phys, size) && !swiotlb_force)
+ return dev_addr;
+
+ /*
+ * Oh well, have to allocate and map a bounce buffer.
+ */
+ map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir);
+ if (!map)
+ return DMA_ERROR_CODE;
+
+ dev_addr = xen_virt_to_bus(map);
+
+ /*
+ * Ensure that the address returned is DMA'ble
+ */
+ if (!dma_capable(dev, dev_addr, size))
+ panic("map_single: bounce buffer is not DMA'ble");
+
+ return dev_addr;
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_map_page);
+
+/*
+ * Unmap a single streaming mode DMA translation. The dma_addr and size must
+ * match what was provided for in a previous xen_swiotlb_map_page call. All
+ * other usages are undefined.
+ *
+ * After this call, reads by the cpu to the buffer are guaranteed to see
+ * whatever the device wrote there.
+ */
+static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir)
+{
+ phys_addr_t paddr = xen_bus_to_phys(dev_addr);
+
+ BUG_ON(dir == DMA_NONE);
+
+ /* NOTE: We use dev_addr here, not paddr! */
+ if (is_xen_swiotlb_buffer(dev_addr)) {
+ swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir);
+ return;
+ }
+
+ if (dir != DMA_FROM_DEVICE)
+ return;
+
+ /*
+ * phys_to_virt doesn't work with highmem pages, but we could
+ * end up calling dma_mark_clean() with a highmem page here.
+ * However, we are fine since dma_mark_clean() is a no-op on
+ * POWERPC. We can make dma_mark_clean() take a physical
+ * address if necessary.
+ */
+ dma_mark_clean(phys_to_virt(paddr), size);
+}
+
+void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ xen_unmap_single(hwdev, dev_addr, size, dir);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page);
+
+/*
+ * Make physical memory consistent for a single streaming mode DMA translation
+ * after a transfer.
+ *
+ * If you perform a xen_swiotlb_map_page() but wish to interrogate the buffer
+ * using the cpu, yet do not wish to tear down the dma mapping, you must
+ * call this function before doing so. At the next point you give the dma
+ * address back to the card, you must first perform a
+ * xen_swiotlb_sync_single_for_device, and then the device again owns the buffer.
+ */
+static void
+xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir,
+ enum dma_sync_target target)
+{
+ phys_addr_t paddr = xen_bus_to_phys(dev_addr);
+
+ BUG_ON(dir == DMA_NONE);
+
+ /* NOTE: We use dev_addr here, not paddr! */
+ if (is_xen_swiotlb_buffer(dev_addr)) {
+ swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir,
+ target);
+ return;
+ }
+
+ if (dir != DMA_FROM_DEVICE)
+ return;
+
+ dma_mark_clean(phys_to_virt(paddr), size);
+}
+
+void
+xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir)
+{
+ xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_for_cpu);
+
+void
+xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir)
+{
+ xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_for_device);
+
+/*
+ * Map a set of buffers described by scatterlist in streaming mode for DMA.
+ * This is the scatter-gather version of the above xen_swiotlb_map_page
+ * interface. Here the scatter gather list elements are each tagged with the
+ * appropriate dma address and length. They are obtained via
+ * sg_dma_{address,length}(SG).
+ *
+ * NOTE: An implementation may be able to use a smaller number of
+ * DMA address/length pairs than there are SG table elements.
+ * (for example via virtual mapping capabilities)
+ * The routine returns the number of addr/length pairs actually
+ * used, at most nents.
+ *
+ * Device ownership issues as mentioned above for xen_swiotlb_map_page are the
+ * same here.
+ */
+int
+xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
+ int nelems, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ struct scatterlist *sg;
+ int i;
+
+ BUG_ON(dir == DMA_NONE);
+
+ for_each_sg(sgl, sg, nelems, i) {
+ phys_addr_t paddr = sg_phys(sg);
+ dma_addr_t dev_addr = xen_phys_to_bus(paddr);
+
+ if (swiotlb_force ||
+ !dma_capable(hwdev, dev_addr, sg->length) ||
+ range_straddles_page_boundary(paddr, sg->length)) {
+ void *map = swiotlb_tbl_map_single(hwdev,
+ start_dma_addr,
+ sg_phys(sg),
+ sg->length, dir);
+ if (!map) {
+ /* Don't panic here, we expect map_sg users
+ to do proper error handling. */
+ xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
+ attrs);
+ sgl[0].dma_length = 0;
+ return DMA_ERROR_CODE;
+ }
+ sg->dma_address = xen_virt_to_bus(map);
+ } else
+ sg->dma_address = dev_addr;
+ sg->dma_length = sg->length;
+ }
+ return nelems;
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg_attrs);
+
+int
+xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
+ enum dma_data_direction dir)
+{
+ return xen_swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg);
+
+/*
+ * Unmap a set of streaming mode DMA translations. Again, cpu read rules
+ * concerning calls here are the same as for xen_swiotlb_unmap_page() above.
+ */
+void
+xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
+ int nelems, enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ struct scatterlist *sg;
+ int i;
+
+ BUG_ON(dir == DMA_NONE);
+
+ for_each_sg(sgl, sg, nelems, i)
+ xen_unmap_single(hwdev, sg->dma_address, sg->dma_length, dir);
+
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs);
+
+void
+xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems,
+ enum dma_data_direction dir)
+{
+ xen_swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg);
+
+/*
+ * Make physical memory consistent for a set of streaming mode DMA translations
+ * after a transfer.
+ *
+ * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules
+ * and usage.
+ */
+static void
+xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
+ int nelems, enum dma_data_direction dir,
+ enum dma_sync_target target)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgl, sg, nelems, i)
+ xen_swiotlb_sync_single(hwdev, sg->dma_address,
+ sg->dma_length, dir, target);
+}
+
+void
+xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir)
+{
+ xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_sync_sg_for_cpu);
+
+void
+xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
+ int nelems, enum dma_data_direction dir)
+{
+ xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_sync_sg_for_device);
+
+int
+xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
+{
+ return !dma_addr;
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_dma_mapping_error);
+
+/*
+ * Return whether the given device DMA address mask can be supported
+ * properly. For example, if your device can only drive the low 24-bits
+ * during bus mastering, then you would pass 0x00ffffff as the mask to
+ * this function.
+ */
+int
+xen_swiotlb_dma_supported(struct device *hwdev, u64 mask)
+{
+ return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask;
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_dma_supported);
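To show how the exported helpers above are meant to be consumed, here is a hedged sketch of the architecture glue that could route the generic DMA API through them. The field names assume the struct dma_map_ops layout of this kernel generation, the writable dma_ops pointer is the x86 convention, and the init hook name is illustrative rather than taken from this patch.

#include <linux/dma-mapping.h>
#include <xen/swiotlb-xen.h>

/* Route the generic DMA API to the Xen-aware swiotlb helpers above. */
static struct dma_map_ops xen_swiotlb_dma_ops = {
	.mapping_error		= xen_swiotlb_dma_mapping_error,
	.alloc_coherent		= xen_swiotlb_alloc_coherent,
	.free_coherent		= xen_swiotlb_free_coherent,
	.map_page		= xen_swiotlb_map_page,
	.unmap_page		= xen_swiotlb_unmap_page,
	.map_sg			= xen_swiotlb_map_sg_attrs,
	.unmap_sg		= xen_swiotlb_unmap_sg_attrs,
	.sync_single_for_cpu	= xen_swiotlb_sync_single_for_cpu,
	.sync_single_for_device	= xen_swiotlb_sync_single_for_device,
	.sync_sg_for_cpu	= xen_swiotlb_sync_sg_for_cpu,
	.sync_sg_for_device	= xen_swiotlb_sync_sg_for_device,
	.dma_supported		= xen_swiotlb_dma_supported,
};

/* Illustrative init hook: set up the bounce pool, then install the ops. */
static void __init example_pci_xen_swiotlb_init(void)
{
	xen_swiotlb_init(1);
	dma_ops = &xen_swiotlb_dma_ops;
}

The design point is that drivers keep calling the ordinary dma_map_page()/dma_map_sg() API; only the ops table decides whether a buffer is used in place or bounced through the machine-contiguous pool set up by xen_swiotlb_init().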
diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
index 88a60e03ccf0..60f1827a32cb 100644
--- a/drivers/xen/sys-hypervisor.c
+++ b/drivers/xen/sys-hypervisor.c
@@ -7,6 +7,7 @@
* published by the Free Software Foundation.
*/
+#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kobject.h>
@@ -14,6 +15,7 @@
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
+#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
@@ -425,7 +427,7 @@ static ssize_t hyp_sysfs_store(struct kobject *kobj,
return 0;
}
-static struct sysfs_ops hyp_sysfs_ops = {
+static const struct sysfs_ops hyp_sysfs_ops = {
.show = hyp_sysfs_show,
.store = hyp_sysfs_store,
};
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index 92a1ef80a288..cdacf923e073 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -30,6 +30,7 @@
* IN THE SOFTWARE.
*/
+#include <linux/slab.h>
#include <linux/types.h>
#include <linux/vmalloc.h>
#include <asm/xen/hypervisor.h>
@@ -49,6 +50,8 @@ const char *xenbus_strstate(enum xenbus_state state)
[ XenbusStateConnected ] = "Connected",
[ XenbusStateClosing ] = "Closing",
[ XenbusStateClosed ] = "Closed",
+ [XenbusStateReconfiguring] = "Reconfiguring",
+ [XenbusStateReconfigured] = "Reconfigured",
};
return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID";
}
@@ -132,17 +135,12 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev,
}
EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt);
+static void xenbus_switch_fatal(struct xenbus_device *, int, int,
+ const char *, ...);
-/**
- * xenbus_switch_state
- * @dev: xenbus device
- * @state: new state
- *
- * Advertise in the store a change of the given driver to the given new_state.
- * Return 0 on success, or -errno on error. On error, the device will switch
- * to XenbusStateClosing, and the error will be saved in the store.
- */
-int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
+static int
+__xenbus_switch_state(struct xenbus_device *dev,
+ enum xenbus_state state, int depth)
{
/* We check whether the state is currently set to the given value, and
if not, then the state is set. We don't want to unconditionally
@@ -151,35 +149,65 @@ int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
to it, as the device will be tearing down, and we don't want to
resurrect that directory.
- Note that, because of this cached value of our state, this function
- will not work inside a Xenstore transaction (something it was
- trying to in the past) because dev->state would not get reset if
- the transaction was aborted.
-
+ Note that, because of this cached value of our state, this
+ function cannot take part in a caller's Xenstore transaction
+ (something it used to attempt) because dev->state would not
+ get reset if the transaction were aborted.
*/
+ struct xenbus_transaction xbt;
int current_state;
- int err;
+ int err, abort;
if (state == dev->state)
return 0;
- err = xenbus_scanf(XBT_NIL, dev->nodename, "state", "%d",
- &current_state);
- if (err != 1)
+again:
+ abort = 1;
+
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ xenbus_switch_fatal(dev, depth, err, "starting transaction");
return 0;
+ }
+
+ err = xenbus_scanf(xbt, dev->nodename, "state", "%d", &current_state);
+ if (err != 1)
+ goto abort;
- err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%d", state);
+ err = xenbus_printf(xbt, dev->nodename, "state", "%d", state);
if (err) {
- if (state != XenbusStateClosing) /* Avoid looping */
- xenbus_dev_fatal(dev, err, "writing new state");
- return err;
+ xenbus_switch_fatal(dev, depth, err, "writing new state");
+ goto abort;
}
- dev->state = state;
+ abort = 0;
+abort:
+ err = xenbus_transaction_end(xbt, abort);
+ if (err) {
+ if (err == -EAGAIN && !abort)
+ goto again;
+ xenbus_switch_fatal(dev, depth, err, "ending transaction");
+ } else
+ dev->state = state;
return 0;
}
+
+/**
+ * xenbus_switch_state
+ * @dev: xenbus device
+ * @state: new state
+ *
+ * Advertise in the store a change of the given driver to the given new_state.
+ * Return 0 on success, or -errno on error. On error, the device will switch
+ * to XenbusStateClosing, and the error will be saved in the store.
+ */
+int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state)
+{
+ return __xenbus_switch_state(dev, state, 0);
+}
+
EXPORT_SYMBOL_GPL(xenbus_switch_state);
int xenbus_frontend_closed(struct xenbus_device *dev)
@@ -283,6 +311,23 @@ void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...)
EXPORT_SYMBOL_GPL(xenbus_dev_fatal);
/**
+ * Equivalent to xenbus_dev_fatal(dev, err, fmt, args), but helps
+ * avoid recursion within xenbus_switch_state.
+ */
+static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err,
+ const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ xenbus_va_dev_error(dev, err, fmt, ap);
+ va_end(ap);
+
+ if (!depth)
+ __xenbus_switch_state(dev, XenbusStateClosing, 1);
+}
+
+/**
* xenbus_grant_ring
* @dev: xenbus device
* @ring_mfn: mfn of ring to grant
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index d42e25d5968d..deb9c4ba3a93 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -45,22 +45,30 @@
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/io.h>
+#include <linux/slab.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/events.h>
#include <xen/page.h>
+#include <xen/platform_pci.h>
+#include <xen/hvm.h>
+
#include "xenbus_comms.h"
#include "xenbus_probe.h"
int xen_store_evtchn;
-EXPORT_SYMBOL(xen_store_evtchn);
+EXPORT_SYMBOL_GPL(xen_store_evtchn);
struct xenstore_domain_interface *xen_store_interface;
+EXPORT_SYMBOL_GPL(xen_store_interface);
+
static unsigned long xen_store_mfn;
static BLOCKING_NOTIFIER_HEAD(xenstore_chain);
@@ -454,21 +462,21 @@ static ssize_t xendev_show_nodename(struct device *dev,
{
return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename);
}
-DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
+static DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL);
static ssize_t xendev_show_devtype(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype);
}
-DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
+static DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL);
static ssize_t xendev_show_modalias(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype);
}
-DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL);
+static DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL);
int xenbus_probe_node(struct xen_bus_type *bus,
const char *type,
@@ -766,7 +774,7 @@ EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
void xenbus_probe(struct work_struct *unused)
{
- BUG_ON((xenstored_ready <= 0));
+ xenstored_ready = 1;
/* Enumerate devices in xenstore and watch for changes. */
xenbus_probe_devices(&xenbus_frontend);
@@ -776,10 +784,26 @@ void xenbus_probe(struct work_struct *unused)
/* Notify others that xenstore is up */
blocking_notifier_call_chain(&xenstore_chain, 0, NULL);
}
+EXPORT_SYMBOL_GPL(xenbus_probe);
-static int __init xenbus_probe_init(void)
+static int __init xenbus_probe_initcall(void)
+{
+ if (!xen_domain())
+ return -ENODEV;
+
+ if (xen_initial_domain() || xen_hvm_domain())
+ return 0;
+
+ xenbus_probe(NULL);
+ return 0;
+}
+
+device_initcall(xenbus_probe_initcall);
+
+static int __init xenbus_init(void)
{
int err = 0;
+ unsigned long page = 0;
DPRINTK("");
@@ -800,13 +824,50 @@ static int __init xenbus_probe_init(void)
* Domain0 doesn't have a store_evtchn or store_mfn yet.
*/
if (xen_initial_domain()) {
- /* dom0 not yet supported */
+ struct evtchn_alloc_unbound alloc_unbound;
+
+ /* Allocate Xenstore page */
+ page = get_zeroed_page(GFP_KERNEL);
+ if (!page)
+ goto out_error;
+
+ xen_store_mfn = xen_start_info->store_mfn =
+ pfn_to_mfn(virt_to_phys((void *)page) >>
+ PAGE_SHIFT);
+
+ /* Next allocate a local port which xenstored can bind to */
+ alloc_unbound.dom = DOMID_SELF;
+ alloc_unbound.remote_dom = 0;
+
+ err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+ &alloc_unbound);
+ if (err == -ENOSYS)
+ goto out_error;
+
+ BUG_ON(err);
+ xen_store_evtchn = xen_start_info->store_evtchn =
+ alloc_unbound.port;
+
+ xen_store_interface = mfn_to_virt(xen_store_mfn);
} else {
- xenstored_ready = 1;
- xen_store_evtchn = xen_start_info->store_evtchn;
- xen_store_mfn = xen_start_info->store_mfn;
+ if (xen_hvm_domain()) {
+ uint64_t v = 0;
+ err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v);
+ if (err)
+ goto out_error;
+ xen_store_evtchn = (int)v;
+ err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v);
+ if (err)
+ goto out_error;
+ xen_store_mfn = (unsigned long)v;
+ xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE);
+ } else {
+ xen_store_evtchn = xen_start_info->store_evtchn;
+ xen_store_mfn = xen_start_info->store_mfn;
+ xen_store_interface = mfn_to_virt(xen_store_mfn);
+ xenstored_ready = 1;
+ }
}
- xen_store_interface = mfn_to_virt(xen_store_mfn);
/* Initialize the interface to xenstore. */
err = xs_init();
@@ -816,9 +877,6 @@ static int __init xenbus_probe_init(void)
goto out_unreg_back;
}
- if (!xen_initial_domain())
- xenbus_probe(NULL);
-
#ifdef CONFIG_XEN_COMPAT_XENFS
/*
* Create xenfs mountpoint in /proc for compatibility with
@@ -836,14 +894,16 @@ static int __init xenbus_probe_init(void)
bus_unregister(&xenbus_frontend.bus);
out_error:
+ if (page != 0)
+ free_page(page);
return err;
}
-postcore_initcall(xenbus_probe_init);
+postcore_initcall(xenbus_init);
MODULE_LICENSE("GPL");
-static int is_disconnected_device(struct device *dev, void *data)
+static int is_device_connecting(struct device *dev, void *data)
{
struct xenbus_device *xendev = to_xenbus_device(dev);
struct device_driver *drv = data;
@@ -861,14 +921,15 @@ static int is_disconnected_device(struct device *dev, void *data)
return 0;
xendrv = to_xenbus_driver(dev->driver);
- return (xendev->state != XenbusStateConnected ||
- (xendrv->is_ready && !xendrv->is_ready(xendev)));
+ return (xendev->state < XenbusStateConnected ||
+ (xendev->state == XenbusStateConnected &&
+ xendrv->is_ready && !xendrv->is_ready(xendev)));
}
-static int exists_disconnected_device(struct device_driver *drv)
+static int exists_connecting_device(struct device_driver *drv)
{
return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
- is_disconnected_device);
+ is_device_connecting);
}
static int print_device_status(struct device *dev, void *data)
@@ -884,10 +945,13 @@ static int print_device_status(struct device *dev, void *data)
/* Information only: is this too noisy? */
printk(KERN_INFO "XENBUS: Device with no driver: %s\n",
xendev->nodename);
- } else if (xendev->state != XenbusStateConnected) {
+ } else if (xendev->state < XenbusStateConnected) {
+ enum xenbus_state rstate = XenbusStateUnknown;
+ if (xendev->otherend)
+ rstate = xenbus_read_driver_state(xendev->otherend);
printk(KERN_WARNING "XENBUS: Timeout connecting "
- "to device: %s (state %d)\n",
- xendev->nodename, xendev->state);
+ "to device: %s (local state %d, remote state %d)\n",
+ xendev->nodename, xendev->state, rstate);
}
return 0;
@@ -897,7 +961,7 @@ static int print_device_status(struct device *dev, void *data)
static int ready_to_wait_for_devices;
/*
- * On a 10 second timeout, wait for all devices currently configured. We need
+ * On a 5-minute timeout, wait for all devices currently configured. We need
* to do this to guarantee that the filesystems and / or network devices
* needed for boot are available, before we can allow the boot to proceed.
*
@@ -912,18 +976,30 @@ static int ready_to_wait_for_devices;
*/
static void wait_for_devices(struct xenbus_driver *xendrv)
{
- unsigned long timeout = jiffies + 10*HZ;
+ unsigned long start = jiffies;
struct device_driver *drv = xendrv ? &xendrv->driver : NULL;
+ unsigned int seconds_waited = 0;
if (!ready_to_wait_for_devices || !xen_domain())
return;
- while (exists_disconnected_device(drv)) {
- if (time_after(jiffies, timeout))
- break;
+ while (exists_connecting_device(drv)) {
+ if (time_after(jiffies, start + (seconds_waited+5)*HZ)) {
+ if (!seconds_waited)
+ printk(KERN_WARNING "XENBUS: Waiting for "
+ "devices to initialise: ");
+ seconds_waited += 5;
+ printk("%us...", 300 - seconds_waited);
+ if (seconds_waited == 300)
+ break;
+ }
+
schedule_timeout_interruptible(HZ/10);
}
+ if (seconds_waited)
+ printk("\n");
+
bus_for_each_dev(&xenbus_frontend.bus, NULL, drv,
print_device_status);
}
@@ -931,6 +1007,9 @@ static void wait_for_devices(struct xenbus_driver *xendrv)
#ifndef MODULE
static int __init boot_wait_for_devices(void)
{
+ if (xen_hvm_domain() && !xen_platform_pci_unplug)
+ return -ENODEV;
+
ready_to_wait_for_devices = 1;
wait_for_devices(NULL);
return 0;
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
index eab33f1dbdf7..5534690075af 100644
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -76,6 +76,14 @@ struct xs_handle {
/*
* Mutex ordering: transaction_mutex -> watch_mutex -> request_mutex.
* response_mutex is never taken simultaneously with the other three.
+ *
+ * transaction_mutex must be held before incrementing
+ * transaction_count. The mutex is held when a suspend is in
+ * progress to prevent new transactions starting.
+ *
+ * When decrementing transaction_count to zero the wait queue
+ * should be woken up, the suspend code waits for count to
+ * reach zero.
*/
/* One request at a time. */
@@ -85,7 +93,9 @@ struct xs_handle {
struct mutex response_mutex;
/* Protect transactions against save/restore. */
- struct rw_semaphore transaction_mutex;
+ struct mutex transaction_mutex;
+ atomic_t transaction_count;
+ wait_queue_head_t transaction_wq;
/* Protect watch (de)register against save/restore. */
struct rw_semaphore watch_mutex;
@@ -157,6 +167,31 @@ static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len)
return body;
}
+static void transaction_start(void)
+{
+ mutex_lock(&xs_state.transaction_mutex);
+ atomic_inc(&xs_state.transaction_count);
+ mutex_unlock(&xs_state.transaction_mutex);
+}
+
+static void transaction_end(void)
+{
+ if (atomic_dec_and_test(&xs_state.transaction_count))
+ wake_up(&xs_state.transaction_wq);
+}
+
+static void transaction_suspend(void)
+{
+ mutex_lock(&xs_state.transaction_mutex);
+ wait_event(xs_state.transaction_wq,
+ atomic_read(&xs_state.transaction_count) == 0);
+}
+
+static void transaction_resume(void)
+{
+ mutex_unlock(&xs_state.transaction_mutex);
+}
+
void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
{
void *ret;
@@ -164,7 +199,7 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
int err;
if (req_msg.type == XS_TRANSACTION_START)
- down_read(&xs_state.transaction_mutex);
+ transaction_start();
mutex_lock(&xs_state.request_mutex);
@@ -180,7 +215,7 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg)
if ((msg->type == XS_TRANSACTION_END) ||
((req_msg.type == XS_TRANSACTION_START) &&
(msg->type == XS_ERROR)))
- up_read(&xs_state.transaction_mutex);
+ transaction_end();
return ret;
}
@@ -432,11 +467,11 @@ int xenbus_transaction_start(struct xenbus_transaction *t)
{
char *id_str;
- down_read(&xs_state.transaction_mutex);
+ transaction_start();
id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL);
if (IS_ERR(id_str)) {
- up_read(&xs_state.transaction_mutex);
+ transaction_end();
return PTR_ERR(id_str);
}
@@ -461,7 +496,7 @@ int xenbus_transaction_end(struct xenbus_transaction t, int abort)
err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL));
- up_read(&xs_state.transaction_mutex);
+ transaction_end();
return err;
}
@@ -499,7 +534,7 @@ int xenbus_printf(struct xenbus_transaction t,
#define PRINTF_BUFFER_SIZE 4096
char *printf_buffer;
- printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
+ printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_NOIO | __GFP_HIGH);
if (printf_buffer == NULL)
return -ENOMEM;
@@ -662,7 +697,7 @@ EXPORT_SYMBOL_GPL(unregister_xenbus_watch);
void xs_suspend(void)
{
- down_write(&xs_state.transaction_mutex);
+ transaction_suspend();
down_write(&xs_state.watch_mutex);
mutex_lock(&xs_state.request_mutex);
mutex_lock(&xs_state.response_mutex);
@@ -677,7 +712,7 @@ void xs_resume(void)
mutex_unlock(&xs_state.response_mutex);
mutex_unlock(&xs_state.request_mutex);
- up_write(&xs_state.transaction_mutex);
+ transaction_resume();
/* No need for watches_lock: the watch_mutex is sufficient. */
list_for_each_entry(watch, &watches, list) {
@@ -693,7 +728,7 @@ void xs_suspend_cancel(void)
mutex_unlock(&xs_state.response_mutex);
mutex_unlock(&xs_state.request_mutex);
up_write(&xs_state.watch_mutex);
- up_write(&xs_state.transaction_mutex);
+ mutex_unlock(&xs_state.transaction_mutex);
}
static int xenwatch_thread(void *unused)
@@ -843,8 +878,10 @@ int xs_init(void)
mutex_init(&xs_state.request_mutex);
mutex_init(&xs_state.response_mutex);
- init_rwsem(&xs_state.transaction_mutex);
+ mutex_init(&xs_state.transaction_mutex);
init_rwsem(&xs_state.watch_mutex);
+ atomic_set(&xs_state.transaction_count, 0);
+ init_waitqueue_head(&xs_state.transaction_wq);
/* Initialize the shared memory rings to talk to xenstored */
err = xb_init_comms();
diff --git a/drivers/xen/xencomm.c b/drivers/xen/xencomm.c
index a240b2c20b99..b91f8ff50d05 100644
--- a/drivers/xen/xencomm.c
+++ b/drivers/xen/xencomm.c
@@ -18,8 +18,8 @@
* Authors: Hollis Blanchard <hollisb@us.ibm.com>
*/
-#include <linux/gfp.h>
#include <linux/mm.h>
+#include <linux/slab.h>
#include <asm/page.h>
#include <xen/xencomm.h>
#include <xen/interface/xen.h>
diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile
index 25275c3bbdff..4fde9440fe1f 100644
--- a/drivers/xen/xenfs/Makefile
+++ b/drivers/xen/xenfs/Makefile
@@ -1,3 +1,4 @@
obj-$(CONFIG_XENFS) += xenfs.o
-xenfs-objs = super.o xenbus.o \ No newline at end of file
+xenfs-y = super.o xenbus.o privcmd.o
+xenfs-$(CONFIG_XEN_DOM0) += xenstored.o
diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c
new file mode 100644
index 000000000000..f80be7f6eb95
--- /dev/null
+++ b/drivers/xen/xenfs/privcmd.c
@@ -0,0 +1,404 @@
+/******************************************************************************
+ * privcmd.c
+ *
+ * Interface to privileged domain-0 commands.
+ *
+ * Copyright (c) 2002-2004, K A Fraser, B Dragovic
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/uaccess.h>
+#include <linux/swap.h>
+#include <linux/smp_lock.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/privcmd.h>
+#include <xen/interface/xen.h>
+#include <xen/features.h>
+#include <xen/page.h>
+#include <xen/xen-ops.h>
+
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
+#endif
+
+static long privcmd_ioctl_hypercall(void __user *udata)
+{
+ struct privcmd_hypercall hypercall;
+ long ret;
+
+ if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
+ return -EFAULT;
+
+ ret = privcmd_call(hypercall.op,
+ hypercall.arg[0], hypercall.arg[1],
+ hypercall.arg[2], hypercall.arg[3],
+ hypercall.arg[4]);
+
+ return ret;
+}
+
+static void free_page_list(struct list_head *pages)
+{
+ struct page *p, *n;
+
+ list_for_each_entry_safe(p, n, pages, lru)
+ __free_page(p);
+
+ INIT_LIST_HEAD(pages);
+}
+
+/*
+ * Given an array of items in userspace, return a list of pages
+ * containing the data. If copying fails, either because of memory
+ * allocation failure or a problem reading user memory, return an
+ * error code; it's up to the caller to dispose of any partial list.
+ */
+static int gather_array(struct list_head *pagelist,
+ unsigned nelem, size_t size,
+ void __user *data)
+{
+ unsigned pageidx;
+ void *pagedata;
+ int ret;
+
+ if (size > PAGE_SIZE)
+ return 0;
+
+ pageidx = PAGE_SIZE;
+ pagedata = NULL; /* quiet, gcc */
+ while (nelem--) {
+ if (pageidx > PAGE_SIZE-size) {
+ struct page *page = alloc_page(GFP_KERNEL);
+
+ ret = -ENOMEM;
+ if (page == NULL)
+ goto fail;
+
+ pagedata = page_address(page);
+
+ list_add_tail(&page->lru, pagelist);
+ pageidx = 0;
+ }
+
+ ret = -EFAULT;
+ if (copy_from_user(pagedata + pageidx, data, size))
+ goto fail;
+
+ data += size;
+ pageidx += size;
+ }
+
+ ret = 0;
+
+fail:
+ return ret;
+}
+
+/*
+ * Call function "fn" on each element of the array fragmented
+ * over a list of pages.
+ */
+static int traverse_pages(unsigned nelem, size_t size,
+ struct list_head *pos,
+ int (*fn)(void *data, void *state),
+ void *state)
+{
+ void *pagedata;
+ unsigned pageidx;
+ int ret = 0;
+
+ BUG_ON(size > PAGE_SIZE);
+
+ pageidx = PAGE_SIZE;
+ pagedata = NULL; /* hush, gcc */
+
+ while (nelem--) {
+ if (pageidx > PAGE_SIZE-size) {
+ struct page *page;
+ pos = pos->next;
+ page = list_entry(pos, struct page, lru);
+ pagedata = page_address(page);
+ pageidx = 0;
+ }
+
+ ret = (*fn)(pagedata + pageidx, state);
+ if (ret)
+ break;
+ pageidx += size;
+ }
+
+ return ret;
+}
+
+struct mmap_mfn_state {
+ unsigned long va;
+ struct vm_area_struct *vma;
+ domid_t domain;
+};
+
+static int mmap_mfn_range(void *data, void *state)
+{
+ struct privcmd_mmap_entry *msg = data;
+ struct mmap_mfn_state *st = state;
+ struct vm_area_struct *vma = st->vma;
+ int rc;
+
+ /* Do not allow range to wrap the address space. */
+ if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) ||
+ ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va))
+ return -EINVAL;
+
+ /* Range chunks must be contiguous in va space. */
+ if ((msg->va != st->va) ||
+ ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end))
+ return -EINVAL;
+
+ rc = xen_remap_domain_mfn_range(vma,
+ msg->va & PAGE_MASK,
+ msg->mfn, msg->npages,
+ vma->vm_page_prot,
+ st->domain);
+ if (rc < 0)
+ return rc;
+
+ st->va += msg->npages << PAGE_SHIFT;
+
+ return 0;
+}
+
+static long privcmd_ioctl_mmap(void __user *udata)
+{
+ struct privcmd_mmap mmapcmd;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ int rc;
+ LIST_HEAD(pagelist);
+ struct mmap_mfn_state state;
+
+ if (!xen_initial_domain())
+ return -EPERM;
+
+ if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd)))
+ return -EFAULT;
+
+ rc = gather_array(&pagelist,
+ mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+ mmapcmd.entry);
+
+ if (rc || list_empty(&pagelist))
+ goto out;
+
+ down_write(&mm->mmap_sem);
+
+ {
+ struct page *page = list_first_entry(&pagelist,
+ struct page, lru);
+ struct privcmd_mmap_entry *msg = page_address(page);
+
+ vma = find_vma(mm, msg->va);
+ rc = -EINVAL;
+
+ if (!vma || (msg->va != vma->vm_start) ||
+ !privcmd_enforce_singleshot_mapping(vma))
+ goto out_up;
+ }
+
+ state.va = vma->vm_start;
+ state.vma = vma;
+ state.domain = mmapcmd.dom;
+
+ rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry),
+ &pagelist,
+ mmap_mfn_range, &state);
+
+
+out_up:
+ up_write(&mm->mmap_sem);
+
+out:
+ free_page_list(&pagelist);
+
+ return rc;
+}
+
+struct mmap_batch_state {
+ domid_t domain;
+ unsigned long va;
+ struct vm_area_struct *vma;
+ int err;
+
+ xen_pfn_t __user *user;
+};
+
+static int mmap_batch_fn(void *data, void *state)
+{
+ xen_pfn_t *mfnp = data;
+ struct mmap_batch_state *st = state;
+
+ if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
+ st->vma->vm_page_prot, st->domain) < 0) {
+ *mfnp |= 0xf0000000U;
+ st->err++;
+ }
+ st->va += PAGE_SIZE;
+
+ return 0;
+}
+
+static int mmap_return_errors(void *data, void *state)
+{
+ xen_pfn_t *mfnp = data;
+ struct mmap_batch_state *st = state;
+
+ put_user(*mfnp, st->user++);
+
+ return 0;
+}
+
+static struct vm_operations_struct privcmd_vm_ops;
+
+static long privcmd_ioctl_mmap_batch(void __user *udata)
+{
+ int ret;
+ struct privcmd_mmapbatch m;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long nr_pages;
+ LIST_HEAD(pagelist);
+ struct mmap_batch_state state;
+
+ if (!xen_initial_domain())
+ return -EPERM;
+
+ if (copy_from_user(&m, udata, sizeof(m)))
+ return -EFAULT;
+
+ nr_pages = m.num;
+ if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
+ return -EINVAL;
+
+ ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t),
+ m.arr);
+
+ if (ret || list_empty(&pagelist))
+ goto out;
+
+ down_write(&mm->mmap_sem);
+
+ vma = find_vma(mm, m.addr);
+ ret = -EINVAL;
+ if (!vma ||
+ vma->vm_ops != &privcmd_vm_ops ||
+ (m.addr != vma->vm_start) ||
+ ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) ||
+ !privcmd_enforce_singleshot_mapping(vma)) {
+ up_write(&mm->mmap_sem);
+ goto out;
+ }
+
+ state.domain = m.dom;
+ state.vma = vma;
+ state.va = m.addr;
+ state.err = 0;
+
+ ret = traverse_pages(m.num, sizeof(xen_pfn_t),
+ &pagelist, mmap_batch_fn, &state);
+
+ up_write(&mm->mmap_sem);
+
+ if (state.err > 0) {
+ ret = 0;
+
+ state.user = m.arr;
+ traverse_pages(m.num, sizeof(xen_pfn_t),
+ &pagelist,
+ mmap_return_errors, &state);
+ }
+
+out:
+ free_page_list(&pagelist);
+
+ return ret;
+}
+
+static long privcmd_ioctl(struct file *file,
+ unsigned int cmd, unsigned long data)
+{
+ int ret = -ENOSYS;
+ void __user *udata = (void __user *) data;
+
+ switch (cmd) {
+ case IOCTL_PRIVCMD_HYPERCALL:
+ ret = privcmd_ioctl_hypercall(udata);
+ break;
+
+ case IOCTL_PRIVCMD_MMAP:
+ ret = privcmd_ioctl_mmap(udata);
+ break;
+
+ case IOCTL_PRIVCMD_MMAPBATCH:
+ ret = privcmd_ioctl_mmap_batch(udata);
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
+ vma, vma->vm_start, vma->vm_end,
+ vmf->pgoff, vmf->virtual_address);
+
+ return VM_FAULT_SIGBUS;
+}
+
+static struct vm_operations_struct privcmd_vm_ops = {
+ .fault = privcmd_fault
+};
+
+static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ /* Unsupported for auto-translated guests. */
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return -ENOSYS;
+
+ /* DONTCOPY is essential for Xen as copy_page_range is broken. */
+ vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY;
+ vma->vm_ops = &privcmd_vm_ops;
+ vma->vm_private_data = NULL;
+
+ return 0;
+}
+
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma)
+{
+ return (xchg(&vma->vm_private_data, (void *)1) == NULL);
+}
+#endif
+
+const struct file_operations privcmd_file_ops = {
+ .unlocked_ioctl = privcmd_ioctl,
+ .mmap = privcmd_mmap,
+};
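As a usage illustration (not part of this patch): a hedged userspace sketch of driving IOCTL_PRIVCMD_MMAPBATCH to map a single foreign frame. The structure fields mirror what privcmd_ioctl_mmap_batch() reads above; the header path, the privcmd_mmapbatch_t typedef, and the /proc/xen/privcmd node (provided by the xenfs compatibility mount) are assumptions, and real toolstacks go through libxc wrappers instead.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <xen/sys/privcmd.h>	/* assumed location of the privcmd ioctl ABI */

int map_foreign_page(int domid, unsigned long mfn)
{
	privcmd_mmapbatch_t batch;
	xen_pfn_t frame = mfn;	/* xen_pfn_t comes from the Xen public headers */
	void *addr;
	int fd = open("/proc/xen/privcmd", O_RDWR);

	if (fd < 0)
		return -1;

	/* Reserve a VA range on the privcmd fd; the ioctl fills it in. */
	addr = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (addr == MAP_FAILED) {
		close(fd);
		return -1;
	}

	batch.num  = 1;
	batch.dom  = domid;
	batch.addr = (unsigned long)addr;
	batch.arr  = &frame;

	if (ioctl(fd, IOCTL_PRIVCMD_MMAPBATCH, &batch) < 0) {
		perror("IOCTL_PRIVCMD_MMAPBATCH");
		close(fd);
		return -1;
	}
	/* Per-page failures are reported by ORing 0xf0000000 into the frame. */
	printf("frame after ioctl: %#lx\n", (unsigned long)frame);
	close(fd);
	return 0;
}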
diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
index 6559e0c752ce..f6339d11d59c 100644
--- a/drivers/xen/xenfs/super.c
+++ b/drivers/xen/xenfs/super.c
@@ -12,6 +12,10 @@
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/magic.h>
+#include <linux/mm.h>
+#include <linux/backing-dev.h>
+
+#include <xen/xen.h>
#include "xenfs.h"
@@ -20,6 +24,62 @@
MODULE_DESCRIPTION("Xen filesystem");
MODULE_LICENSE("GPL");
+static int xenfs_set_page_dirty(struct page *page)
+{
+ return !TestSetPageDirty(page);
+}
+
+static const struct address_space_operations xenfs_aops = {
+ .set_page_dirty = xenfs_set_page_dirty,
+};
+
+static struct backing_dev_info xenfs_backing_dev_info = {
+ .ra_pages = 0, /* No readahead */
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
+};
+
+static struct inode *xenfs_make_inode(struct super_block *sb, int mode)
+{
+ struct inode *ret = new_inode(sb);
+
+ if (ret) {
+ ret->i_mode = mode;
+ ret->i_mapping->a_ops = &xenfs_aops;
+ ret->i_mapping->backing_dev_info = &xenfs_backing_dev_info;
+ ret->i_uid = ret->i_gid = 0;
+ ret->i_blocks = 0;
+ ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME;
+ }
+ return ret;
+}
+
+static struct dentry *xenfs_create_file(struct super_block *sb,
+ struct dentry *parent,
+ const char *name,
+ const struct file_operations *fops,
+ void *data,
+ int mode)
+{
+ struct dentry *dentry;
+ struct inode *inode;
+
+ dentry = d_alloc_name(parent, name);
+ if (!dentry)
+ return NULL;
+
+ inode = xenfs_make_inode(sb, S_IFREG | mode);
+ if (!inode) {
+ dput(dentry);
+ return NULL;
+ }
+
+ inode->i_fop = fops;
+ inode->i_private = data;
+
+ d_add(dentry, inode);
+ return dentry;
+}
+
static ssize_t capabilities_read(struct file *file, char __user *buf,
size_t size, loff_t *off)
{
@@ -33,6 +93,7 @@ static ssize_t capabilities_read(struct file *file, char __user *buf,
static const struct file_operations capabilities_file_ops = {
.read = capabilities_read,
+ .llseek = default_llseek,
};
static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
@@ -41,38 +102,65 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
[1] = {},
{ "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR },
{ "capabilities", &capabilities_file_ops, S_IRUGO },
+ { "privcmd", &privcmd_file_ops, S_IRUSR|S_IWUSR },
{""},
};
+ int rc;
+
+ rc = simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files);
+ if (rc < 0)
+ return rc;
+
+ if (xen_initial_domain()) {
+ xenfs_create_file(sb, sb->s_root, "xsd_kva",
+ &xsd_kva_file_ops, NULL, S_IRUSR|S_IWUSR);
+ xenfs_create_file(sb, sb->s_root, "xsd_port",
+ &xsd_port_file_ops, NULL, S_IRUSR|S_IWUSR);
+ }
- return simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files);
+ return rc;
}
-static int xenfs_get_sb(struct file_system_type *fs_type,
+static int xenfs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name,
- void *data, struct vfsmount *mnt)
+ void *data)
{
- return get_sb_single(fs_type, flags, data, xenfs_fill_super, mnt);
+ return mount_single(fs_type, flags, data, xenfs_fill_super);
}
static struct file_system_type xenfs_type = {
.owner = THIS_MODULE,
.name = "xenfs",
- .get_sb = xenfs_get_sb,
+ .mount = xenfs_mount,
.kill_sb = kill_litter_super,
};
static int __init xenfs_init(void)
{
- if (xen_pv_domain())
- return register_filesystem(&xenfs_type);
+ int err;
+ if (!xen_domain()) {
+ printk(KERN_INFO "xenfs: not registering filesystem on non-xen platform\n");
+ return 0;
+ }
+
+ err = register_filesystem(&xenfs_type);
+ if (err) {
+ printk(KERN_ERR "xenfs: Unable to register filesystem!\n");
+ goto out;
+ }
+
+ err = bdi_init(&xenfs_backing_dev_info);
+ if (err)
+ unregister_filesystem(&xenfs_type);
+
+ out:
- printk(KERN_INFO "XENFS: not registering filesystem on non-xen platform\n");
- return 0;
+ return err;
}
static void __exit xenfs_exit(void)
{
- if (xen_pv_domain())
+ if (xen_domain())
unregister_filesystem(&xenfs_type);
}
diff --git a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenfs/xenbus.c
index a9592d981b10..1c1236087f78 100644
--- a/drivers/xen/xenfs/xenbus.c
+++ b/drivers/xen/xenfs/xenbus.c
@@ -43,6 +43,7 @@
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/mutex.h>
+#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
@@ -50,6 +51,7 @@
#include <linux/init.h>
#include <linux/namei.h>
#include <linux/string.h>
+#include <linux/slab.h>
#include "xenfs.h"
#include "../xenbus/xenbus_comms.h"
@@ -122,6 +124,9 @@ static ssize_t xenbus_file_read(struct file *filp,
mutex_lock(&u->reply_mutex);
while (list_empty(&u->read_buffers)) {
mutex_unlock(&u->reply_mutex);
+ if (filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
ret = wait_event_interruptible(u->read_waitq,
!list_empty(&u->read_buffers));
if (ret)
@@ -589,4 +594,5 @@ const struct file_operations xenbus_file_ops = {
.open = xenbus_file_open,
.release = xenbus_file_release,
.poll = xenbus_file_poll,
+ .llseek = no_llseek,
};
diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h
index 51f08b2d0bf1..b68aa6200003 100644
--- a/drivers/xen/xenfs/xenfs.h
+++ b/drivers/xen/xenfs/xenfs.h
@@ -2,5 +2,8 @@
#define _XENFS_XENBUS_H
extern const struct file_operations xenbus_file_ops;
+extern const struct file_operations privcmd_file_ops;
+extern const struct file_operations xsd_kva_file_ops;
+extern const struct file_operations xsd_port_file_ops;
#endif /* _XENFS_XENBUS_H */
diff --git a/drivers/xen/xenfs/xenstored.c b/drivers/xen/xenfs/xenstored.c
new file mode 100644
index 000000000000..fef20dbc6a5c
--- /dev/null
+++ b/drivers/xen/xenfs/xenstored.c
@@ -0,0 +1,68 @@
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+
+#include <xen/page.h>
+
+#include "xenfs.h"
+#include "../xenbus/xenbus_comms.h"
+
+static ssize_t xsd_read(struct file *file, char __user *buf,
+ size_t size, loff_t *off)
+{
+ const char *str = (const char *)file->private_data;
+ return simple_read_from_buffer(buf, size, off, str, strlen(str));
+}
+
+static int xsd_release(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
+
+static int xsd_kva_open(struct inode *inode, struct file *file)
+{
+ file->private_data = (void *)kasprintf(GFP_KERNEL, "0x%p",
+ xen_store_interface);
+ if (!file->private_data)
+ return -ENOMEM;
+ return 0;
+}
+
+static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ size_t size = vma->vm_end - vma->vm_start;
+
+ if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0))
+ return -EINVAL;
+
+ if (remap_pfn_range(vma, vma->vm_start,
+ virt_to_pfn(xen_store_interface),
+ size, vma->vm_page_prot))
+ return -EAGAIN;
+
+ return 0;
+}
+
+const struct file_operations xsd_kva_file_ops = {
+ .open = xsd_kva_open,
+ .mmap = xsd_kva_mmap,
+ .read = xsd_read,
+ .release = xsd_release,
+};
+
+static int xsd_port_open(struct inode *inode, struct file *file)
+{
+ file->private_data = (void *)kasprintf(GFP_KERNEL, "%d",
+ xen_store_evtchn);
+ if (!file->private_data)
+ return -ENOMEM;
+ return 0;
+}
+
+const struct file_operations xsd_port_file_ops = {
+ .open = xsd_port_open,
+ .read = xsd_read,
+ .release = xsd_release,
+};
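Finally, purely as an illustration of how the two new xenfs nodes above are intended to be consumed by a dom0 xenstored (the /proc/xen paths assume the xenfs compatibility mount, and a real xenstored does considerably more than this sketch):

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	char buf[32];
	void *iface;
	int port, n;
	int pfd = open("/proc/xen/xsd_port", O_RDONLY);
	int kfd = open("/proc/xen/xsd_kva", O_RDWR);

	if (pfd < 0 || kfd < 0)
		return 1;

	/* xsd_port reads back the local event-channel port as decimal text. */
	n = read(pfd, buf, sizeof(buf) - 1);
	if (n <= 0)
		return 1;
	buf[n] = '\0';
	port = atoi(buf);

	/* xsd_kva mmaps the single xenstore_domain_interface page (offset 0). */
	iface = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, kfd, 0);
	if (iface == MAP_FAILED)
		return 1;

	printf("xenstore ring at %p, event channel %d\n", iface, port);
	/* ... bind the port and run the xenstore ring protocol ... */
	return 0;
}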