diff options
Diffstat (limited to 'drivers/xen')
-rw-r--r-- | drivers/xen/Kconfig | 14 | ||||
-rw-r--r-- | drivers/xen/Makefile | 4 | ||||
-rw-r--r-- | drivers/xen/balloon.c | 15 | ||||
-rw-r--r-- | drivers/xen/events.c | 96 | ||||
-rw-r--r-- | drivers/xen/grant-table.c | 77 | ||||
-rw-r--r-- | drivers/xen/manage.c | 46 | ||||
-rw-r--r-- | drivers/xen/platform-pci.c | 207 | ||||
-rw-r--r-- | drivers/xen/swiotlb-xen.c | 515 | ||||
-rw-r--r-- | drivers/xen/xenbus/xenbus_client.c | 90 | ||||
-rw-r--r-- | drivers/xen/xenbus/xenbus_probe.c | 52 | ||||
-rw-r--r-- | drivers/xen/xenbus/xenbus_xs.c | 57 | ||||
-rw-r--r-- | drivers/xen/xenfs/super.c | 4 | ||||
-rw-r--r-- | drivers/xen/xenfs/xenbus.c | 3 |
13 files changed, 1095 insertions, 85 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index fad3df2c1276..60d71e9abe9f 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -62,4 +62,18 @@ config XEN_SYS_HYPERVISOR virtual environment, /sys/hypervisor will still be present, but will have no xen contents. +config XEN_PLATFORM_PCI + tristate "xen platform pci device driver" + depends on XEN_PVHVM + default m + help + Driver for the Xen PCI Platform device: it is responsible for + initializing xenbus and grant_table when running in a Xen HVM + domain. As a consequence this driver is required to run any Xen PV + frontend on Xen HVM. + +config SWIOTLB_XEN + def_bool y + depends on SWIOTLB + endmenu diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 7c284342f30f..fcaf838f54be 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -9,4 +9,6 @@ obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += balloon.o obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o obj-$(CONFIG_XENFS) += xenfs/ -obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
\ No newline at end of file +obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o +obj-$(CONFIG_XEN_PLATFORM_PCI) += platform-pci.o +obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 1a0d8c2a0354..500290b150bb 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -85,13 +85,6 @@ static struct sys_device balloon_sysdev; static int register_balloon(struct sys_device *sysdev); -/* - * Protects atomic reservation decrease/increase against concurrent increases. - * Also protects non-atomic updates of current_pages and driver_pages, and - * balloon lists. - */ -static DEFINE_SPINLOCK(balloon_lock); - static struct balloon_stats balloon_stats; /* We increase/decrease in batches which fit in a page */ @@ -210,7 +203,7 @@ static int increase_reservation(unsigned long nr_pages) if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); - spin_lock_irqsave(&balloon_lock, flags); + spin_lock_irqsave(&xen_reservation_lock, flags); page = balloon_first_page(); for (i = 0; i < nr_pages; i++) { @@ -254,7 +247,7 @@ static int increase_reservation(unsigned long nr_pages) balloon_stats.current_pages += rc; out: - spin_unlock_irqrestore(&balloon_lock, flags); + spin_unlock_irqrestore(&xen_reservation_lock, flags); return rc < 0 ? rc : rc != nr_pages; } @@ -299,7 +292,7 @@ static int decrease_reservation(unsigned long nr_pages) kmap_flush_unused(); flush_tlb_all(); - spin_lock_irqsave(&balloon_lock, flags); + spin_lock_irqsave(&xen_reservation_lock, flags); /* No more mappings: invalidate P2M and add to balloon. */ for (i = 0; i < nr_pages; i++) { @@ -315,7 +308,7 @@ static int decrease_reservation(unsigned long nr_pages) balloon_stats.current_pages -= nr_pages; - spin_unlock_irqrestore(&balloon_lock, flags); + spin_unlock_irqrestore(&xen_reservation_lock, flags); return need_sleep; } diff --git a/drivers/xen/events.c b/drivers/xen/events.c index db8f506817f0..72f91bff29c7 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -29,6 +29,7 @@ #include <linux/bootmem.h> #include <linux/slab.h> +#include <asm/desc.h> #include <asm/ptrace.h> #include <asm/irq.h> #include <asm/idle.h> @@ -36,10 +37,14 @@ #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> +#include <xen/xen.h> +#include <xen/hvm.h> #include <xen/xen-ops.h> #include <xen/events.h> #include <xen/interface/xen.h> #include <xen/interface/event_channel.h> +#include <xen/interface/hvm/hvm_op.h> +#include <xen/interface/hvm/params.h> /* * This lock protects updates to the following mapping and reference-count @@ -335,9 +340,18 @@ static int find_unbound_irq(void) int irq; struct irq_desc *desc; - for (irq = 0; irq < nr_irqs; irq++) + for (irq = 0; irq < nr_irqs; irq++) { + desc = irq_to_desc(irq); + /* only 0->15 have init'd desc; handle irq > 16 */ + if (desc == NULL) + break; + if (desc->chip == &no_irq_chip) + break; + if (desc->chip != &xen_dynamic_chip) + continue; if (irq_info[irq].type == IRQT_UNBOUND) break; + } if (irq == nr_irqs) panic("No available IRQ to bind to: increase nr_irqs!\n"); @@ -346,7 +360,7 @@ static int find_unbound_irq(void) if (WARN_ON(desc == NULL)) return -1; - dynamic_irq_init(irq); + dynamic_irq_init_keep_chip_data(irq); return irq; } @@ -536,6 +550,7 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, if (irq < 0) return irq; + irqflags |= IRQF_NO_SUSPEND; retval = request_irq(irq, handler, irqflags, devname, dev_id); if (retval != 0) { unbind_from_irq(irq); @@ -617,17 +632,13 @@ static DEFINE_PER_CPU(unsigned, xed_nesting_count); * a bitset of words which contain pending event bits. The second * level is a bitset of pending events themselves. */ -void xen_evtchn_do_upcall(struct pt_regs *regs) +static void __xen_evtchn_do_upcall(void) { int cpu = get_cpu(); - struct pt_regs *old_regs = set_irq_regs(regs); struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); unsigned count; - exit_idle(); - irq_enter(); - do { unsigned long pending_words; @@ -664,14 +675,31 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) count = __get_cpu_var(xed_nesting_count); __get_cpu_var(xed_nesting_count) = 0; - } while(count != 1); + } while (count != 1 || vcpu_info->evtchn_upcall_pending); out: + + put_cpu(); +} + +void xen_evtchn_do_upcall(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + exit_idle(); + irq_enter(); + + __xen_evtchn_do_upcall(); + irq_exit(); set_irq_regs(old_regs); +} - put_cpu(); +void xen_hvm_evtchn_do_upcall(void) +{ + __xen_evtchn_do_upcall(); } +EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall); /* Rebind a new event channel to an existing irq. */ void rebind_evtchn_irq(int evtchn, int irq) @@ -708,7 +736,10 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) struct evtchn_bind_vcpu bind_vcpu; int evtchn = evtchn_from_irq(irq); - if (!VALID_EVTCHN(evtchn)) + /* events delivered via platform PCI interrupts are always + * routed to vcpu 0 */ + if (!VALID_EVTCHN(evtchn) || + (xen_hvm_domain() && !xen_have_vector_callback)) return -1; /* Send future instances of this interrupt to other vcpu. */ @@ -933,6 +964,44 @@ static struct irq_chip xen_dynamic_chip __read_mostly = { .retrigger = retrigger_dynirq, }; +int xen_set_callback_via(uint64_t via) +{ + struct xen_hvm_param a; + a.domid = DOMID_SELF; + a.index = HVM_PARAM_CALLBACK_IRQ; + a.value = via; + return HYPERVISOR_hvm_op(HVMOP_set_param, &a); +} +EXPORT_SYMBOL_GPL(xen_set_callback_via); + +#ifdef CONFIG_XEN_PVHVM +/* Vector callbacks are better than PCI interrupts to receive event + * channel notifications because we can receive vector callbacks on any + * vcpu and we don't need PCI support or APIC interactions. */ +void xen_callback_vector(void) +{ + int rc; + uint64_t callback_via; + if (xen_have_vector_callback) { + callback_via = HVM_CALLBACK_VECTOR(XEN_HVM_EVTCHN_CALLBACK); + rc = xen_set_callback_via(callback_via); + if (rc) { + printk(KERN_ERR "Request for Xen HVM callback vector" + " failed.\n"); + xen_have_vector_callback = 0; + return; + } + printk(KERN_INFO "Xen HVM callback vector for event delivery is " + "enabled\n"); + /* in the restore case the vector has already been allocated */ + if (!test_bit(XEN_HVM_EVTCHN_CALLBACK, used_vectors)) + alloc_intr_gate(XEN_HVM_EVTCHN_CALLBACK, xen_hvm_callback_vector); + } +} +#else +void xen_callback_vector(void) {} +#endif + void __init xen_init_IRQ(void) { int i; @@ -947,5 +1016,10 @@ void __init xen_init_IRQ(void) for (i = 0; i < NR_EVENT_CHANNELS; i++) mask_evtchn(i); - irq_ctx_init(smp_processor_id()); + if (xen_hvm_domain()) { + xen_callback_vector(); + native_init_IRQ(); + } else { + irq_ctx_init(smp_processor_id()); + } } diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index f66db3b91d61..6c4531816496 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -37,11 +37,13 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> +#include <linux/io.h> #include <xen/xen.h> #include <xen/interface/xen.h> #include <xen/page.h> #include <xen/grant_table.h> +#include <xen/interface/memory.h> #include <asm/xen/hypercall.h> #include <asm/pgtable.h> @@ -59,6 +61,8 @@ static unsigned int boot_max_nr_grant_frames; static int gnttab_free_count; static grant_ref_t gnttab_free_head; static DEFINE_SPINLOCK(gnttab_list_lock); +unsigned long xen_hvm_resume_frames; +EXPORT_SYMBOL_GPL(xen_hvm_resume_frames); static struct grant_entry *shared; @@ -433,7 +437,7 @@ static unsigned int __max_nr_grant_frames(void) return query.max_nr_frames; } -static inline unsigned int max_nr_grant_frames(void) +unsigned int gnttab_max_grant_frames(void) { unsigned int xen_max = __max_nr_grant_frames(); @@ -441,6 +445,7 @@ static inline unsigned int max_nr_grant_frames(void) return boot_max_nr_grant_frames; return xen_max; } +EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); static int gnttab_map(unsigned int start_idx, unsigned int end_idx) { @@ -449,6 +454,30 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) unsigned int nr_gframes = end_idx + 1; int rc; + if (xen_hvm_domain()) { + struct xen_add_to_physmap xatp; + unsigned int i = end_idx; + rc = 0; + /* + * Loop backwards, so that the first hypercall has the largest + * index, ensuring that the table will grow only once. + */ + do { + xatp.domid = DOMID_SELF; + xatp.idx = i; + xatp.space = XENMAPSPACE_grant_table; + xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i; + rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); + if (rc != 0) { + printk(KERN_WARNING + "grant table add_to_physmap failed, err=%d\n", rc); + break; + } + } while (i-- > start_idx); + + return rc; + } + frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC); if (!frames) return -ENOMEM; @@ -465,7 +494,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) BUG_ON(rc || setup.status); - rc = arch_gnttab_map_shared(frames, nr_gframes, max_nr_grant_frames(), + rc = arch_gnttab_map_shared(frames, nr_gframes, gnttab_max_grant_frames(), &shared); BUG_ON(rc); @@ -476,9 +505,27 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) int gnttab_resume(void) { - if (max_nr_grant_frames() < nr_grant_frames) + unsigned int max_nr_gframes; + + max_nr_gframes = gnttab_max_grant_frames(); + if (max_nr_gframes < nr_grant_frames) return -ENOSYS; - return gnttab_map(0, nr_grant_frames - 1); + + if (xen_pv_domain()) + return gnttab_map(0, nr_grant_frames - 1); + + if (!shared) { + shared = ioremap(xen_hvm_resume_frames, PAGE_SIZE * max_nr_gframes); + if (shared == NULL) { + printk(KERN_WARNING + "Failed to ioremap gnttab share frames!"); + return -ENOMEM; + } + } + + gnttab_map(0, nr_grant_frames - 1); + + return 0; } int gnttab_suspend(void) @@ -495,7 +542,7 @@ static int gnttab_expand(unsigned int req_entries) cur = nr_grant_frames; extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) / GREFS_PER_GRANT_FRAME); - if (cur + extra > max_nr_grant_frames()) + if (cur + extra > gnttab_max_grant_frames()) return -ENOSPC; rc = gnttab_map(cur, cur + extra - 1); @@ -505,15 +552,12 @@ static int gnttab_expand(unsigned int req_entries) return rc; } -static int __devinit gnttab_init(void) +int gnttab_init(void) { int i; unsigned int max_nr_glist_frames, nr_glist_frames; unsigned int nr_init_grefs; - if (!xen_domain()) - return -ENODEV; - nr_grant_frames = 1; boot_max_nr_grant_frames = __max_nr_grant_frames(); @@ -556,5 +600,18 @@ static int __devinit gnttab_init(void) kfree(gnttab_list); return -ENOMEM; } +EXPORT_SYMBOL_GPL(gnttab_init); + +static int __devinit __gnttab_init(void) +{ + /* Delay grant-table initialization in the PV on HVM case */ + if (xen_hvm_domain()) + return 0; + + if (!xen_pv_domain()) + return -ENODEV; + + return gnttab_init(); +} -core_initcall(gnttab_init); +core_initcall(__gnttab_init); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 07e857b0de13..1799bd890315 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -9,6 +9,7 @@ #include <linux/stop_machine.h> #include <linux/freezer.h> +#include <xen/xen.h> #include <xen/xenbus.h> #include <xen/grant_table.h> #include <xen/events.h> @@ -17,6 +18,7 @@ #include <asm/xen/hypercall.h> #include <asm/xen/page.h> +#include <asm/xen/hypervisor.h> enum shutdown_state { SHUTDOWN_INVALID = -1, @@ -33,10 +35,30 @@ enum shutdown_state { static enum shutdown_state shutting_down = SHUTDOWN_INVALID; #ifdef CONFIG_PM_SLEEP -static int xen_suspend(void *data) +static int xen_hvm_suspend(void *data) { + struct sched_shutdown r = { .reason = SHUTDOWN_suspend }; int *cancelled = data; + + BUG_ON(!irqs_disabled()); + + *cancelled = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r); + + xen_hvm_post_suspend(*cancelled); + gnttab_resume(); + + if (!*cancelled) { + xen_irq_resume(); + xen_timer_resume(); + } + + return 0; +} + +static int xen_suspend(void *data) +{ int err; + int *cancelled = data; BUG_ON(!irqs_disabled()); @@ -106,7 +128,10 @@ static void do_suspend(void) goto out_resume; } - err = stop_machine(xen_suspend, &cancelled, cpumask_of(0)); + if (xen_hvm_domain()) + err = stop_machine(xen_hvm_suspend, &cancelled, cpumask_of(0)); + else + err = stop_machine(xen_suspend, &cancelled, cpumask_of(0)); dpm_resume_noirq(PMSG_RESUME); @@ -255,7 +280,19 @@ static int shutdown_event(struct notifier_block *notifier, return NOTIFY_DONE; } -static int __init setup_shutdown_event(void) +static int __init __setup_shutdown_event(void) +{ + /* Delay initialization in the PV on HVM case */ + if (xen_hvm_domain()) + return 0; + + if (!xen_pv_domain()) + return -ENODEV; + + return xen_setup_shutdown_event(); +} + +int xen_setup_shutdown_event(void) { static struct notifier_block xenstore_notifier = { .notifier_call = shutdown_event @@ -264,5 +301,6 @@ static int __init setup_shutdown_event(void) return 0; } +EXPORT_SYMBOL_GPL(xen_setup_shutdown_event); -subsys_initcall(setup_shutdown_event); +subsys_initcall(__setup_shutdown_event); diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c new file mode 100644 index 000000000000..c01b5ddce529 --- /dev/null +++ b/drivers/xen/platform-pci.c @@ -0,0 +1,207 @@ +/****************************************************************************** + * platform-pci.c + * + * Xen platform PCI device driver + * Copyright (c) 2005, Intel Corporation. + * Copyright (c) 2007, XenSource Inc. + * Copyright (c) 2010, Citrix + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ + + +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/pci.h> + +#include <xen/platform_pci.h> +#include <xen/grant_table.h> +#include <xen/xenbus.h> +#include <xen/events.h> +#include <xen/hvm.h> +#include <xen/xen-ops.h> + +#define DRV_NAME "xen-platform-pci" + +MODULE_AUTHOR("ssmith@xensource.com and stefano.stabellini@eu.citrix.com"); +MODULE_DESCRIPTION("Xen platform PCI device"); +MODULE_LICENSE("GPL"); + +static unsigned long platform_mmio; +static unsigned long platform_mmio_alloc; +static unsigned long platform_mmiolen; +static uint64_t callback_via; + +unsigned long alloc_xen_mmio(unsigned long len) +{ + unsigned long addr; + + addr = platform_mmio + platform_mmio_alloc; + platform_mmio_alloc += len; + BUG_ON(platform_mmio_alloc > platform_mmiolen); + + return addr; +} + +static uint64_t get_callback_via(struct pci_dev *pdev) +{ + u8 pin; + int irq; + + irq = pdev->irq; + if (irq < 16) + return irq; /* ISA IRQ */ + + pin = pdev->pin; + + /* We don't know the GSI. Specify the PCI INTx line instead. */ + return ((uint64_t)0x01 << 56) | /* PCI INTx identifier */ + ((uint64_t)pci_domain_nr(pdev->bus) << 32) | + ((uint64_t)pdev->bus->number << 16) | + ((uint64_t)(pdev->devfn & 0xff) << 8) | + ((uint64_t)(pin - 1) & 3); +} + +static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id) +{ + xen_hvm_evtchn_do_upcall(); + return IRQ_HANDLED; +} + +static int xen_allocate_irq(struct pci_dev *pdev) +{ + return request_irq(pdev->irq, do_hvm_evtchn_intr, + IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING, + "xen-platform-pci", pdev); +} + +static int platform_pci_resume(struct pci_dev *pdev) +{ + int err; + if (xen_have_vector_callback) + return 0; + err = xen_set_callback_via(callback_via); + if (err) { + dev_err(&pdev->dev, "platform_pci_resume failure!\n"); + return err; + } + return 0; +} + +static int __devinit platform_pci_init(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + int i, ret; + long ioaddr, iolen; + long mmio_addr, mmio_len; + unsigned int max_nr_gframes; + + i = pci_enable_device(pdev); + if (i) + return i; + + ioaddr = pci_resource_start(pdev, 0); + iolen = pci_resource_len(pdev, 0); + + mmio_addr = pci_resource_start(pdev, 1); + mmio_len = pci_resource_len(pdev, 1); + + if (mmio_addr == 0 || ioaddr == 0) { + dev_err(&pdev->dev, "no resources found\n"); + ret = -ENOENT; + goto pci_out; + } + + if (request_mem_region(mmio_addr, mmio_len, DRV_NAME) == NULL) { + dev_err(&pdev->dev, "MEM I/O resource 0x%lx @ 0x%lx busy\n", + mmio_addr, mmio_len); + ret = -EBUSY; + goto pci_out; + } + + if (request_region(ioaddr, iolen, DRV_NAME) == NULL) { + dev_err(&pdev->dev, "I/O resource 0x%lx @ 0x%lx busy\n", + iolen, ioaddr); + ret = -EBUSY; + goto mem_out; + } + + platform_mmio = mmio_addr; + platform_mmiolen = mmio_len; + + if (!xen_have_vector_callback) { + ret = xen_allocate_irq(pdev); + if (ret) { + dev_warn(&pdev->dev, "request_irq failed err=%d\n", ret); + goto out; + } + callback_via = get_callback_via(pdev); + ret = xen_set_callback_via(callback_via); + if (ret) { + dev_warn(&pdev->dev, "Unable to set the evtchn callback " + "err=%d\n", ret); + goto out; + } + } + + max_nr_gframes = gnttab_max_grant_frames(); + xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); + ret = gnttab_init(); + if (ret) + goto out; + xenbus_probe(NULL); + ret = xen_setup_shutdown_event(); + if (ret) + goto out; + return 0; + +out: + release_region(ioaddr, iolen); +mem_out: + release_mem_region(mmio_addr, mmio_len); +pci_out: + pci_disable_device(pdev); + return ret; +} + +static struct pci_device_id platform_pci_tbl[] __devinitdata = { + {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + {0,} +}; + +MODULE_DEVICE_TABLE(pci, platform_pci_tbl); + +static struct pci_driver platform_driver = { + .name = DRV_NAME, + .probe = platform_pci_init, + .id_table = platform_pci_tbl, +#ifdef CONFIG_PM + .resume_early = platform_pci_resume, +#endif +}; + +static int __init platform_pci_module_init(void) +{ + /* no unplug has been done, IGNORE hasn't been specified: just + * return now */ + if (!xen_platform_pci_unplug) + return -ENODEV; + + return pci_register_driver(&platform_driver); +} + +module_init(platform_pci_module_init); diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c new file mode 100644 index 000000000000..54469c3eeacd --- /dev/null +++ b/drivers/xen/swiotlb-xen.c @@ -0,0 +1,515 @@ +/* + * Copyright 2010 + * by Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> + * + * This code provides a IOMMU for Xen PV guests with PCI passthrough. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License v2.0 as published by + * the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * PV guests under Xen are running in an non-contiguous memory architecture. + * + * When PCI pass-through is utilized, this necessitates an IOMMU for + * translating bus (DMA) to virtual and vice-versa and also providing a + * mechanism to have contiguous pages for device drivers operations (say DMA + * operations). + * + * Specifically, under Xen the Linux idea of pages is an illusion. It + * assumes that pages start at zero and go up to the available memory. To + * help with that, the Linux Xen MMU provides a lookup mechanism to + * translate the page frame numbers (PFN) to machine frame numbers (MFN) + * and vice-versa. The MFN are the "real" frame numbers. Furthermore + * memory is not contiguous. Xen hypervisor stitches memory for guests + * from different pools, which means there is no guarantee that PFN==MFN + * and PFN+1==MFN+1. Lastly with Xen 4.0, pages (in debug mode) are + * allocated in descending order (high to low), meaning the guest might + * never get any MFN's under the 4GB mark. + * + */ + +#include <linux/bootmem.h> +#include <linux/dma-mapping.h> +#include <xen/swiotlb-xen.h> +#include <xen/page.h> +#include <xen/xen-ops.h> +/* + * Used to do a quick range check in swiotlb_tbl_unmap_single and + * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this + * API. + */ + +static char *xen_io_tlb_start, *xen_io_tlb_end; +static unsigned long xen_io_tlb_nslabs; +/* + * Quick lookup value of the bus address of the IOTLB. + */ + +u64 start_dma_addr; + +static dma_addr_t xen_phys_to_bus(phys_addr_t paddr) +{ + return phys_to_machine(XPADDR(paddr)).maddr;; +} + +static phys_addr_t xen_bus_to_phys(dma_addr_t baddr) +{ + return machine_to_phys(XMADDR(baddr)).paddr; +} + +static dma_addr_t xen_virt_to_bus(void *address) +{ + return xen_phys_to_bus(virt_to_phys(address)); +} + +static int check_pages_physically_contiguous(unsigned long pfn, + unsigned int offset, + size_t length) +{ + unsigned long next_mfn; + int i; + int nr_pages; + + next_mfn = pfn_to_mfn(pfn); + nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT; + + for (i = 1; i < nr_pages; i++) { + if (pfn_to_mfn(++pfn) != ++next_mfn) + return 0; + } + return 1; +} + +static int range_straddles_page_boundary(phys_addr_t p, size_t size) +{ + unsigned long pfn = PFN_DOWN(p); + unsigned int offset = p & ~PAGE_MASK; + + if (offset + size <= PAGE_SIZE) + return 0; + if (check_pages_physically_contiguous(pfn, offset, size)) + return 0; + return 1; +} + +static int is_xen_swiotlb_buffer(dma_addr_t dma_addr) +{ + unsigned long mfn = PFN_DOWN(dma_addr); + unsigned long pfn = mfn_to_local_pfn(mfn); + phys_addr_t paddr; + + /* If the address is outside our domain, it CAN + * have the same virtual address as another address + * in our domain. Therefore _only_ check address within our domain. + */ + if (pfn_valid(pfn)) { + paddr = PFN_PHYS(pfn); + return paddr >= virt_to_phys(xen_io_tlb_start) && + paddr < virt_to_phys(xen_io_tlb_end); + } + return 0; +} + +static int max_dma_bits = 32; + +static int +xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) +{ + int i, rc; + int dma_bits; + + dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; + + i = 0; + do { + int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE); + + do { + rc = xen_create_contiguous_region( + (unsigned long)buf + (i << IO_TLB_SHIFT), + get_order(slabs << IO_TLB_SHIFT), + dma_bits); + } while (rc && dma_bits++ < max_dma_bits); + if (rc) + return rc; + + i += slabs; + } while (i < nslabs); + return 0; +} + +void __init xen_swiotlb_init(int verbose) +{ + unsigned long bytes; + int rc; + + xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT); + xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE); + + bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; + + /* + * Get IO TLB memory from any location. + */ + xen_io_tlb_start = alloc_bootmem(bytes); + if (!xen_io_tlb_start) + panic("Cannot allocate SWIOTLB buffer"); + + xen_io_tlb_end = xen_io_tlb_start + bytes; + /* + * And replace that memory with pages under 4GB. + */ + rc = xen_swiotlb_fixup(xen_io_tlb_start, + bytes, + xen_io_tlb_nslabs); + if (rc) + goto error; + + start_dma_addr = xen_virt_to_bus(xen_io_tlb_start); + swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose); + + return; +error: + panic("DMA(%d): Failed to exchange pages allocated for DMA with Xen! "\ + "We either don't have the permission or you do not have enough"\ + "free memory under 4GB!\n", rc); +} + +void * +xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags) +{ + void *ret; + int order = get_order(size); + u64 dma_mask = DMA_BIT_MASK(32); + unsigned long vstart; + + /* + * Ignore region specifiers - the kernel's ideas of + * pseudo-phys memory layout has nothing to do with the + * machine physical layout. We can't allocate highmem + * because we can't return a pointer to it. + */ + flags &= ~(__GFP_DMA | __GFP_HIGHMEM); + + if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret)) + return ret; + + vstart = __get_free_pages(flags, order); + ret = (void *)vstart; + + if (hwdev && hwdev->coherent_dma_mask) + dma_mask = dma_alloc_coherent_mask(hwdev, flags); + + if (ret) { + if (xen_create_contiguous_region(vstart, order, + fls64(dma_mask)) != 0) { + free_pages(vstart, order); + return NULL; + } + memset(ret, 0, size); + *dma_handle = virt_to_machine(ret).maddr; + } + return ret; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_alloc_coherent); + +void +xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, + dma_addr_t dev_addr) +{ + int order = get_order(size); + + if (dma_release_from_coherent(hwdev, order, vaddr)) + return; + + xen_destroy_contiguous_region((unsigned long)vaddr, order); + free_pages((unsigned long)vaddr, order); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent); + + +/* + * Map a single buffer of the indicated size for DMA in streaming mode. The + * physical address to use is returned. + * + * Once the device is given the dma address, the device owns this memory until + * either xen_swiotlb_unmap_page or xen_swiotlb_dma_sync_single is performed. + */ +dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + phys_addr_t phys = page_to_phys(page) + offset; + dma_addr_t dev_addr = xen_phys_to_bus(phys); + void *map; + + BUG_ON(dir == DMA_NONE); + /* + * If the address happens to be in the device's DMA window, + * we can safely return the device addr and not worry about bounce + * buffering it. + */ + if (dma_capable(dev, dev_addr, size) && + !range_straddles_page_boundary(phys, size) && !swiotlb_force) + return dev_addr; + + /* + * Oh well, have to allocate and map a bounce buffer. + */ + map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir); + if (!map) + return DMA_ERROR_CODE; + + dev_addr = xen_virt_to_bus(map); + + /* + * Ensure that the address returned is DMA'ble + */ + if (!dma_capable(dev, dev_addr, size)) + panic("map_single: bounce buffer is not DMA'ble"); + + return dev_addr; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_map_page); + +/* + * Unmap a single streaming mode DMA translation. The dma_addr and size must + * match what was provided for in a previous xen_swiotlb_map_page call. All + * other usages are undefined. + * + * After this call, reads by the cpu to the buffer are guaranteed to see + * whatever the device wrote there. + */ +static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir) +{ + phys_addr_t paddr = xen_bus_to_phys(dev_addr); + + BUG_ON(dir == DMA_NONE); + + /* NOTE: We use dev_addr here, not paddr! */ + if (is_xen_swiotlb_buffer(dev_addr)) { + swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir); + return; + } + + if (dir != DMA_FROM_DEVICE) + return; + + /* + * phys_to_virt doesn't work with hihgmem page but we could + * call dma_mark_clean() with hihgmem page here. However, we + * are fine since dma_mark_clean() is null on POWERPC. We can + * make dma_mark_clean() take a physical address if necessary. + */ + dma_mark_clean(phys_to_virt(paddr), size); +} + +void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + xen_unmap_single(hwdev, dev_addr, size, dir); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page); + +/* + * Make physical memory consistent for a single streaming mode DMA translation + * after a transfer. + * + * If you perform a xen_swiotlb_map_page() but wish to interrogate the buffer + * using the cpu, yet do not wish to teardown the dma mapping, you must + * call this function before doing so. At the next point you give the dma + * address back to the card, you must first perform a + * xen_swiotlb_dma_sync_for_device, and then the device again owns the buffer + */ +static void +xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + enum dma_sync_target target) +{ + phys_addr_t paddr = xen_bus_to_phys(dev_addr); + + BUG_ON(dir == DMA_NONE); + + /* NOTE: We use dev_addr here, not paddr! */ + if (is_xen_swiotlb_buffer(dev_addr)) { + swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir, + target); + return; + } + + if (dir != DMA_FROM_DEVICE) + return; + + dma_mark_clean(phys_to_virt(paddr), size); +} + +void +xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir) +{ + xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_for_cpu); + +void +xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir) +{ + xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_for_device); + +/* + * Map a set of buffers described by scatterlist in streaming mode for DMA. + * This is the scatter-gather version of the above xen_swiotlb_map_page + * interface. Here the scatter gather list elements are each tagged with the + * appropriate dma address and length. They are obtained via + * sg_dma_{address,length}(SG). + * + * NOTE: An implementation may be able to use a smaller number of + * DMA address/length pairs than there are SG table elements. + * (for example via virtual mapping capabilities) + * The routine returns the number of addr/length pairs actually + * used, at most nents. + * + * Device ownership issues as mentioned above for xen_swiotlb_map_page are the + * same here. + */ +int +xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct scatterlist *sg; + int i; + + BUG_ON(dir == DMA_NONE); + + for_each_sg(sgl, sg, nelems, i) { + phys_addr_t paddr = sg_phys(sg); + dma_addr_t dev_addr = xen_phys_to_bus(paddr); + + if (swiotlb_force || + !dma_capable(hwdev, dev_addr, sg->length) || + range_straddles_page_boundary(paddr, sg->length)) { + void *map = swiotlb_tbl_map_single(hwdev, + start_dma_addr, + sg_phys(sg), + sg->length, dir); + if (!map) { + /* Don't panic here, we expect map_sg users + to do proper error handling. */ + xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, + attrs); + sgl[0].dma_length = 0; + return DMA_ERROR_CODE; + } + sg->dma_address = xen_virt_to_bus(map); + } else + sg->dma_address = dev_addr; + sg->dma_length = sg->length; + } + return nelems; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg_attrs); + +int +xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, + enum dma_data_direction dir) +{ + return xen_swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg); + +/* + * Unmap a set of streaming mode DMA translations. Again, cpu read rules + * concerning calls here are the same as for swiotlb_unmap_page() above. + */ +void +xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct scatterlist *sg; + int i; + + BUG_ON(dir == DMA_NONE); + + for_each_sg(sgl, sg, nelems, i) + xen_unmap_single(hwdev, sg->dma_address, sg->dma_length, dir); + +} +EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs); + +void +xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, + enum dma_data_direction dir) +{ + return xen_swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg); + +/* + * Make physical memory consistent for a set of streaming mode DMA translations + * after a transfer. + * + * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules + * and usage. + */ +static void +xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir, + enum dma_sync_target target) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nelems, i) + xen_swiotlb_sync_single(hwdev, sg->dma_address, + sg->dma_length, dir, target); +} + +void +xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_sync_sg_for_cpu); + +void +xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_sync_sg_for_device); + +int +xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) +{ + return !dma_addr; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_dma_mapping_error); + +/* + * Return whether the given device DMA address mask can be supported + * properly. For example, if your device can only drive the low 24-bits + * during bus mastering, then you would pass 0x00ffffff as the mask to + * this function. + */ +int +xen_swiotlb_dma_supported(struct device *hwdev, u64 mask) +{ + return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_dma_supported); diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 7b3e973a1aee..7e49527189b6 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -133,17 +133,12 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev, } EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt); +static void xenbus_switch_fatal(struct xenbus_device *, int, int, + const char *, ...); -/** - * xenbus_switch_state - * @dev: xenbus device - * @state: new state - * - * Advertise in the store a change of the given driver to the given new_state. - * Return 0 on success, or -errno on error. On error, the device will switch - * to XenbusStateClosing, and the error will be saved in the store. - */ -int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state) +static int +__xenbus_switch_state(struct xenbus_device *dev, + enum xenbus_state state, int depth) { /* We check whether the state is currently set to the given value, and if not, then the state is set. We don't want to unconditionally @@ -152,35 +147,65 @@ int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state) to it, as the device will be tearing down, and we don't want to resurrect that directory. - Note that, because of this cached value of our state, this function - will not work inside a Xenstore transaction (something it was - trying to in the past) because dev->state would not get reset if - the transaction was aborted. - + Note that, because of this cached value of our state, this + function will not take a caller's Xenstore transaction + (something it was trying to in the past) because dev->state + would not get reset if the transaction was aborted. */ + struct xenbus_transaction xbt; int current_state; - int err; + int err, abort; if (state == dev->state) return 0; - err = xenbus_scanf(XBT_NIL, dev->nodename, "state", "%d", - ¤t_state); - if (err != 1) +again: + abort = 1; + + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_switch_fatal(dev, depth, err, "starting transaction"); return 0; + } + + err = xenbus_scanf(xbt, dev->nodename, "state", "%d", ¤t_state); + if (err != 1) + goto abort; - err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%d", state); + err = xenbus_printf(xbt, dev->nodename, "state", "%d", state); if (err) { - if (state != XenbusStateClosing) /* Avoid looping */ - xenbus_dev_fatal(dev, err, "writing new state"); - return err; + xenbus_switch_fatal(dev, depth, err, "writing new state"); + goto abort; } - dev->state = state; + abort = 0; +abort: + err = xenbus_transaction_end(xbt, abort); + if (err) { + if (err == -EAGAIN && !abort) + goto again; + xenbus_switch_fatal(dev, depth, err, "ending transaction"); + } else + dev->state = state; return 0; } + +/** + * xenbus_switch_state + * @dev: xenbus device + * @state: new state + * + * Advertise in the store a change of the given driver to the given new_state. + * Return 0 on success, or -errno on error. On error, the device will switch + * to XenbusStateClosing, and the error will be saved in the store. + */ +int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state) +{ + return __xenbus_switch_state(dev, state, 0); +} + EXPORT_SYMBOL_GPL(xenbus_switch_state); int xenbus_frontend_closed(struct xenbus_device *dev) @@ -284,6 +309,23 @@ void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...) EXPORT_SYMBOL_GPL(xenbus_dev_fatal); /** + * Equivalent to xenbus_dev_fatal(dev, err, fmt, args), but helps + * avoiding recursion within xenbus_switch_state. + */ +static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err, + const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + xenbus_va_dev_error(dev, err, fmt, ap); + va_end(ap); + + if (!depth) + __xenbus_switch_state(dev, XenbusStateClosing, 1); +} + +/** * xenbus_grant_ring * @dev: xenbus device * @ring_mfn: mfn of ring to grant diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 3479332113e9..29bac5118877 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -56,6 +56,9 @@ #include <xen/events.h> #include <xen/page.h> +#include <xen/platform_pci.h> +#include <xen/hvm.h> + #include "xenbus_comms.h" #include "xenbus_probe.h" @@ -752,10 +755,7 @@ int register_xenstore_notifier(struct notifier_block *nb) { int ret = 0; - if (xenstored_ready > 0) - ret = nb->notifier_call(nb, 0, NULL); - else - blocking_notifier_chain_register(&xenstore_chain, nb); + blocking_notifier_chain_register(&xenstore_chain, nb); return ret; } @@ -779,8 +779,23 @@ void xenbus_probe(struct work_struct *unused) /* Notify others that xenstore is up */ blocking_notifier_call_chain(&xenstore_chain, 0, NULL); } +EXPORT_SYMBOL_GPL(xenbus_probe); + +static int __init xenbus_probe_initcall(void) +{ + if (!xen_domain()) + return -ENODEV; + + if (xen_initial_domain() || xen_hvm_domain()) + return 0; + + xenbus_probe(NULL); + return 0; +} + +device_initcall(xenbus_probe_initcall); -static int __init xenbus_probe_init(void) +static int __init xenbus_init(void) { int err = 0; @@ -805,11 +820,24 @@ static int __init xenbus_probe_init(void) if (xen_initial_domain()) { /* dom0 not yet supported */ } else { + if (xen_hvm_domain()) { + uint64_t v = 0; + err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); + if (err) + goto out_error; + xen_store_evtchn = (int)v; + err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v); + if (err) + goto out_error; + xen_store_mfn = (unsigned long)v; + xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE); + } else { + xen_store_evtchn = xen_start_info->store_evtchn; + xen_store_mfn = xen_start_info->store_mfn; + xen_store_interface = mfn_to_virt(xen_store_mfn); + } xenstored_ready = 1; - xen_store_evtchn = xen_start_info->store_evtchn; - xen_store_mfn = xen_start_info->store_mfn; } - xen_store_interface = mfn_to_virt(xen_store_mfn); /* Initialize the interface to xenstore. */ err = xs_init(); @@ -819,9 +847,6 @@ static int __init xenbus_probe_init(void) goto out_unreg_back; } - if (!xen_initial_domain()) - xenbus_probe(NULL); - #ifdef CONFIG_XEN_COMPAT_XENFS /* * Create xenfs mountpoint in /proc for compatibility with @@ -842,7 +867,7 @@ static int __init xenbus_probe_init(void) return err; } -postcore_initcall(xenbus_probe_init); +postcore_initcall(xenbus_init); MODULE_LICENSE("GPL"); @@ -950,6 +975,9 @@ static void wait_for_devices(struct xenbus_driver *xendrv) #ifndef MODULE static int __init boot_wait_for_devices(void) { + if (xen_hvm_domain() && !xen_platform_pci_unplug) + return -ENODEV; + ready_to_wait_for_devices = 1; wait_for_devices(NULL); return 0; diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 7b547f53f65e..5534690075af 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -76,6 +76,14 @@ struct xs_handle { /* * Mutex ordering: transaction_mutex -> watch_mutex -> request_mutex. * response_mutex is never taken simultaneously with the other three. + * + * transaction_mutex must be held before incrementing + * transaction_count. The mutex is held when a suspend is in + * progress to prevent new transactions starting. + * + * When decrementing transaction_count to zero the wait queue + * should be woken up, the suspend code waits for count to + * reach zero. */ /* One request at a time. */ @@ -85,7 +93,9 @@ struct xs_handle { struct mutex response_mutex; /* Protect transactions against save/restore. */ - struct rw_semaphore transaction_mutex; + struct mutex transaction_mutex; + atomic_t transaction_count; + wait_queue_head_t transaction_wq; /* Protect watch (de)register against save/restore. */ struct rw_semaphore watch_mutex; @@ -157,6 +167,31 @@ static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) return body; } +static void transaction_start(void) +{ + mutex_lock(&xs_state.transaction_mutex); + atomic_inc(&xs_state.transaction_count); + mutex_unlock(&xs_state.transaction_mutex); +} + +static void transaction_end(void) +{ + if (atomic_dec_and_test(&xs_state.transaction_count)) + wake_up(&xs_state.transaction_wq); +} + +static void transaction_suspend(void) +{ + mutex_lock(&xs_state.transaction_mutex); + wait_event(xs_state.transaction_wq, + atomic_read(&xs_state.transaction_count) == 0); +} + +static void transaction_resume(void) +{ + mutex_unlock(&xs_state.transaction_mutex); +} + void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) { void *ret; @@ -164,7 +199,7 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) int err; if (req_msg.type == XS_TRANSACTION_START) - down_read(&xs_state.transaction_mutex); + transaction_start(); mutex_lock(&xs_state.request_mutex); @@ -180,7 +215,7 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) if ((msg->type == XS_TRANSACTION_END) || ((req_msg.type == XS_TRANSACTION_START) && (msg->type == XS_ERROR))) - up_read(&xs_state.transaction_mutex); + transaction_end(); return ret; } @@ -432,11 +467,11 @@ int xenbus_transaction_start(struct xenbus_transaction *t) { char *id_str; - down_read(&xs_state.transaction_mutex); + transaction_start(); id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL); if (IS_ERR(id_str)) { - up_read(&xs_state.transaction_mutex); + transaction_end(); return PTR_ERR(id_str); } @@ -461,7 +496,7 @@ int xenbus_transaction_end(struct xenbus_transaction t, int abort) err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL)); - up_read(&xs_state.transaction_mutex); + transaction_end(); return err; } @@ -662,7 +697,7 @@ EXPORT_SYMBOL_GPL(unregister_xenbus_watch); void xs_suspend(void) { - down_write(&xs_state.transaction_mutex); + transaction_suspend(); down_write(&xs_state.watch_mutex); mutex_lock(&xs_state.request_mutex); mutex_lock(&xs_state.response_mutex); @@ -677,7 +712,7 @@ void xs_resume(void) mutex_unlock(&xs_state.response_mutex); mutex_unlock(&xs_state.request_mutex); - up_write(&xs_state.transaction_mutex); + transaction_resume(); /* No need for watches_lock: the watch_mutex is sufficient. */ list_for_each_entry(watch, &watches, list) { @@ -693,7 +728,7 @@ void xs_suspend_cancel(void) mutex_unlock(&xs_state.response_mutex); mutex_unlock(&xs_state.request_mutex); up_write(&xs_state.watch_mutex); - up_write(&xs_state.transaction_mutex); + mutex_unlock(&xs_state.transaction_mutex); } static int xenwatch_thread(void *unused) @@ -843,8 +878,10 @@ int xs_init(void) mutex_init(&xs_state.request_mutex); mutex_init(&xs_state.response_mutex); - init_rwsem(&xs_state.transaction_mutex); + mutex_init(&xs_state.transaction_mutex); init_rwsem(&xs_state.watch_mutex); + atomic_set(&xs_state.transaction_count, 0); + init_waitqueue_head(&xs_state.transaction_wq); /* Initialize the shared memory rings to talk to xenstored */ err = xb_init_comms(); diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 8924d93136f1..78bfab0700ba 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -65,7 +65,7 @@ static struct file_system_type xenfs_type = { static int __init xenfs_init(void) { - if (xen_pv_domain()) + if (xen_domain()) return register_filesystem(&xenfs_type); printk(KERN_INFO "XENFS: not registering filesystem on non-xen platform\n"); @@ -74,7 +74,7 @@ static int __init xenfs_init(void) static void __exit xenfs_exit(void) { - if (xen_pv_domain()) + if (xen_domain()) unregister_filesystem(&xenfs_type); } diff --git a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenfs/xenbus.c index f28ece397361..3b39c3752e21 100644 --- a/drivers/xen/xenfs/xenbus.c +++ b/drivers/xen/xenfs/xenbus.c @@ -124,6 +124,9 @@ static ssize_t xenbus_file_read(struct file *filp, mutex_lock(&u->reply_mutex); while (list_empty(&u->read_buffers)) { mutex_unlock(&u->reply_mutex); + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + ret = wait_event_interruptible(u->read_waitq, !list_empty(&u->read_buffers)); if (ret) |