Diffstat (limited to 'drivers/xen')
-rw-r--r-- | drivers/xen/Kconfig | 27
-rw-r--r-- | drivers/xen/Makefile | 9
-rw-r--r-- | drivers/xen/balloon.c | 62
-rw-r--r-- | drivers/xen/biomerge.c | 13
-rw-r--r-- | drivers/xen/cpu_hotplug.c | 3
-rw-r--r-- | drivers/xen/events.c | 784
-rw-r--r-- | drivers/xen/evtchn.c | 7
-rw-r--r-- | drivers/xen/grant-table.c | 79
-rw-r--r-- | drivers/xen/manage.c | 84
-rw-r--r-- | drivers/xen/pci.c | 117
-rw-r--r-- | drivers/xen/platform-pci.c | 207
-rw-r--r-- | drivers/xen/swiotlb-xen.c | 515
-rw-r--r-- | drivers/xen/sys-hypervisor.c | 4
-rw-r--r-- | drivers/xen/xenbus/xenbus_client.c | 93
-rw-r--r-- | drivers/xen/xenbus/xenbus_probe.c | 135
-rw-r--r-- | drivers/xen/xenbus/xenbus_xs.c | 59
-rw-r--r-- | drivers/xen/xencomm.c | 2
-rw-r--r-- | drivers/xen/xenfs/Makefile | 3
-rw-r--r-- | drivers/xen/xenfs/privcmd.c | 404
-rw-r--r-- | drivers/xen/xenfs/super.c | 108
-rw-r--r-- | drivers/xen/xenfs/xenbus.c | 6
-rw-r--r-- | drivers/xen/xenfs/xenfs.h | 3
-rw-r--r-- | drivers/xen/xenfs/xenstored.c | 68
23 files changed, 2543 insertions(+), 249 deletions(-)
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index cab100acf983..6e6180ccd726 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -1,6 +1,8 @@ +menu "Xen driver support" + depends on XEN + config XEN_BALLOON bool "Xen memory balloon driver" - depends on XEN default y help The balloon driver allows the Xen domain to request more memory from @@ -20,7 +22,6 @@ config XEN_SCRUB_PAGES config XEN_DEV_EVTCHN tristate "Xen /dev/xen/evtchn device" - depends on XEN default y help The evtchn driver allows a userspace process to triger event @@ -30,7 +31,6 @@ config XEN_DEV_EVTCHN config XENFS tristate "Xen filesystem" - depends on XEN default y help The xen filesystem provides a way for domains to share @@ -53,11 +53,28 @@ config XEN_COMPAT_XENFS config XEN_SYS_HYPERVISOR bool "Create xen entries under /sys/hypervisor" - depends on XEN && SYSFS + depends on SYSFS select SYS_HYPERVISOR default y help Create entries under /sys/hypervisor describing the Xen hypervisor environment. When running native or in another virtual environment, /sys/hypervisor will still be present, - but will have no xen contents.
\ No newline at end of file + but will have no xen contents. + +config XEN_PLATFORM_PCI + tristate "xen platform pci device driver" + depends on XEN_PVHVM + default m + help + Driver for the Xen PCI Platform device: it is responsible for + initializing xenbus and grant_table when running in a Xen HVM + domain. As a consequence this driver is required to run any Xen PV + frontend on Xen HVM. + +config SWIOTLB_XEN + def_bool y + depends on PCI + select SWIOTLB + +endmenu diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index ec2a39b1e26f..eb8a78d77d9d 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,9 +1,16 @@ obj-y += grant-table.o features.o events.o manage.o obj-y += xenbus/ +nostackp := $(call cc-option, -fno-stack-protector) +CFLAGS_features.o := $(nostackp) + +obj-$(CONFIG_BLOCK) += biomerge.o obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += balloon.o obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o obj-$(CONFIG_XENFS) += xenfs/ -obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o
\ No newline at end of file +obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o +obj-$(CONFIG_XEN_PLATFORM_PCI) += platform-pci.o +obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o +obj-$(CONFIG_XEN_DOM0) += pci.o diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index f5bbd9e83416..500290b150bb 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -43,6 +43,7 @@ #include <linux/mutex.h> #include <linux/list.h> #include <linux/sysdev.h> +#include <linux/gfp.h> #include <asm/page.h> #include <asm/pgalloc.h> @@ -52,6 +53,8 @@ #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> + +#include <xen/xen.h> #include <xen/interface/xen.h> #include <xen/interface/memory.h> #include <xen/xenbus.h> @@ -66,8 +69,6 @@ struct balloon_stats { /* We aim for 'current allocation' == 'target allocation'. */ unsigned long current_pages; unsigned long target_pages; - /* We may hit the hard limit in Xen. If we do then we remember it. */ - unsigned long hard_limit; /* * Drivers may alter the memory reservation independently, but they * must inform the balloon driver so we avoid hitting the hard limit. @@ -84,23 +85,12 @@ static struct sys_device balloon_sysdev; static int register_balloon(struct sys_device *sysdev); -/* - * Protects atomic reservation decrease/increase against concurrent increases. - * Also protects non-atomic updates of current_pages and driver_pages, and - * balloon lists. - */ -static DEFINE_SPINLOCK(balloon_lock); - static struct balloon_stats balloon_stats; /* We increase/decrease in batches which fit in a page */ static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; -/* VM /proc information for memory */ -extern unsigned long totalram_pages; - #ifdef CONFIG_HIGHMEM -extern unsigned long totalhigh_pages; #define inc_totalhigh_pages() (totalhigh_pages++) #define dec_totalhigh_pages() (totalhigh_pages--) #else @@ -140,6 +130,8 @@ static void balloon_append(struct page *page) list_add(&page->lru, &ballooned_pages); balloon_stats.balloon_low++; } + + totalram_pages--; } /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ @@ -160,6 +152,8 @@ static struct page *balloon_retrieve(void) else balloon_stats.balloon_low--; + totalram_pages++; + return page; } @@ -185,7 +179,7 @@ static void balloon_alarm(unsigned long unused) static unsigned long current_target(void) { - unsigned long target = min(balloon_stats.target_pages, balloon_stats.hard_limit); + unsigned long target = balloon_stats.target_pages; target = min(target, balloon_stats.current_pages + @@ -209,35 +203,22 @@ static int increase_reservation(unsigned long nr_pages) if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); - spin_lock_irqsave(&balloon_lock, flags); + spin_lock_irqsave(&xen_reservation_lock, flags); page = balloon_first_page(); for (i = 0; i < nr_pages; i++) { BUG_ON(page == NULL); - frame_list[i] = page_to_pfn(page);; + frame_list[i] = page_to_pfn(page); page = balloon_next_page(page); } set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); - if (rc < nr_pages) { - if (rc > 0) { - int ret; - - /* We hit the Xen hard limit: reprobe. 
*/ - reservation.nr_extents = rc; - ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, - &reservation); - BUG_ON(ret != rc); - } - if (rc >= 0) - balloon_stats.hard_limit = (balloon_stats.current_pages + rc - - balloon_stats.driver_pages); + if (rc < 0) goto out; - } - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < rc; i++) { page = balloon_retrieve(); BUG_ON(page == NULL); @@ -263,13 +244,12 @@ static int increase_reservation(unsigned long nr_pages) __free_page(page); } - balloon_stats.current_pages += nr_pages; - totalram_pages = balloon_stats.current_pages; + balloon_stats.current_pages += rc; out: - spin_unlock_irqrestore(&balloon_lock, flags); + spin_unlock_irqrestore(&xen_reservation_lock, flags); - return 0; + return rc < 0 ? rc : rc != nr_pages; } static int decrease_reservation(unsigned long nr_pages) @@ -312,7 +292,7 @@ static int decrease_reservation(unsigned long nr_pages) kmap_flush_unused(); flush_tlb_all(); - spin_lock_irqsave(&balloon_lock, flags); + spin_lock_irqsave(&xen_reservation_lock, flags); /* No more mappings: invalidate P2M and add to balloon. */ for (i = 0; i < nr_pages; i++) { @@ -327,9 +307,8 @@ static int decrease_reservation(unsigned long nr_pages) BUG_ON(ret != nr_pages); balloon_stats.current_pages -= nr_pages; - totalram_pages = balloon_stats.current_pages; - spin_unlock_irqrestore(&balloon_lock, flags); + spin_unlock_irqrestore(&xen_reservation_lock, flags); return need_sleep; } @@ -371,7 +350,6 @@ static void balloon_process(struct work_struct *work) static void balloon_set_new_target(unsigned long target) { /* No need for lock. Not read-modify-write updates. */ - balloon_stats.hard_limit = ~0UL; balloon_stats.target_pages = target; schedule_work(&balloon_worker); } @@ -426,12 +404,10 @@ static int __init balloon_init(void) pr_info("xen_balloon: Initialising balloon driver.\n"); balloon_stats.current_pages = min(xen_start_info->nr_pages, max_pfn); - totalram_pages = balloon_stats.current_pages; balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; balloon_stats.driver_pages = 0UL; - balloon_stats.hard_limit = ~0UL; init_timer(&balloon_timer); balloon_timer.data = 0; @@ -476,9 +452,6 @@ module_exit(balloon_exit); BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages)); BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low)); BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high)); -BALLOON_SHOW(hard_limit_kb, - (balloon_stats.hard_limit!=~0UL) ? "%lu\n" : "???\n", - (balloon_stats.hard_limit!=~0UL) ? 
PAGES2KB(balloon_stats.hard_limit) : 0); BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(balloon_stats.driver_pages)); static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr, @@ -548,7 +521,6 @@ static struct attribute *balloon_info_attrs[] = { &attr_current_kb.attr, &attr_low_kb.attr, &attr_high_kb.attr, - &attr_hard_limit_kb.attr, &attr_driver_kb.attr, NULL }; diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c new file mode 100644 index 000000000000..ba6eda4b5143 --- /dev/null +++ b/drivers/xen/biomerge.c @@ -0,0 +1,13 @@ +#include <linux/bio.h> +#include <linux/io.h> +#include <xen/page.h> + +bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, + const struct bio_vec *vec2) +{ + unsigned long mfn1 = pfn_to_mfn(page_to_pfn(vec1->bv_page)); + unsigned long mfn2 = pfn_to_mfn(page_to_pfn(vec2->bv_page)); + + return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) && + ((mfn1 == mfn2) || ((mfn1+1) == mfn2)); +} diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index bdfd584ad853..14e2d995e958 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -1,5 +1,6 @@ #include <linux/notifier.h> +#include <xen/xen.h> #include <xen/xenbus.h> #include <asm/xen/hypervisor.h> @@ -86,7 +87,7 @@ static int setup_cpu_watcher(struct notifier_block *notifier, for_each_possible_cpu(cpu) { if (vcpu_online(cpu) == 0) { (void)cpu_down(cpu); - cpu_clear(cpu, cpu_present_map); + set_cpu_present(cpu, false); } } diff --git a/drivers/xen/events.c b/drivers/xen/events.c index abad71b1632b..321a0c8346e5 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -16,7 +16,7 @@ * (typically dom0). * 2. VIRQs, typically used for timers. These are per-cpu events. * 3. IPIs. - * 4. Hardware interrupts. Not supported at present. + * 4. PIRQs - Hardware interrupts. * * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 */ @@ -27,18 +27,28 @@ #include <linux/module.h> #include <linux/string.h> #include <linux/bootmem.h> +#include <linux/slab.h> +#include <linux/irqnr.h> +#include <linux/pci.h> +#include <asm/desc.h> #include <asm/ptrace.h> #include <asm/irq.h> #include <asm/idle.h> +#include <asm/io_apic.h> #include <asm/sync_bitops.h> +#include <asm/xen/pci.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> +#include <xen/xen.h> +#include <xen/hvm.h> #include <xen/xen-ops.h> #include <xen/events.h> #include <xen/interface/xen.h> #include <xen/interface/event_channel.h> +#include <xen/interface/hvm/hvm_op.h> +#include <xen/interface/hvm/params.h> /* * This lock protects updates to the following mapping and reference-count @@ -47,10 +57,10 @@ static DEFINE_SPINLOCK(irq_mapping_update_lock); /* IRQ <-> VIRQ mapping. */ -static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; +static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping */ -static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; +static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; /* Interrupt types. */ enum xen_irq_type { @@ -67,7 +77,8 @@ enum xen_irq_type { * event channel - irq->event channel mapping * cpu - cpu this event channel is bound to * index - type-specific information: - * PIRQ - vector, with MSB being "needs EIO" + * PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM + * guest, or GSI (real passthrough IRQ) of the device. 
* VIRQ - virq number * IPI - IPI vector * EVTCHN - @@ -82,21 +93,30 @@ struct irq_info unsigned short virq; enum ipi_vector ipi; struct { + unsigned short pirq; unsigned short gsi; - unsigned short vector; + unsigned char vector; + unsigned char flags; } pirq; } u; }; +#define PIRQ_NEEDS_EOI (1 << 0) +#define PIRQ_SHAREABLE (1 << 1) -static struct irq_info irq_info[NR_IRQS]; +static struct irq_info *irq_info; +static int *pirq_to_irq; +static int nr_pirqs; -static int evtchn_to_irq[NR_EVENT_CHANNELS] = { - [0 ... NR_EVENT_CHANNELS-1] = -1 -}; +static int *evtchn_to_irq; struct cpu_evtchn_s { unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG]; }; -static struct cpu_evtchn_s *cpu_evtchn_mask_p; + +static __initdata struct cpu_evtchn_s init_evtchn_mask = { + .bits[0 ... (NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul, +}; +static struct cpu_evtchn_s *cpu_evtchn_mask_p = &init_evtchn_mask; + static inline unsigned long *cpu_evtchn_mask(int cpu) { return cpu_evtchn_mask_p[cpu].bits; @@ -106,6 +126,8 @@ static inline unsigned long *cpu_evtchn_mask(int cpu) #define VALID_EVTCHN(chn) ((chn) != 0) static struct irq_chip xen_dynamic_chip; +static struct irq_chip xen_percpu_chip; +static struct irq_chip xen_pirq_chip; /* Constructor for packed IRQ information. */ static struct irq_info mk_unbound_info(void) @@ -131,11 +153,12 @@ static struct irq_info mk_virq_info(unsigned short evtchn, unsigned short virq) .cpu = 0, .u.virq = virq }; } -static struct irq_info mk_pirq_info(unsigned short evtchn, +static struct irq_info mk_pirq_info(unsigned short evtchn, unsigned short pirq, unsigned short gsi, unsigned short vector) { return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn, - .cpu = 0, .u.pirq = { .gsi = gsi, .vector = vector } }; + .cpu = 0, + .u.pirq = { .pirq = pirq, .gsi = gsi, .vector = vector } }; } /* @@ -177,6 +200,16 @@ static unsigned virq_from_irq(unsigned irq) return info->u.virq; } +static unsigned pirq_from_irq(unsigned irq) +{ + struct irq_info *info = info_for_irq(irq); + + BUG_ON(info == NULL); + BUG_ON(info->type != IRQT_PIRQ); + + return info->u.pirq.pirq; +} + static unsigned gsi_from_irq(unsigned irq) { struct irq_info *info = info_for_irq(irq); @@ -218,6 +251,15 @@ static unsigned int cpu_from_evtchn(unsigned int evtchn) return ret; } +static bool pirq_needs_eoi(unsigned irq) +{ + struct irq_info *info = info_for_irq(irq); + + BUG_ON(info->type != IRQT_PIRQ); + + return info->u.pirq.flags & PIRQ_NEEDS_EOI; +} + static inline unsigned long active_evtchns(unsigned int cpu, struct shared_info *sh, unsigned int idx) @@ -254,7 +296,7 @@ static void init_evtchn_cpu_bindings(void) } #endif - memset(cpu_evtchn_mask(0), ~0, sizeof(cpu_evtchn_mask(0))); + memset(cpu_evtchn_mask(0), ~0, sizeof(struct cpu_evtchn_s)); } static inline void clear_evtchn(int port) @@ -329,26 +371,413 @@ static void unmask_evtchn(int port) put_cpu(); } -static int find_unbound_irq(void) +static int get_nr_hw_irqs(void) { - int irq; - struct irq_desc *desc; + int ret = 1; - for (irq = 0; irq < nr_irqs; irq++) - if (irq_info[irq].type == IRQT_UNBOUND) +#ifdef CONFIG_X86_IO_APIC + ret = get_nr_irqs_gsi(); +#endif + + return ret; +} + +/* callers of this function should make sure that PHYSDEVOP_get_nr_pirqs + * succeeded otherwise nr_pirqs won't hold the right value */ +static int find_unbound_pirq(void) +{ + int i; + for (i = nr_pirqs-1; i >= 0; i--) { + if (pirq_to_irq[i] < 0) + return i; + } + return -1; +} + +static int find_unbound_irq(void) +{ + struct irq_data *data; + int irq, res; + int start = 
get_nr_hw_irqs(); + + if (start == nr_irqs) + goto no_irqs; + + /* nr_irqs is a magic value. Must not use it.*/ + for (irq = nr_irqs-1; irq > start; irq--) { + data = irq_get_irq_data(irq); + /* only 0->15 have init'd desc; handle irq > 16 */ + if (!data) break; + if (data->chip == &no_irq_chip) + break; + if (data->chip != &xen_dynamic_chip) + continue; + if (irq_info[irq].type == IRQT_UNBOUND) + return irq; + } + + if (irq == start) + goto no_irqs; - if (irq == nr_irqs) - panic("No available IRQ to bind to: increase nr_irqs!\n"); + res = irq_alloc_desc_at(irq, 0); - desc = irq_to_desc_alloc_node(irq, 0); - if (WARN_ON(desc == NULL)) + if (WARN_ON(res != irq)) return -1; - dynamic_irq_init(irq); + return irq; + +no_irqs: + panic("No available IRQ to bind to: increase nr_irqs!\n"); +} + +static bool identity_mapped_irq(unsigned irq) +{ + /* identity map all the hardware irqs */ + return irq < get_nr_hw_irqs(); +} + +static void pirq_unmask_notify(int irq) +{ + struct physdev_eoi eoi = { .irq = pirq_from_irq(irq) }; + + if (unlikely(pirq_needs_eoi(irq))) { + int rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); + WARN_ON(rc); + } +} + +static void pirq_query_unmask(int irq) +{ + struct physdev_irq_status_query irq_status; + struct irq_info *info = info_for_irq(irq); + + BUG_ON(info->type != IRQT_PIRQ); + + irq_status.irq = pirq_from_irq(irq); + if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) + irq_status.flags = 0; + + info->u.pirq.flags &= ~PIRQ_NEEDS_EOI; + if (irq_status.flags & XENIRQSTAT_needs_eoi) + info->u.pirq.flags |= PIRQ_NEEDS_EOI; +} + +static bool probing_irq(int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + return desc && desc->action == NULL; +} + +static unsigned int startup_pirq(unsigned int irq) +{ + struct evtchn_bind_pirq bind_pirq; + struct irq_info *info = info_for_irq(irq); + int evtchn = evtchn_from_irq(irq); + int rc; + + BUG_ON(info->type != IRQT_PIRQ); + + if (VALID_EVTCHN(evtchn)) + goto out; + + bind_pirq.pirq = pirq_from_irq(irq); + /* NB. We are happy to share unless we are probing. */ + bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ? 
+ BIND_PIRQ__WILL_SHARE : 0; + rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); + if (rc != 0) { + if (!probing_irq(irq)) + printk(KERN_INFO "Failed to obtain physical IRQ %d\n", + irq); + return 0; + } + evtchn = bind_pirq.port; + + pirq_query_unmask(irq); + + evtchn_to_irq[evtchn] = irq; + bind_evtchn_to_cpu(evtchn, 0); + info->evtchn = evtchn; + +out: + unmask_evtchn(evtchn); + pirq_unmask_notify(irq); + + return 0; +} + +static void shutdown_pirq(unsigned int irq) +{ + struct evtchn_close close; + struct irq_info *info = info_for_irq(irq); + int evtchn = evtchn_from_irq(irq); + + BUG_ON(info->type != IRQT_PIRQ); + + if (!VALID_EVTCHN(evtchn)) + return; + + mask_evtchn(evtchn); + + close.port = evtchn; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); + + bind_evtchn_to_cpu(evtchn, 0); + evtchn_to_irq[evtchn] = -1; + info->evtchn = 0; +} + +static void enable_pirq(unsigned int irq) +{ + startup_pirq(irq); +} + +static void disable_pirq(unsigned int irq) +{ +} + +static void ack_pirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + move_native_irq(irq); + + if (VALID_EVTCHN(evtchn)) { + mask_evtchn(evtchn); + clear_evtchn(evtchn); + } +} + +static void end_pirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + struct irq_desc *desc = irq_to_desc(irq); + + if (WARN_ON(!desc)) + return; + + if ((desc->status & (IRQ_DISABLED|IRQ_PENDING)) == + (IRQ_DISABLED|IRQ_PENDING)) { + shutdown_pirq(irq); + } else if (VALID_EVTCHN(evtchn)) { + unmask_evtchn(evtchn); + pirq_unmask_notify(irq); + } +} + +static int find_irq_by_gsi(unsigned gsi) +{ + int irq; + + for (irq = 0; irq < nr_irqs; irq++) { + struct irq_info *info = info_for_irq(irq); + + if (info == NULL || info->type != IRQT_PIRQ) + continue; + + if (gsi_from_irq(irq) == gsi) + return irq; + } + + return -1; +} + +int xen_allocate_pirq(unsigned gsi, int shareable, char *name) +{ + return xen_map_pirq_gsi(gsi, gsi, shareable, name); +} + +/* xen_map_pirq_gsi might allocate irqs from the top down, as a + * consequence don't assume that the irq number returned has a low value + * or can be used as a pirq number unless you know otherwise. + * + * One notable exception is when xen_map_pirq_gsi is called passing an + * hardware gsi as argument, in that case the irq number returned + * matches the gsi number passed as second argument. + * + * Note: We don't assign an event channel until the irq actually started + * up. Return an existing irq if we've already got one for the gsi. + */ +int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name) +{ + int irq = 0; + struct physdev_irq irq_op; + + spin_lock(&irq_mapping_update_lock); + + if ((pirq > nr_pirqs) || (gsi > nr_irqs)) { + printk(KERN_WARNING "xen_map_pirq_gsi: %s %s is incorrect!\n", + pirq > nr_pirqs ? "nr_pirqs" :"", + gsi > nr_irqs ? "nr_irqs" : ""); + goto out; + } + + irq = find_irq_by_gsi(gsi); + if (irq != -1) { + printk(KERN_INFO "xen_map_pirq_gsi: returning irq %d for gsi %u\n", + irq, gsi); + goto out; /* XXX need refcount? */ + } + /* If we are a PV guest, we don't have GSIs (no ACPI passed). Therefore + * we are using the !xen_initial_domain() to drop in the function.*/ + if (identity_mapped_irq(gsi) || (!xen_initial_domain() && + xen_pv_domain())) { + irq = gsi; + irq_alloc_desc_at(irq, 0); + } else + irq = find_unbound_irq(); + + set_irq_chip_and_handler_name(irq, &xen_pirq_chip, + handle_level_irq, name); + + irq_op.irq = irq; + irq_op.vector = 0; + + /* Only the privileged domain can do this. 
For non-priv, the pcifront + * driver provides a PCI bus that does the call to do exactly + * this in the priv domain. */ + if (xen_initial_domain() && + HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { + irq_free_desc(irq); + irq = -ENOSPC; + goto out; + } + + irq_info[irq] = mk_pirq_info(0, pirq, gsi, irq_op.vector); + irq_info[irq].u.pirq.flags |= shareable ? PIRQ_SHAREABLE : 0; + pirq_to_irq[pirq] = irq; + +out: + spin_unlock(&irq_mapping_update_lock); + + return irq; +} + +#ifdef CONFIG_PCI_MSI +#include <linux/msi.h> +#include "../pci/msi.h" + +void xen_allocate_pirq_msi(char *name, int *irq, int *pirq) +{ + spin_lock(&irq_mapping_update_lock); + + *irq = find_unbound_irq(); + if (*irq == -1) + goto out; + + *pirq = find_unbound_pirq(); + if (*pirq == -1) + goto out; + + set_irq_chip_and_handler_name(*irq, &xen_pirq_chip, + handle_level_irq, name); + + irq_info[*irq] = mk_pirq_info(0, *pirq, 0, 0); + pirq_to_irq[*pirq] = *irq; + +out: + spin_unlock(&irq_mapping_update_lock); +} + +int xen_create_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int type) +{ + int irq = -1; + struct physdev_map_pirq map_irq; + int rc; + int pos; + u32 table_offset, bir; + + memset(&map_irq, 0, sizeof(map_irq)); + map_irq.domid = DOMID_SELF; + map_irq.type = MAP_PIRQ_TYPE_MSI; + map_irq.index = -1; + map_irq.pirq = -1; + map_irq.bus = dev->bus->number; + map_irq.devfn = dev->devfn; + + if (type == PCI_CAP_ID_MSIX) { + pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); + + pci_read_config_dword(dev, msix_table_offset_reg(pos), + &table_offset); + bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK); + + map_irq.table_base = pci_resource_start(dev, bir); + map_irq.entry_nr = msidesc->msi_attrib.entry_nr; + } + + spin_lock(&irq_mapping_update_lock); + + irq = find_unbound_irq(); + + if (irq == -1) + goto out; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); + if (rc) { + printk(KERN_WARNING "xen map irq failed %d\n", rc); + + irq_free_desc(irq); + + irq = -1; + goto out; + } + irq_info[irq] = mk_pirq_info(0, map_irq.pirq, 0, map_irq.index); + + set_irq_chip_and_handler_name(irq, &xen_pirq_chip, + handle_level_irq, + (type == PCI_CAP_ID_MSIX) ? 
"msi-x":"msi"); + +out: + spin_unlock(&irq_mapping_update_lock); return irq; } +#endif + +int xen_destroy_irq(int irq) +{ + struct irq_desc *desc; + struct physdev_unmap_pirq unmap_irq; + struct irq_info *info = info_for_irq(irq); + int rc = -ENOENT; + + spin_lock(&irq_mapping_update_lock); + + desc = irq_to_desc(irq); + if (!desc) + goto out; + + if (xen_initial_domain()) { + unmap_irq.pirq = info->u.pirq.gsi; + unmap_irq.domid = DOMID_SELF; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); + if (rc) { + printk(KERN_WARNING "unmap irq failed %d\n", rc); + goto out; + } + } + irq_info[irq] = mk_unbound_info(); + + irq_free_desc(irq); + +out: + spin_unlock(&irq_mapping_update_lock); + return rc; +} + +int xen_vector_from_irq(unsigned irq) +{ + return vector_from_irq(irq); +} + +int xen_gsi_from_irq(unsigned irq) +{ + return gsi_from_irq(irq); +} int bind_evtchn_to_irq(unsigned int evtchn) { @@ -362,7 +791,7 @@ int bind_evtchn_to_irq(unsigned int evtchn) irq = find_unbound_irq(); set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "event"); + handle_fasteoi_irq, "event"); evtchn_to_irq[evtchn] = irq; irq_info[irq] = mk_evtchn_info(evtchn); @@ -388,8 +817,8 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) if (irq < 0) goto out; - set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "ipi"); + set_irq_chip_and_handler_name(irq, &xen_percpu_chip, + handle_percpu_irq, "ipi"); bind_ipi.vcpu = cpu; if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, @@ -410,7 +839,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) } -static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) +int bind_virq_to_irq(unsigned int virq, unsigned int cpu) { struct evtchn_bind_virq bind_virq; int evtchn, irq; @@ -420,6 +849,11 @@ static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) irq = per_cpu(virq_to_irq, cpu)[virq]; if (irq == -1) { + irq = find_unbound_irq(); + + set_irq_chip_and_handler_name(irq, &xen_percpu_chip, + handle_percpu_irq, "virq"); + bind_virq.virq = virq; bind_virq.vcpu = cpu; if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, @@ -427,11 +861,6 @@ static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) BUG(); evtchn = bind_virq.port; - irq = find_unbound_irq(); - - set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "virq"); - evtchn_to_irq[evtchn] = irq; irq_info[irq] = mk_virq_info(evtchn, virq); @@ -474,9 +903,12 @@ static void unbind_from_irq(unsigned int irq) bind_evtchn_to_cpu(evtchn, 0); evtchn_to_irq[evtchn] = -1; + } + + if (irq_info[irq].type != IRQT_UNBOUND) { irq_info[irq] = mk_unbound_info(); - dynamic_irq_cleanup(irq); + irq_free_desc(irq); } spin_unlock(&irq_mapping_update_lock); @@ -532,6 +964,7 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, if (irq < 0) return irq; + irqflags |= IRQF_NO_SUSPEND; retval = request_irq(irq, handler, irqflags, devname, dev_id); if (retval != 0) { unbind_from_irq(irq); @@ -559,41 +992,75 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) { struct shared_info *sh = HYPERVISOR_shared_info; int cpu = smp_processor_id(); + unsigned long *cpu_evtchn = cpu_evtchn_mask(cpu); int i; unsigned long flags; static DEFINE_SPINLOCK(debug_lock); + struct vcpu_info *v; spin_lock_irqsave(&debug_lock, flags); - printk("vcpu %d\n ", cpu); + printk("\nvcpu %d\n ", cpu); for_each_online_cpu(i) { - struct vcpu_info *v = per_cpu(xen_vcpu, i); - printk("%d: masked=%d pending=%d event_sel %08lx\n ", i, - (get_irq_regs() && i == cpu) ? 
xen_irqs_disabled(get_irq_regs()) : v->evtchn_upcall_mask, - v->evtchn_upcall_pending, - v->evtchn_pending_sel); - } - printk("pending:\n "); - for(i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) - printk("%08lx%s", sh->evtchn_pending[i], - i % 8 == 0 ? "\n " : " "); - printk("\nmasks:\n "); - for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%08lx%s", sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); - - printk("\nunmasked:\n "); - for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) - printk("%08lx%s", sh->evtchn_pending[i] & ~sh->evtchn_mask[i], - i % 8 == 0 ? "\n " : " "); + int pending; + v = per_cpu(xen_vcpu, i); + pending = (get_irq_regs() && i == cpu) + ? xen_irqs_disabled(get_irq_regs()) + : v->evtchn_upcall_mask; + printk("%d: masked=%d pending=%d event_sel %0*lx\n ", i, + pending, v->evtchn_upcall_pending, + (int)(sizeof(v->evtchn_pending_sel)*2), + v->evtchn_pending_sel); + } + v = per_cpu(xen_vcpu, cpu); + + printk("\npending:\n "); + for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) + printk("%0*lx%s", (int)sizeof(sh->evtchn_pending[0])*2, + sh->evtchn_pending[i], + i % 8 == 0 ? "\n " : " "); + printk("\nglobal mask:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*lx%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nglobally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_pending[i] & ~sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocal cpu%d mask:\n ", cpu); + for (i = (NR_EVENT_CHANNELS/BITS_PER_LONG)-1; i >= 0; i--) + printk("%0*lx%s", (int)(sizeof(cpu_evtchn[0])*2), + cpu_evtchn[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { + unsigned long pending = sh->evtchn_pending[i] + & ~sh->evtchn_mask[i] + & cpu_evtchn[i]; + printk("%0*lx%s", (int)(sizeof(sh->evtchn_mask[0])*2), + pending, i % 8 == 0 ? "\n " : " "); + } printk("\npending list:\n"); - for(i = 0; i < NR_EVENT_CHANNELS; i++) { + for (i = 0; i < NR_EVENT_CHANNELS; i++) { if (sync_test_bit(i, sh->evtchn_pending)) { - printk(" %d: event %d -> irq %d\n", + int word_idx = i / BITS_PER_LONG; + printk(" %d: event %d -> irq %d%s%s%s\n", cpu_from_evtchn(i), i, - evtchn_to_irq[i]); + evtchn_to_irq[i], + sync_test_bit(word_idx, &v->evtchn_pending_sel) + ? "" : " l2-clear", + !sync_test_bit(i, sh->evtchn_mask) + ? "" : " globally-masked", + sync_test_bit(i, cpu_evtchn) + ? "" : " locally-masked"); } } @@ -602,6 +1069,8 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +static DEFINE_PER_CPU(unsigned, xed_nesting_count); + /* * Search the CPUs pending events bitmasks. For each one found, map * the event number to an irq, and feed it into do_IRQ() for @@ -611,24 +1080,19 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) * a bitset of words which contain pending event bits. The second * level is a bitset of pending events themselves. 
*/ -void xen_evtchn_do_upcall(struct pt_regs *regs) +static void __xen_evtchn_do_upcall(void) { int cpu = get_cpu(); - struct pt_regs *old_regs = set_irq_regs(regs); struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - static DEFINE_PER_CPU(unsigned, nesting_count); unsigned count; - exit_idle(); - irq_enter(); - do { unsigned long pending_words; vcpu_info->evtchn_upcall_pending = 0; - if (__get_cpu_var(nesting_count)++) + if (__get_cpu_var(xed_nesting_count)++) goto out; #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ @@ -645,24 +1109,48 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) int bit_idx = __ffs(pending_bits); int port = (word_idx * BITS_PER_LONG) + bit_idx; int irq = evtchn_to_irq[port]; + struct irq_desc *desc; + + mask_evtchn(port); + clear_evtchn(port); - if (irq != -1) - handle_irq(irq, regs); + if (irq != -1) { + desc = irq_to_desc(irq); + if (desc) + generic_handle_irq_desc(irq, desc); + } } } BUG_ON(!irqs_disabled()); - count = __get_cpu_var(nesting_count); - __get_cpu_var(nesting_count) = 0; - } while(count != 1); + count = __get_cpu_var(xed_nesting_count); + __get_cpu_var(xed_nesting_count) = 0; + } while (count != 1 || vcpu_info->evtchn_upcall_pending); out: + + put_cpu(); +} + +void xen_evtchn_do_upcall(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + exit_idle(); + irq_enter(); + + __xen_evtchn_do_upcall(); + irq_exit(); set_irq_regs(old_regs); +} - put_cpu(); +void xen_hvm_evtchn_do_upcall(void) +{ + __xen_evtchn_do_upcall(); } +EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall); /* Rebind a new event channel to an existing irq. */ void rebind_evtchn_irq(int evtchn, int irq) @@ -699,7 +1187,10 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) struct evtchn_bind_vcpu bind_vcpu; int evtchn = evtchn_from_irq(irq); - if (!VALID_EVTCHN(evtchn)) + /* events delivered via platform PCI interrupts are always + * routed to vcpu 0 */ + if (!VALID_EVTCHN(evtchn) || + (xen_hvm_domain() && !xen_have_vector_callback)) return -1; /* Send future instances of this interrupt to other vcpu. */ @@ -760,10 +1251,10 @@ static void ack_dynirq(unsigned int irq) { int evtchn = evtchn_from_irq(irq); - move_native_irq(irq); + move_masked_irq(irq); if (VALID_EVTCHN(evtchn)) - clear_evtchn(evtchn); + unmask_evtchn(evtchn); } static int retrigger_dynirq(unsigned int irq) @@ -808,9 +1299,6 @@ static void restore_cpu_virqs(unsigned int cpu) evtchn_to_irq[evtchn] = irq; irq_info[irq] = mk_virq_info(evtchn, virq); bind_evtchn_to_cpu(evtchn, cpu); - - /* Ready for use. */ - unmask_evtchn(evtchn); } } @@ -836,10 +1324,6 @@ static void restore_cpu_ipis(unsigned int cpu) evtchn_to_irq[evtchn] = irq; irq_info[irq] = mk_ipi_info(evtchn, ipi); bind_evtchn_to_cpu(evtchn, cpu); - - /* Ready for use. */ - unmask_evtchn(evtchn); - } } @@ -851,7 +1335,7 @@ void xen_clear_irq_pending(int irq) if (VALID_EVTCHN(evtchn)) clear_evtchn(evtchn); } - +EXPORT_SYMBOL(xen_clear_irq_pending); void xen_set_irq_pending(int irq) { int evtchn = evtchn_from_irq(irq); @@ -871,9 +1355,9 @@ bool xen_test_irq_pending(int irq) return ret; } -/* Poll waiting for an irq to become pending. In the usual case, the - irq will be disabled so it won't deliver an interrupt. */ -void xen_poll_irq(int irq) +/* Poll waiting for an irq to become pending with timeout. In the usual case, + * the irq will be disabled so it won't deliver an interrupt. 
*/ +void xen_poll_irq_timeout(int irq, u64 timeout) { evtchn_port_t evtchn = evtchn_from_irq(irq); @@ -881,17 +1365,25 @@ void xen_poll_irq(int irq) struct sched_poll poll; poll.nr_ports = 1; - poll.timeout = 0; + poll.timeout = timeout; set_xen_guest_handle(poll.ports, &evtchn); if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0) BUG(); } } +EXPORT_SYMBOL(xen_poll_irq_timeout); +/* Poll waiting for an irq to become pending. In the usual case, the + * irq will be disabled so it won't deliver an interrupt. */ +void xen_poll_irq(int irq) +{ + xen_poll_irq_timeout(irq, 0 /* no timeout */); +} void xen_irq_resume(void) { unsigned int cpu, irq, evtchn; + struct irq_desc *desc; init_evtchn_cpu_bindings(); @@ -910,6 +1402,23 @@ void xen_irq_resume(void) restore_cpu_virqs(cpu); restore_cpu_ipis(cpu); } + + /* + * Unmask any IRQF_NO_SUSPEND IRQs which are enabled. These + * are not handled by the IRQ core. + */ + for_each_irq_desc(irq, desc) { + if (!desc->action || !(desc->action->flags & IRQF_NO_SUSPEND)) + continue; + if (desc->status & IRQ_DISABLED) + continue; + + evtchn = evtchn_from_irq(irq); + if (evtchn == -1) + continue; + + unmask_evtchn(evtchn); + } } static struct irq_chip xen_dynamic_chip __read_mostly = { @@ -919,18 +1428,107 @@ static struct irq_chip xen_dynamic_chip __read_mostly = { .mask = disable_dynirq, .unmask = enable_dynirq, - .ack = ack_dynirq, + .eoi = ack_dynirq, .set_affinity = set_affinity_irq, .retrigger = retrigger_dynirq, }; +static struct irq_chip xen_pirq_chip __read_mostly = { + .name = "xen-pirq", + + .startup = startup_pirq, + .shutdown = shutdown_pirq, + + .enable = enable_pirq, + .unmask = enable_pirq, + + .disable = disable_pirq, + .mask = disable_pirq, + + .ack = ack_pirq, + .end = end_pirq, + + .set_affinity = set_affinity_irq, + + .retrigger = retrigger_dynirq, +}; + +static struct irq_chip xen_percpu_chip __read_mostly = { + .name = "xen-percpu", + + .disable = disable_dynirq, + .mask = disable_dynirq, + .unmask = enable_dynirq, + + .ack = ack_dynirq, +}; + +int xen_set_callback_via(uint64_t via) +{ + struct xen_hvm_param a; + a.domid = DOMID_SELF; + a.index = HVM_PARAM_CALLBACK_IRQ; + a.value = via; + return HYPERVISOR_hvm_op(HVMOP_set_param, &a); +} +EXPORT_SYMBOL_GPL(xen_set_callback_via); + +#ifdef CONFIG_XEN_PVHVM +/* Vector callbacks are better than PCI interrupts to receive event + * channel notifications because we can receive vector callbacks on any + * vcpu and we don't need PCI support or APIC interactions. 
*/ +void xen_callback_vector(void) +{ + int rc; + uint64_t callback_via; + if (xen_have_vector_callback) { + callback_via = HVM_CALLBACK_VECTOR(XEN_HVM_EVTCHN_CALLBACK); + rc = xen_set_callback_via(callback_via); + if (rc) { + printk(KERN_ERR "Request for Xen HVM callback vector" + " failed.\n"); + xen_have_vector_callback = 0; + return; + } + printk(KERN_INFO "Xen HVM callback vector for event delivery is " + "enabled\n"); + /* in the restore case the vector has already been allocated */ + if (!test_bit(XEN_HVM_EVTCHN_CALLBACK, used_vectors)) + alloc_intr_gate(XEN_HVM_EVTCHN_CALLBACK, xen_hvm_callback_vector); + } +} +#else +void xen_callback_vector(void) {} +#endif + void __init xen_init_IRQ(void) { - int i; + int i, rc; + struct physdev_nr_pirqs op_nr_pirqs; cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s), GFP_KERNEL); - BUG_ON(cpu_evtchn_mask_p == NULL); + irq_info = kcalloc(nr_irqs, sizeof(*irq_info), GFP_KERNEL); + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_nr_pirqs, &op_nr_pirqs); + if (rc < 0) { + nr_pirqs = nr_irqs; + if (rc != -ENOSYS) + printk(KERN_WARNING "PHYSDEVOP_get_nr_pirqs returned rc=%d\n", rc); + } else { + if (xen_pv_domain() && !xen_initial_domain()) + nr_pirqs = max((int)op_nr_pirqs.nr_pirqs, nr_irqs); + else + nr_pirqs = op_nr_pirqs.nr_pirqs; + } + pirq_to_irq = kcalloc(nr_pirqs, sizeof(*pirq_to_irq), GFP_KERNEL); + for (i = 0; i < nr_pirqs; i++) + pirq_to_irq[i] = -1; + + evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), + GFP_KERNEL); + for (i = 0; i < NR_EVENT_CHANNELS; i++) + evtchn_to_irq[i] = -1; init_evtchn_cpu_bindings(); @@ -938,5 +1536,15 @@ void __init xen_init_IRQ(void) for (i = 0; i < NR_EVENT_CHANNELS; i++) mask_evtchn(i); - irq_ctx_init(smp_processor_id()); + if (xen_hvm_domain()) { + xen_callback_vector(); + native_init_IRQ(); + /* pci_xen_hvm_init must be called after native_init_IRQ so that + * __acpi_register_gsi can point at the right function */ + pci_xen_hvm_init(); + } else { + irq_ctx_init(smp_processor_id()); + if (xen_initial_domain()) + xen_setup_pirqs(); + } } diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index f3594ec2ee33..ef11daf0cafe 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -38,7 +38,6 @@ #include <linux/string.h> #include <linux/errno.h> #include <linux/fs.h> -#include <linux/errno.h> #include <linux/miscdevice.h> #include <linux/major.h> #include <linux/proc_fs.h> @@ -46,9 +45,10 @@ #include <linux/poll.h> #include <linux/irq.h> #include <linux/init.h> -#include <linux/gfp.h> #include <linux/mutex.h> #include <linux/cpu.h> + +#include <xen/xen.h> #include <xen/events.h> #include <xen/evtchn.h> #include <asm/xen/hypervisor.h> @@ -470,7 +470,7 @@ static int evtchn_open(struct inode *inode, struct file *filp) filp->private_data = u; - return 0; + return nonseekable_open(inode, filp);; } static int evtchn_release(struct inode *inode, struct file *filp) @@ -513,6 +513,7 @@ static const struct file_operations evtchn_fops = { .fasync = evtchn_fasync, .open = evtchn_open, .release = evtchn_release, + .llseek = no_llseek, }; static struct miscdevice evtchn_miscdev = { diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 7d8f531fb8e8..6c4531816496 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -34,12 +34,16 @@ #include <linux/module.h> #include <linux/sched.h> #include <linux/mm.h> +#include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> +#include <linux/io.h> +#include <xen/xen.h> #include 
<xen/interface/xen.h> #include <xen/page.h> #include <xen/grant_table.h> +#include <xen/interface/memory.h> #include <asm/xen/hypercall.h> #include <asm/pgtable.h> @@ -57,6 +61,8 @@ static unsigned int boot_max_nr_grant_frames; static int gnttab_free_count; static grant_ref_t gnttab_free_head; static DEFINE_SPINLOCK(gnttab_list_lock); +unsigned long xen_hvm_resume_frames; +EXPORT_SYMBOL_GPL(xen_hvm_resume_frames); static struct grant_entry *shared; @@ -431,7 +437,7 @@ static unsigned int __max_nr_grant_frames(void) return query.max_nr_frames; } -static inline unsigned int max_nr_grant_frames(void) +unsigned int gnttab_max_grant_frames(void) { unsigned int xen_max = __max_nr_grant_frames(); @@ -439,6 +445,7 @@ static inline unsigned int max_nr_grant_frames(void) return boot_max_nr_grant_frames; return xen_max; } +EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); static int gnttab_map(unsigned int start_idx, unsigned int end_idx) { @@ -447,6 +454,30 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) unsigned int nr_gframes = end_idx + 1; int rc; + if (xen_hvm_domain()) { + struct xen_add_to_physmap xatp; + unsigned int i = end_idx; + rc = 0; + /* + * Loop backwards, so that the first hypercall has the largest + * index, ensuring that the table will grow only once. + */ + do { + xatp.domid = DOMID_SELF; + xatp.idx = i; + xatp.space = XENMAPSPACE_grant_table; + xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i; + rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); + if (rc != 0) { + printk(KERN_WARNING + "grant table add_to_physmap failed, err=%d\n", rc); + break; + } + } while (i-- > start_idx); + + return rc; + } + frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC); if (!frames) return -ENOMEM; @@ -463,7 +494,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) BUG_ON(rc || setup.status); - rc = arch_gnttab_map_shared(frames, nr_gframes, max_nr_grant_frames(), + rc = arch_gnttab_map_shared(frames, nr_gframes, gnttab_max_grant_frames(), &shared); BUG_ON(rc); @@ -474,9 +505,27 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) int gnttab_resume(void) { - if (max_nr_grant_frames() < nr_grant_frames) + unsigned int max_nr_gframes; + + max_nr_gframes = gnttab_max_grant_frames(); + if (max_nr_gframes < nr_grant_frames) return -ENOSYS; - return gnttab_map(0, nr_grant_frames - 1); + + if (xen_pv_domain()) + return gnttab_map(0, nr_grant_frames - 1); + + if (!shared) { + shared = ioremap(xen_hvm_resume_frames, PAGE_SIZE * max_nr_gframes); + if (shared == NULL) { + printk(KERN_WARNING + "Failed to ioremap gnttab share frames!"); + return -ENOMEM; + } + } + + gnttab_map(0, nr_grant_frames - 1); + + return 0; } int gnttab_suspend(void) @@ -493,7 +542,7 @@ static int gnttab_expand(unsigned int req_entries) cur = nr_grant_frames; extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) / GREFS_PER_GRANT_FRAME); - if (cur + extra > max_nr_grant_frames()) + if (cur + extra > gnttab_max_grant_frames()) return -ENOSPC; rc = gnttab_map(cur, cur + extra - 1); @@ -503,15 +552,12 @@ static int gnttab_expand(unsigned int req_entries) return rc; } -static int __devinit gnttab_init(void) +int gnttab_init(void) { int i; unsigned int max_nr_glist_frames, nr_glist_frames; unsigned int nr_init_grefs; - if (!xen_domain()) - return -ENODEV; - nr_grant_frames = 1; boot_max_nr_grant_frames = __max_nr_grant_frames(); @@ -554,5 +600,18 @@ static int __devinit gnttab_init(void) kfree(gnttab_list); return -ENOMEM; } +EXPORT_SYMBOL_GPL(gnttab_init); + +static 
int __devinit __gnttab_init(void) +{ + /* Delay grant-table initialization in the PV on HVM case */ + if (xen_hvm_domain()) + return 0; + + if (!xen_pv_domain()) + return -ENODEV; + + return gnttab_init(); +} -core_initcall(gnttab_init); +core_initcall(__gnttab_init); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 10d03d7931c4..ef9c7db52077 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -3,11 +3,13 @@ */ #include <linux/kernel.h> #include <linux/err.h> +#include <linux/slab.h> #include <linux/reboot.h> #include <linux/sysrq.h> #include <linux/stop_machine.h> #include <linux/freezer.h> +#include <xen/xen.h> #include <xen/xenbus.h> #include <xen/grant_table.h> #include <xen/events.h> @@ -16,6 +18,7 @@ #include <asm/xen/hypercall.h> #include <asm/xen/page.h> +#include <asm/xen/hypervisor.h> enum shutdown_state { SHUTDOWN_INVALID = -1, @@ -32,10 +35,30 @@ enum shutdown_state { static enum shutdown_state shutting_down = SHUTDOWN_INVALID; #ifdef CONFIG_PM_SLEEP -static int xen_suspend(void *data) +static int xen_hvm_suspend(void *data) { + struct sched_shutdown r = { .reason = SHUTDOWN_suspend }; int *cancelled = data; + + BUG_ON(!irqs_disabled()); + + *cancelled = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r); + + xen_hvm_post_suspend(*cancelled); + gnttab_resume(); + + if (!*cancelled) { + xen_irq_resume(); + xen_timer_resume(); + } + + return 0; +} + +static int xen_suspend(void *data) +{ int err; + int *cancelled = data; BUG_ON(!irqs_disabled()); @@ -43,7 +66,6 @@ static int xen_suspend(void *data) if (err) { printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n", err); - dpm_resume_noirq(PMSG_RESUME); return err; } @@ -69,7 +91,6 @@ static int xen_suspend(void *data) } sysdev_resume(); - dpm_resume_noirq(PMSG_RESUME); return 0; } @@ -88,14 +109,14 @@ static void do_suspend(void) err = freeze_processes(); if (err) { printk(KERN_ERR "xen suspend: freeze failed %d\n", err); - return; + goto out; } #endif err = dpm_suspend_start(PMSG_SUSPEND); if (err) { printk(KERN_ERR "xen suspend: dpm_suspend_start %d\n", err); - goto out; + goto out_thaw; } printk(KERN_DEBUG "suspending xenstore...\n"); @@ -104,31 +125,37 @@ static void do_suspend(void) err = dpm_suspend_noirq(PMSG_SUSPEND); if (err) { printk(KERN_ERR "dpm_suspend_noirq failed: %d\n", err); - goto resume_devices; + goto out_resume; } - err = stop_machine(xen_suspend, &cancelled, cpumask_of(0)); + if (xen_hvm_domain()) + err = stop_machine(xen_hvm_suspend, &cancelled, cpumask_of(0)); + else + err = stop_machine(xen_suspend, &cancelled, cpumask_of(0)); + + dpm_resume_noirq(PMSG_RESUME); + if (err) { printk(KERN_ERR "failed to start xen_suspend: %d\n", err); - goto out; + cancelled = 1; } +out_resume: if (!cancelled) { xen_arch_resume(); xs_resume(); } else xs_suspend_cancel(); - dpm_resume_noirq(PMSG_RESUME); - -resume_devices: dpm_resume_end(PMSG_RESUME); /* Make sure timer events get retriggered on all CPUs */ clock_was_set(); -out: + +out_thaw: #ifdef CONFIG_PREEMPT thaw_processes(); +out: #endif shutting_down = SHUTDOWN_INVALID; } @@ -183,6 +210,7 @@ static void shutdown_handler(struct xenbus_watch *watch, kfree(str); } +#ifdef CONFIG_MAGIC_SYSRQ static void sysrq_handler(struct xenbus_watch *watch, const char **vec, unsigned int len) { @@ -209,18 +237,19 @@ static void sysrq_handler(struct xenbus_watch *watch, const char **vec, goto again; if (sysrq_key != '\0') - handle_sysrq(sysrq_key, NULL); + handle_sysrq(sysrq_key); } -static struct xenbus_watch shutdown_watch = { - .node = "control/shutdown", - 
.callback = shutdown_handler -}; - static struct xenbus_watch sysrq_watch = { .node = "control/sysrq", .callback = sysrq_handler }; +#endif + +static struct xenbus_watch shutdown_watch = { + .node = "control/shutdown", + .callback = shutdown_handler +}; static int setup_shutdown_watcher(void) { @@ -232,11 +261,13 @@ static int setup_shutdown_watcher(void) return err; } +#ifdef CONFIG_MAGIC_SYSRQ err = register_xenbus_watch(&sysrq_watch); if (err) { printk(KERN_ERR "Failed to set sysrq watcher\n"); return err; } +#endif return 0; } @@ -249,7 +280,19 @@ static int shutdown_event(struct notifier_block *notifier, return NOTIFY_DONE; } -static int __init setup_shutdown_event(void) +static int __init __setup_shutdown_event(void) +{ + /* Delay initialization in the PV on HVM case */ + if (xen_hvm_domain()) + return 0; + + if (!xen_pv_domain()) + return -ENODEV; + + return xen_setup_shutdown_event(); +} + +int xen_setup_shutdown_event(void) { static struct notifier_block xenstore_notifier = { .notifier_call = shutdown_event @@ -258,5 +301,6 @@ static int __init setup_shutdown_event(void) return 0; } +EXPORT_SYMBOL_GPL(xen_setup_shutdown_event); -subsys_initcall(setup_shutdown_event); +subsys_initcall(__setup_shutdown_event); diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c new file mode 100644 index 000000000000..cef4bafc07dc --- /dev/null +++ b/drivers/xen/pci.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2009, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + * Author: Weidong Han <weidong.han@intel.com> + */ + +#include <linux/pci.h> +#include <xen/xen.h> +#include <xen/interface/physdev.h> +#include <xen/interface/xen.h> + +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> +#include "../pci/pci.h" + +static int xen_add_device(struct device *dev) +{ + int r; + struct pci_dev *pci_dev = to_pci_dev(dev); + +#ifdef CONFIG_PCI_IOV + if (pci_dev->is_virtfn) { + struct physdev_manage_pci_ext manage_pci_ext = { + .bus = pci_dev->bus->number, + .devfn = pci_dev->devfn, + .is_virtfn = 1, + .physfn.bus = pci_dev->physfn->bus->number, + .physfn.devfn = pci_dev->physfn->devfn, + }; + + r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext, + &manage_pci_ext); + } else +#endif + if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) { + struct physdev_manage_pci_ext manage_pci_ext = { + .bus = pci_dev->bus->number, + .devfn = pci_dev->devfn, + .is_extfn = 1, + }; + + r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext, + &manage_pci_ext); + } else { + struct physdev_manage_pci manage_pci = { + .bus = pci_dev->bus->number, + .devfn = pci_dev->devfn, + }; + + r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add, + &manage_pci); + } + + return r; +} + +static int xen_remove_device(struct device *dev) +{ + int r; + struct pci_dev *pci_dev = to_pci_dev(dev); + struct physdev_manage_pci manage_pci; + + manage_pci.bus = pci_dev->bus->number; + manage_pci.devfn = pci_dev->devfn; + + r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove, + &manage_pci); + + return r; +} + +static int xen_pci_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + int r = 0; + + switch (action) { + case BUS_NOTIFY_ADD_DEVICE: + r = xen_add_device(dev); + break; + case BUS_NOTIFY_DEL_DEVICE: + r = xen_remove_device(dev); + break; + default: + break; + } + + return r; +} + +struct notifier_block device_nb = { + .notifier_call = xen_pci_notifier, +}; + +static int __init register_xen_pci_notifier(void) +{ + if (!xen_initial_domain()) + return 0; + + return bus_register_notifier(&pci_bus_type, &device_nb); +} + +arch_initcall(register_xen_pci_notifier); diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c new file mode 100644 index 000000000000..c01b5ddce529 --- /dev/null +++ b/drivers/xen/platform-pci.c @@ -0,0 +1,207 @@ +/****************************************************************************** + * platform-pci.c + * + * Xen platform PCI device driver + * Copyright (c) 2005, Intel Corporation. + * Copyright (c) 2007, XenSource Inc. + * Copyright (c) 2010, Citrix + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. 
+ * + */ + + +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/pci.h> + +#include <xen/platform_pci.h> +#include <xen/grant_table.h> +#include <xen/xenbus.h> +#include <xen/events.h> +#include <xen/hvm.h> +#include <xen/xen-ops.h> + +#define DRV_NAME "xen-platform-pci" + +MODULE_AUTHOR("ssmith@xensource.com and stefano.stabellini@eu.citrix.com"); +MODULE_DESCRIPTION("Xen platform PCI device"); +MODULE_LICENSE("GPL"); + +static unsigned long platform_mmio; +static unsigned long platform_mmio_alloc; +static unsigned long platform_mmiolen; +static uint64_t callback_via; + +unsigned long alloc_xen_mmio(unsigned long len) +{ + unsigned long addr; + + addr = platform_mmio + platform_mmio_alloc; + platform_mmio_alloc += len; + BUG_ON(platform_mmio_alloc > platform_mmiolen); + + return addr; +} + +static uint64_t get_callback_via(struct pci_dev *pdev) +{ + u8 pin; + int irq; + + irq = pdev->irq; + if (irq < 16) + return irq; /* ISA IRQ */ + + pin = pdev->pin; + + /* We don't know the GSI. Specify the PCI INTx line instead. */ + return ((uint64_t)0x01 << 56) | /* PCI INTx identifier */ + ((uint64_t)pci_domain_nr(pdev->bus) << 32) | + ((uint64_t)pdev->bus->number << 16) | + ((uint64_t)(pdev->devfn & 0xff) << 8) | + ((uint64_t)(pin - 1) & 3); +} + +static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id) +{ + xen_hvm_evtchn_do_upcall(); + return IRQ_HANDLED; +} + +static int xen_allocate_irq(struct pci_dev *pdev) +{ + return request_irq(pdev->irq, do_hvm_evtchn_intr, + IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TRIGGER_RISING, + "xen-platform-pci", pdev); +} + +static int platform_pci_resume(struct pci_dev *pdev) +{ + int err; + if (xen_have_vector_callback) + return 0; + err = xen_set_callback_via(callback_via); + if (err) { + dev_err(&pdev->dev, "platform_pci_resume failure!\n"); + return err; + } + return 0; +} + +static int __devinit platform_pci_init(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + int i, ret; + long ioaddr, iolen; + long mmio_addr, mmio_len; + unsigned int max_nr_gframes; + + i = pci_enable_device(pdev); + if (i) + return i; + + ioaddr = pci_resource_start(pdev, 0); + iolen = pci_resource_len(pdev, 0); + + mmio_addr = pci_resource_start(pdev, 1); + mmio_len = pci_resource_len(pdev, 1); + + if (mmio_addr == 0 || ioaddr == 0) { + dev_err(&pdev->dev, "no resources found\n"); + ret = -ENOENT; + goto pci_out; + } + + if (request_mem_region(mmio_addr, mmio_len, DRV_NAME) == NULL) { + dev_err(&pdev->dev, "MEM I/O resource 0x%lx @ 0x%lx busy\n", + mmio_addr, mmio_len); + ret = -EBUSY; + goto pci_out; + } + + if (request_region(ioaddr, iolen, DRV_NAME) == NULL) { + dev_err(&pdev->dev, "I/O resource 0x%lx @ 0x%lx busy\n", + iolen, ioaddr); + ret = -EBUSY; + goto mem_out; + } + + platform_mmio = mmio_addr; + platform_mmiolen = mmio_len; + + if (!xen_have_vector_callback) { + ret = xen_allocate_irq(pdev); + if (ret) { + dev_warn(&pdev->dev, "request_irq failed err=%d\n", ret); + goto out; + } + callback_via = get_callback_via(pdev); + ret = xen_set_callback_via(callback_via); + if (ret) { + dev_warn(&pdev->dev, "Unable to set the evtchn callback " + "err=%d\n", ret); + goto out; + } + } + + max_nr_gframes = gnttab_max_grant_frames(); + xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); + ret = gnttab_init(); + if (ret) + goto out; + xenbus_probe(NULL); + ret = xen_setup_shutdown_event(); + if (ret) + goto out; + return 0; + +out: + release_region(ioaddr, iolen); +mem_out: + release_mem_region(mmio_addr, 
mmio_len); +pci_out: + pci_disable_device(pdev); + return ret; +} + +static struct pci_device_id platform_pci_tbl[] __devinitdata = { + {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + {0,} +}; + +MODULE_DEVICE_TABLE(pci, platform_pci_tbl); + +static struct pci_driver platform_driver = { + .name = DRV_NAME, + .probe = platform_pci_init, + .id_table = platform_pci_tbl, +#ifdef CONFIG_PM + .resume_early = platform_pci_resume, +#endif +}; + +static int __init platform_pci_module_init(void) +{ + /* no unplug has been done, IGNORE hasn't been specified: just + * return now */ + if (!xen_platform_pci_unplug) + return -ENODEV; + + return pci_register_driver(&platform_driver); +} + +module_init(platform_pci_module_init); diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c new file mode 100644 index 000000000000..54469c3eeacd --- /dev/null +++ b/drivers/xen/swiotlb-xen.c @@ -0,0 +1,515 @@ +/* + * Copyright 2010 + * by Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> + * + * This code provides a IOMMU for Xen PV guests with PCI passthrough. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License v2.0 as published by + * the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * PV guests under Xen are running in an non-contiguous memory architecture. + * + * When PCI pass-through is utilized, this necessitates an IOMMU for + * translating bus (DMA) to virtual and vice-versa and also providing a + * mechanism to have contiguous pages for device drivers operations (say DMA + * operations). + * + * Specifically, under Xen the Linux idea of pages is an illusion. It + * assumes that pages start at zero and go up to the available memory. To + * help with that, the Linux Xen MMU provides a lookup mechanism to + * translate the page frame numbers (PFN) to machine frame numbers (MFN) + * and vice-versa. The MFN are the "real" frame numbers. Furthermore + * memory is not contiguous. Xen hypervisor stitches memory for guests + * from different pools, which means there is no guarantee that PFN==MFN + * and PFN+1==MFN+1. Lastly with Xen 4.0, pages (in debug mode) are + * allocated in descending order (high to low), meaning the guest might + * never get any MFN's under the 4GB mark. + * + */ + +#include <linux/bootmem.h> +#include <linux/dma-mapping.h> +#include <xen/swiotlb-xen.h> +#include <xen/page.h> +#include <xen/xen-ops.h> +/* + * Used to do a quick range check in swiotlb_tbl_unmap_single and + * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this + * API. + */ + +static char *xen_io_tlb_start, *xen_io_tlb_end; +static unsigned long xen_io_tlb_nslabs; +/* + * Quick lookup value of the bus address of the IOTLB. 
+ */
+
+u64 start_dma_addr;
+
+static dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
+{
+ return phys_to_machine(XPADDR(paddr)).maddr;
+}
+
+static phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
+{
+ return machine_to_phys(XMADDR(baddr)).paddr;
+}
+
+static dma_addr_t xen_virt_to_bus(void *address)
+{
+ return xen_phys_to_bus(virt_to_phys(address));
+}
+
+static int check_pages_physically_contiguous(unsigned long pfn,
+ unsigned int offset,
+ size_t length)
+{
+ unsigned long next_mfn;
+ int i;
+ int nr_pages;
+
+ next_mfn = pfn_to_mfn(pfn);
+ nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
+
+ for (i = 1; i < nr_pages; i++) {
+ if (pfn_to_mfn(++pfn) != ++next_mfn)
+ return 0;
+ }
+ return 1;
+}
+
+static int range_straddles_page_boundary(phys_addr_t p, size_t size)
+{
+ unsigned long pfn = PFN_DOWN(p);
+ unsigned int offset = p & ~PAGE_MASK;
+
+ if (offset + size <= PAGE_SIZE)
+ return 0;
+ if (check_pages_physically_contiguous(pfn, offset, size))
+ return 0;
+ return 1;
+}
+
+static int is_xen_swiotlb_buffer(dma_addr_t dma_addr)
+{
+ unsigned long mfn = PFN_DOWN(dma_addr);
+ unsigned long pfn = mfn_to_local_pfn(mfn);
+ phys_addr_t paddr;
+
+ /* If the address is outside our domain, it CAN
+ * have the same virtual address as another address
+ * in our domain. Therefore _only_ check address within our domain.
+ */
+ if (pfn_valid(pfn)) {
+ paddr = PFN_PHYS(pfn);
+ return paddr >= virt_to_phys(xen_io_tlb_start) &&
+ paddr < virt_to_phys(xen_io_tlb_end);
+ }
+ return 0;
+}
+
+static int max_dma_bits = 32;
+
+static int
+xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
+{
+ int i, rc;
+ int dma_bits;
+
+ dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;
+
+ i = 0;
+ do {
+ int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE);
+
+ do {
+ rc = xen_create_contiguous_region(
+ (unsigned long)buf + (i << IO_TLB_SHIFT),
+ get_order(slabs << IO_TLB_SHIFT),
+ dma_bits);
+ } while (rc && dma_bits++ < max_dma_bits);
+ if (rc)
+ return rc;
+
+ i += slabs;
+ } while (i < nslabs);
+ return 0;
+}
+
+void __init xen_swiotlb_init(int verbose)
+{
+ unsigned long bytes;
+ int rc;
+
+ xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT);
+ xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE);
+
+ bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT;
+
+ /*
+ * Get IO TLB memory from any location.
+ */
+ xen_io_tlb_start = alloc_bootmem(bytes);
+ if (!xen_io_tlb_start)
+ panic("Cannot allocate SWIOTLB buffer");
+
+ xen_io_tlb_end = xen_io_tlb_start + bytes;
+ /*
+ * And replace that memory with pages under 4GB.
+ */
+ rc = xen_swiotlb_fixup(xen_io_tlb_start,
+ bytes,
+ xen_io_tlb_nslabs);
+ if (rc)
+ goto error;
+
+ start_dma_addr = xen_virt_to_bus(xen_io_tlb_start);
+ swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose);
+
+ return;
+error:
+ panic("DMA(%d): Failed to exchange pages allocated for DMA with Xen! "\
+ "We either don't have the permission or you do not have enough "\
+ "free memory under 4GB!\n", rc);
+}
+
+void *
+xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
+ dma_addr_t *dma_handle, gfp_t flags)
+{
+ void *ret;
+ int order = get_order(size);
+ u64 dma_mask = DMA_BIT_MASK(32);
+ unsigned long vstart;
+
+ /*
+ * Ignore region specifiers - the kernel's idea of
+ * pseudo-phys memory layout has nothing to do with the
+ * machine physical layout. We can't allocate highmem
+ * because we can't return a pointer to it.
+ */
+ flags &= ~(__GFP_DMA | __GFP_HIGHMEM);
+
+ if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret))
+ return ret;
+
+ vstart = __get_free_pages(flags, order);
+ ret = (void *)vstart;
+
+ if (hwdev && hwdev->coherent_dma_mask)
+ dma_mask = dma_alloc_coherent_mask(hwdev, flags);
+
+ if (ret) {
+ if (xen_create_contiguous_region(vstart, order,
+ fls64(dma_mask)) != 0) {
+ free_pages(vstart, order);
+ return NULL;
+ }
+ memset(ret, 0, size);
+ *dma_handle = virt_to_machine(ret).maddr;
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_alloc_coherent);
+
+void
+xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
+ dma_addr_t dev_addr)
+{
+ int order = get_order(size);
+
+ if (dma_release_from_coherent(hwdev, order, vaddr))
+ return;
+
+ xen_destroy_contiguous_region((unsigned long)vaddr, order);
+ free_pages((unsigned long)vaddr, order);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent);
+
+
+/*
+ * Map a single buffer of the indicated size for DMA in streaming mode. The
+ * physical address to use is returned.
+ *
+ * Once the device is given the dma address, the device owns this memory until
+ * either xen_swiotlb_unmap_page or xen_swiotlb_dma_sync_single is performed.
+ */
+dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction dir,
+ struct dma_attrs *attrs)
+{
+ phys_addr_t phys = page_to_phys(page) + offset;
+ dma_addr_t dev_addr = xen_phys_to_bus(phys);
+ void *map;
+
+ BUG_ON(dir == DMA_NONE);
+ /*
+ * If the address happens to be in the device's DMA window,
+ * we can safely return the device addr and not worry about bounce
+ * buffering it.
+ */
+ if (dma_capable(dev, dev_addr, size) &&
+ !range_straddles_page_boundary(phys, size) && !swiotlb_force)
+ return dev_addr;
+
+ /*
+ * Oh well, have to allocate and map a bounce buffer.
+ */
+ map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir);
+ if (!map)
+ return DMA_ERROR_CODE;
+
+ dev_addr = xen_virt_to_bus(map);
+
+ /*
+ * Ensure that the address returned is DMA'ble
+ */
+ if (!dma_capable(dev, dev_addr, size))
+ panic("map_single: bounce buffer is not DMA'ble");
+
+ return dev_addr;
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_map_page);
+
+/*
+ * Unmap a single streaming mode DMA translation. The dma_addr and size must
+ * match what was provided for in a previous xen_swiotlb_map_page call. All
+ * other usages are undefined.
+ *
+ * After this call, reads by the cpu to the buffer are guaranteed to see
+ * whatever the device wrote there.
+ */
+static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
+ size_t size, enum dma_data_direction dir)
+{
+ phys_addr_t paddr = xen_bus_to_phys(dev_addr);
+
+ BUG_ON(dir == DMA_NONE);
+
+ /* NOTE: We use dev_addr here, not paddr! */
+ if (is_xen_swiotlb_buffer(dev_addr)) {
+ swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir);
+ return;
+ }
+
+ if (dir != DMA_FROM_DEVICE)
+ return;
+
+ /*
+ * phys_to_virt doesn't work with highmem pages but we could
+ * call dma_mark_clean() with highmem pages here. However, we
+ * are fine since dma_mark_clean() is null on POWERPC. We can
+ * make dma_mark_clean() take a physical address if necessary.
+ */ + dma_mark_clean(phys_to_virt(paddr), size); +} + +void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + xen_unmap_single(hwdev, dev_addr, size, dir); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page); + +/* + * Make physical memory consistent for a single streaming mode DMA translation + * after a transfer. + * + * If you perform a xen_swiotlb_map_page() but wish to interrogate the buffer + * using the cpu, yet do not wish to teardown the dma mapping, you must + * call this function before doing so. At the next point you give the dma + * address back to the card, you must first perform a + * xen_swiotlb_dma_sync_for_device, and then the device again owns the buffer + */ +static void +xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + enum dma_sync_target target) +{ + phys_addr_t paddr = xen_bus_to_phys(dev_addr); + + BUG_ON(dir == DMA_NONE); + + /* NOTE: We use dev_addr here, not paddr! */ + if (is_xen_swiotlb_buffer(dev_addr)) { + swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir, + target); + return; + } + + if (dir != DMA_FROM_DEVICE) + return; + + dma_mark_clean(phys_to_virt(paddr), size); +} + +void +xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir) +{ + xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_for_cpu); + +void +xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir) +{ + xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_for_device); + +/* + * Map a set of buffers described by scatterlist in streaming mode for DMA. + * This is the scatter-gather version of the above xen_swiotlb_map_page + * interface. Here the scatter gather list elements are each tagged with the + * appropriate dma address and length. They are obtained via + * sg_dma_{address,length}(SG). + * + * NOTE: An implementation may be able to use a smaller number of + * DMA address/length pairs than there are SG table elements. + * (for example via virtual mapping capabilities) + * The routine returns the number of addr/length pairs actually + * used, at most nents. + * + * Device ownership issues as mentioned above for xen_swiotlb_map_page are the + * same here. + */ +int +xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct scatterlist *sg; + int i; + + BUG_ON(dir == DMA_NONE); + + for_each_sg(sgl, sg, nelems, i) { + phys_addr_t paddr = sg_phys(sg); + dma_addr_t dev_addr = xen_phys_to_bus(paddr); + + if (swiotlb_force || + !dma_capable(hwdev, dev_addr, sg->length) || + range_straddles_page_boundary(paddr, sg->length)) { + void *map = swiotlb_tbl_map_single(hwdev, + start_dma_addr, + sg_phys(sg), + sg->length, dir); + if (!map) { + /* Don't panic here, we expect map_sg users + to do proper error handling. 
*/ + xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, + attrs); + sgl[0].dma_length = 0; + return DMA_ERROR_CODE; + } + sg->dma_address = xen_virt_to_bus(map); + } else + sg->dma_address = dev_addr; + sg->dma_length = sg->length; + } + return nelems; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg_attrs); + +int +xen_swiotlb_map_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, + enum dma_data_direction dir) +{ + return xen_swiotlb_map_sg_attrs(hwdev, sgl, nelems, dir, NULL); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg); + +/* + * Unmap a set of streaming mode DMA translations. Again, cpu read rules + * concerning calls here are the same as for swiotlb_unmap_page() above. + */ +void +xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct scatterlist *sg; + int i; + + BUG_ON(dir == DMA_NONE); + + for_each_sg(sgl, sg, nelems, i) + xen_unmap_single(hwdev, sg->dma_address, sg->dma_length, dir); + +} +EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs); + +void +xen_swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sgl, int nelems, + enum dma_data_direction dir) +{ + return xen_swiotlb_unmap_sg_attrs(hwdev, sgl, nelems, dir, NULL); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg); + +/* + * Make physical memory consistent for a set of streaming mode DMA translations + * after a transfer. + * + * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules + * and usage. + */ +static void +xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir, + enum dma_sync_target target) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nelems, i) + xen_swiotlb_sync_single(hwdev, sg->dma_address, + sg->dma_length, dir, target); +} + +void +xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_sync_sg_for_cpu); + +void +xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_sync_sg_for_device); + +int +xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) +{ + return !dma_addr; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_dma_mapping_error); + +/* + * Return whether the given device DMA address mask can be supported + * properly. For example, if your device can only drive the low 24-bits + * during bus mastering, then you would pass 0x00ffffff as the mask to + * this function. + */ +int +xen_swiotlb_dma_supported(struct device *hwdev, u64 mask) +{ + return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_dma_supported); diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c index 88a60e03ccf0..60f1827a32cb 100644 --- a/drivers/xen/sys-hypervisor.c +++ b/drivers/xen/sys-hypervisor.c @@ -7,6 +7,7 @@ * published by the Free Software Foundation. 
*/ +#include <linux/slab.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/kobject.h> @@ -14,6 +15,7 @@ #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> +#include <xen/xen.h> #include <xen/xenbus.h> #include <xen/interface/xen.h> #include <xen/interface/version.h> @@ -425,7 +427,7 @@ static ssize_t hyp_sysfs_store(struct kobject *kobj, return 0; } -static struct sysfs_ops hyp_sysfs_ops = { +static const struct sysfs_ops hyp_sysfs_ops = { .show = hyp_sysfs_show, .store = hyp_sysfs_store, }; diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 92a1ef80a288..cdacf923e073 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -30,6 +30,7 @@ * IN THE SOFTWARE. */ +#include <linux/slab.h> #include <linux/types.h> #include <linux/vmalloc.h> #include <asm/xen/hypervisor.h> @@ -49,6 +50,8 @@ const char *xenbus_strstate(enum xenbus_state state) [ XenbusStateConnected ] = "Connected", [ XenbusStateClosing ] = "Closing", [ XenbusStateClosed ] = "Closed", + [XenbusStateReconfiguring] = "Reconfiguring", + [XenbusStateReconfigured] = "Reconfigured", }; return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID"; } @@ -132,17 +135,12 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev, } EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt); +static void xenbus_switch_fatal(struct xenbus_device *, int, int, + const char *, ...); -/** - * xenbus_switch_state - * @dev: xenbus device - * @state: new state - * - * Advertise in the store a change of the given driver to the given new_state. - * Return 0 on success, or -errno on error. On error, the device will switch - * to XenbusStateClosing, and the error will be saved in the store. - */ -int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state) +static int +__xenbus_switch_state(struct xenbus_device *dev, + enum xenbus_state state, int depth) { /* We check whether the state is currently set to the given value, and if not, then the state is set. We don't want to unconditionally @@ -151,35 +149,65 @@ int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state) to it, as the device will be tearing down, and we don't want to resurrect that directory. - Note that, because of this cached value of our state, this function - will not work inside a Xenstore transaction (something it was - trying to in the past) because dev->state would not get reset if - the transaction was aborted. - + Note that, because of this cached value of our state, this + function will not take a caller's Xenstore transaction + (something it was trying to in the past) because dev->state + would not get reset if the transaction was aborted. 
*/ + struct xenbus_transaction xbt; int current_state; - int err; + int err, abort; if (state == dev->state) return 0; - err = xenbus_scanf(XBT_NIL, dev->nodename, "state", "%d", - ¤t_state); - if (err != 1) +again: + abort = 1; + + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_switch_fatal(dev, depth, err, "starting transaction"); return 0; + } + + err = xenbus_scanf(xbt, dev->nodename, "state", "%d", ¤t_state); + if (err != 1) + goto abort; - err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%d", state); + err = xenbus_printf(xbt, dev->nodename, "state", "%d", state); if (err) { - if (state != XenbusStateClosing) /* Avoid looping */ - xenbus_dev_fatal(dev, err, "writing new state"); - return err; + xenbus_switch_fatal(dev, depth, err, "writing new state"); + goto abort; } - dev->state = state; + abort = 0; +abort: + err = xenbus_transaction_end(xbt, abort); + if (err) { + if (err == -EAGAIN && !abort) + goto again; + xenbus_switch_fatal(dev, depth, err, "ending transaction"); + } else + dev->state = state; return 0; } + +/** + * xenbus_switch_state + * @dev: xenbus device + * @state: new state + * + * Advertise in the store a change of the given driver to the given new_state. + * Return 0 on success, or -errno on error. On error, the device will switch + * to XenbusStateClosing, and the error will be saved in the store. + */ +int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state) +{ + return __xenbus_switch_state(dev, state, 0); +} + EXPORT_SYMBOL_GPL(xenbus_switch_state); int xenbus_frontend_closed(struct xenbus_device *dev) @@ -283,6 +311,23 @@ void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...) EXPORT_SYMBOL_GPL(xenbus_dev_fatal); /** + * Equivalent to xenbus_dev_fatal(dev, err, fmt, args), but helps + * avoiding recursion within xenbus_switch_state. + */ +static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err, + const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + xenbus_va_dev_error(dev, err, fmt, ap); + va_end(ap); + + if (!depth) + __xenbus_switch_state(dev, XenbusStateClosing, 1); +} + +/** * xenbus_grant_ring * @dev: xenbus device * @ring_mfn: mfn of ring to grant diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index d42e25d5968d..deb9c4ba3a93 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -45,22 +45,30 @@ #include <linux/kthread.h> #include <linux/mutex.h> #include <linux/io.h> +#include <linux/slab.h> #include <asm/page.h> #include <asm/pgtable.h> #include <asm/xen/hypervisor.h> + +#include <xen/xen.h> #include <xen/xenbus.h> #include <xen/events.h> #include <xen/page.h> +#include <xen/platform_pci.h> +#include <xen/hvm.h> + #include "xenbus_comms.h" #include "xenbus_probe.h" int xen_store_evtchn; -EXPORT_SYMBOL(xen_store_evtchn); +EXPORT_SYMBOL_GPL(xen_store_evtchn); struct xenstore_domain_interface *xen_store_interface; +EXPORT_SYMBOL_GPL(xen_store_interface); + static unsigned long xen_store_mfn; static BLOCKING_NOTIFIER_HEAD(xenstore_chain); @@ -454,21 +462,21 @@ static ssize_t xendev_show_nodename(struct device *dev, { return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename); } -DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL); +static DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL); static ssize_t xendev_show_devtype(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype); } -DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL); +static DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL); static ssize_t xendev_show_modalias(struct device *dev, struct device_attribute *attr, char *buf) { return sprintf(buf, "xen:%s\n", to_xenbus_device(dev)->devicetype); } -DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL); +static DEVICE_ATTR(modalias, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_modalias, NULL); int xenbus_probe_node(struct xen_bus_type *bus, const char *type, @@ -766,7 +774,7 @@ EXPORT_SYMBOL_GPL(unregister_xenstore_notifier); void xenbus_probe(struct work_struct *unused) { - BUG_ON((xenstored_ready <= 0)); + xenstored_ready = 1; /* Enumerate devices in xenstore and watch for changes. */ xenbus_probe_devices(&xenbus_frontend); @@ -776,10 +784,26 @@ void xenbus_probe(struct work_struct *unused) /* Notify others that xenstore is up */ blocking_notifier_call_chain(&xenstore_chain, 0, NULL); } +EXPORT_SYMBOL_GPL(xenbus_probe); -static int __init xenbus_probe_init(void) +static int __init xenbus_probe_initcall(void) +{ + if (!xen_domain()) + return -ENODEV; + + if (xen_initial_domain() || xen_hvm_domain()) + return 0; + + xenbus_probe(NULL); + return 0; +} + +device_initcall(xenbus_probe_initcall); + +static int __init xenbus_init(void) { int err = 0; + unsigned long page = 0; DPRINTK(""); @@ -800,13 +824,50 @@ static int __init xenbus_probe_init(void) * Domain0 doesn't have a store_evtchn or store_mfn yet. 
*/ if (xen_initial_domain()) { - /* dom0 not yet supported */ + struct evtchn_alloc_unbound alloc_unbound; + + /* Allocate Xenstore page */ + page = get_zeroed_page(GFP_KERNEL); + if (!page) + goto out_error; + + xen_store_mfn = xen_start_info->store_mfn = + pfn_to_mfn(virt_to_phys((void *)page) >> + PAGE_SHIFT); + + /* Next allocate a local port which xenstored can bind to */ + alloc_unbound.dom = DOMID_SELF; + alloc_unbound.remote_dom = 0; + + err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, + &alloc_unbound); + if (err == -ENOSYS) + goto out_error; + + BUG_ON(err); + xen_store_evtchn = xen_start_info->store_evtchn = + alloc_unbound.port; + + xen_store_interface = mfn_to_virt(xen_store_mfn); } else { - xenstored_ready = 1; - xen_store_evtchn = xen_start_info->store_evtchn; - xen_store_mfn = xen_start_info->store_mfn; + if (xen_hvm_domain()) { + uint64_t v = 0; + err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); + if (err) + goto out_error; + xen_store_evtchn = (int)v; + err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v); + if (err) + goto out_error; + xen_store_mfn = (unsigned long)v; + xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE); + } else { + xen_store_evtchn = xen_start_info->store_evtchn; + xen_store_mfn = xen_start_info->store_mfn; + xen_store_interface = mfn_to_virt(xen_store_mfn); + xenstored_ready = 1; + } } - xen_store_interface = mfn_to_virt(xen_store_mfn); /* Initialize the interface to xenstore. */ err = xs_init(); @@ -816,9 +877,6 @@ static int __init xenbus_probe_init(void) goto out_unreg_back; } - if (!xen_initial_domain()) - xenbus_probe(NULL); - #ifdef CONFIG_XEN_COMPAT_XENFS /* * Create xenfs mountpoint in /proc for compatibility with @@ -836,14 +894,16 @@ static int __init xenbus_probe_init(void) bus_unregister(&xenbus_frontend.bus); out_error: + if (page != 0) + free_page(page); return err; } -postcore_initcall(xenbus_probe_init); +postcore_initcall(xenbus_init); MODULE_LICENSE("GPL"); -static int is_disconnected_device(struct device *dev, void *data) +static int is_device_connecting(struct device *dev, void *data) { struct xenbus_device *xendev = to_xenbus_device(dev); struct device_driver *drv = data; @@ -861,14 +921,15 @@ static int is_disconnected_device(struct device *dev, void *data) return 0; xendrv = to_xenbus_driver(dev->driver); - return (xendev->state != XenbusStateConnected || - (xendrv->is_ready && !xendrv->is_ready(xendev))); + return (xendev->state < XenbusStateConnected || + (xendev->state == XenbusStateConnected && + xendrv->is_ready && !xendrv->is_ready(xendev))); } -static int exists_disconnected_device(struct device_driver *drv) +static int exists_connecting_device(struct device_driver *drv) { return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, - is_disconnected_device); + is_device_connecting); } static int print_device_status(struct device *dev, void *data) @@ -884,10 +945,13 @@ static int print_device_status(struct device *dev, void *data) /* Information only: is this too noisy? 
*/ printk(KERN_INFO "XENBUS: Device with no driver: %s\n", xendev->nodename); - } else if (xendev->state != XenbusStateConnected) { + } else if (xendev->state < XenbusStateConnected) { + enum xenbus_state rstate = XenbusStateUnknown; + if (xendev->otherend) + rstate = xenbus_read_driver_state(xendev->otherend); printk(KERN_WARNING "XENBUS: Timeout connecting " - "to device: %s (state %d)\n", - xendev->nodename, xendev->state); + "to device: %s (local state %d, remote state %d)\n", + xendev->nodename, xendev->state, rstate); } return 0; @@ -897,7 +961,7 @@ static int print_device_status(struct device *dev, void *data) static int ready_to_wait_for_devices; /* - * On a 10 second timeout, wait for all devices currently configured. We need + * On a 5-minute timeout, wait for all devices currently configured. We need * to do this to guarantee that the filesystems and / or network devices * needed for boot are available, before we can allow the boot to proceed. * @@ -912,18 +976,30 @@ static int ready_to_wait_for_devices; */ static void wait_for_devices(struct xenbus_driver *xendrv) { - unsigned long timeout = jiffies + 10*HZ; + unsigned long start = jiffies; struct device_driver *drv = xendrv ? &xendrv->driver : NULL; + unsigned int seconds_waited = 0; if (!ready_to_wait_for_devices || !xen_domain()) return; - while (exists_disconnected_device(drv)) { - if (time_after(jiffies, timeout)) - break; + while (exists_connecting_device(drv)) { + if (time_after(jiffies, start + (seconds_waited+5)*HZ)) { + if (!seconds_waited) + printk(KERN_WARNING "XENBUS: Waiting for " + "devices to initialise: "); + seconds_waited += 5; + printk("%us...", 300 - seconds_waited); + if (seconds_waited == 300) + break; + } + schedule_timeout_interruptible(HZ/10); } + if (seconds_waited) + printk("\n"); + bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, print_device_status); } @@ -931,6 +1007,9 @@ static void wait_for_devices(struct xenbus_driver *xendrv) #ifndef MODULE static int __init boot_wait_for_devices(void) { + if (xen_hvm_domain() && !xen_platform_pci_unplug) + return -ENODEV; + ready_to_wait_for_devices = 1; wait_for_devices(NULL); return 0; diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index eab33f1dbdf7..5534690075af 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -76,6 +76,14 @@ struct xs_handle { /* * Mutex ordering: transaction_mutex -> watch_mutex -> request_mutex. * response_mutex is never taken simultaneously with the other three. + * + * transaction_mutex must be held before incrementing + * transaction_count. The mutex is held when a suspend is in + * progress to prevent new transactions starting. + * + * When decrementing transaction_count to zero the wait queue + * should be woken up, the suspend code waits for count to + * reach zero. */ /* One request at a time. */ @@ -85,7 +93,9 @@ struct xs_handle { struct mutex response_mutex; /* Protect transactions against save/restore. */ - struct rw_semaphore transaction_mutex; + struct mutex transaction_mutex; + atomic_t transaction_count; + wait_queue_head_t transaction_wq; /* Protect watch (de)register against save/restore. 
*/ struct rw_semaphore watch_mutex; @@ -157,6 +167,31 @@ static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) return body; } +static void transaction_start(void) +{ + mutex_lock(&xs_state.transaction_mutex); + atomic_inc(&xs_state.transaction_count); + mutex_unlock(&xs_state.transaction_mutex); +} + +static void transaction_end(void) +{ + if (atomic_dec_and_test(&xs_state.transaction_count)) + wake_up(&xs_state.transaction_wq); +} + +static void transaction_suspend(void) +{ + mutex_lock(&xs_state.transaction_mutex); + wait_event(xs_state.transaction_wq, + atomic_read(&xs_state.transaction_count) == 0); +} + +static void transaction_resume(void) +{ + mutex_unlock(&xs_state.transaction_mutex); +} + void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) { void *ret; @@ -164,7 +199,7 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) int err; if (req_msg.type == XS_TRANSACTION_START) - down_read(&xs_state.transaction_mutex); + transaction_start(); mutex_lock(&xs_state.request_mutex); @@ -180,7 +215,7 @@ void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) if ((msg->type == XS_TRANSACTION_END) || ((req_msg.type == XS_TRANSACTION_START) && (msg->type == XS_ERROR))) - up_read(&xs_state.transaction_mutex); + transaction_end(); return ret; } @@ -432,11 +467,11 @@ int xenbus_transaction_start(struct xenbus_transaction *t) { char *id_str; - down_read(&xs_state.transaction_mutex); + transaction_start(); id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL); if (IS_ERR(id_str)) { - up_read(&xs_state.transaction_mutex); + transaction_end(); return PTR_ERR(id_str); } @@ -461,7 +496,7 @@ int xenbus_transaction_end(struct xenbus_transaction t, int abort) err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL)); - up_read(&xs_state.transaction_mutex); + transaction_end(); return err; } @@ -499,7 +534,7 @@ int xenbus_printf(struct xenbus_transaction t, #define PRINTF_BUFFER_SIZE 4096 char *printf_buffer; - printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL); + printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_NOIO | __GFP_HIGH); if (printf_buffer == NULL) return -ENOMEM; @@ -662,7 +697,7 @@ EXPORT_SYMBOL_GPL(unregister_xenbus_watch); void xs_suspend(void) { - down_write(&xs_state.transaction_mutex); + transaction_suspend(); down_write(&xs_state.watch_mutex); mutex_lock(&xs_state.request_mutex); mutex_lock(&xs_state.response_mutex); @@ -677,7 +712,7 @@ void xs_resume(void) mutex_unlock(&xs_state.response_mutex); mutex_unlock(&xs_state.request_mutex); - up_write(&xs_state.transaction_mutex); + transaction_resume(); /* No need for watches_lock: the watch_mutex is sufficient. 
*/ list_for_each_entry(watch, &watches, list) { @@ -693,7 +728,7 @@ void xs_suspend_cancel(void) mutex_unlock(&xs_state.response_mutex); mutex_unlock(&xs_state.request_mutex); up_write(&xs_state.watch_mutex); - up_write(&xs_state.transaction_mutex); + mutex_unlock(&xs_state.transaction_mutex); } static int xenwatch_thread(void *unused) @@ -843,8 +878,10 @@ int xs_init(void) mutex_init(&xs_state.request_mutex); mutex_init(&xs_state.response_mutex); - init_rwsem(&xs_state.transaction_mutex); + mutex_init(&xs_state.transaction_mutex); init_rwsem(&xs_state.watch_mutex); + atomic_set(&xs_state.transaction_count, 0); + init_waitqueue_head(&xs_state.transaction_wq); /* Initialize the shared memory rings to talk to xenstored */ err = xb_init_comms(); diff --git a/drivers/xen/xencomm.c b/drivers/xen/xencomm.c index a240b2c20b99..b91f8ff50d05 100644 --- a/drivers/xen/xencomm.c +++ b/drivers/xen/xencomm.c @@ -18,8 +18,8 @@ * Authors: Hollis Blanchard <hollisb@us.ibm.com> */ -#include <linux/gfp.h> #include <linux/mm.h> +#include <linux/slab.h> #include <asm/page.h> #include <xen/xencomm.h> #include <xen/interface/xen.h> diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile index 25275c3bbdff..4fde9440fe1f 100644 --- a/drivers/xen/xenfs/Makefile +++ b/drivers/xen/xenfs/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_XENFS) += xenfs.o -xenfs-objs = super.o xenbus.o
\ No newline at end of file
+xenfs-y = super.o xenbus.o privcmd.o
+xenfs-$(CONFIG_XEN_DOM0) += xenstored.o
diff --git a/drivers/xen/xenfs/privcmd.c b/drivers/xen/xenfs/privcmd.c
new file mode 100644
index 000000000000..f80be7f6eb95
--- /dev/null
+++ b/drivers/xen/xenfs/privcmd.c
@@ -0,0 +1,404 @@
+/******************************************************************************
+ * privcmd.c
+ *
+ * Interface to privileged domain-0 commands.
+ *
+ * Copyright (c) 2002-2004, K A Fraser, B Dragovic
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/uaccess.h>
+#include <linux/swap.h>
+#include <linux/smp_lock.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/seq_file.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/privcmd.h>
+#include <xen/interface/xen.h>
+#include <xen/features.h>
+#include <xen/page.h>
+#include <xen/xen-ops.h>
+
+#ifndef HAVE_ARCH_PRIVCMD_MMAP
+static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma);
+#endif
+
+static long privcmd_ioctl_hypercall(void __user *udata)
+{
+ struct privcmd_hypercall hypercall;
+ long ret;
+
+ if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
+ return -EFAULT;
+
+ ret = privcmd_call(hypercall.op,
+ hypercall.arg[0], hypercall.arg[1],
+ hypercall.arg[2], hypercall.arg[3],
+ hypercall.arg[4]);
+
+ return ret;
+}
+
+static void free_page_list(struct list_head *pages)
+{
+ struct page *p, *n;
+
+ list_for_each_entry_safe(p, n, pages, lru)
+ __free_page(p);
+
+ INIT_LIST_HEAD(pages);
+}
+
+/*
+ * Given an array of items in userspace, return a list of pages
+ * containing the data. If copying fails, either because of memory
+ * allocation failure or a problem reading user memory, return an
+ * error code; it's up to the caller to dispose of any partial list.
+ */
+static int gather_array(struct list_head *pagelist,
+ unsigned nelem, size_t size,
+ void __user *data)
+{
+ unsigned pageidx;
+ void *pagedata;
+ int ret;
+
+ if (size > PAGE_SIZE)
+ return 0;
+
+ pageidx = PAGE_SIZE;
+ pagedata = NULL; /* quiet, gcc */
+ while (nelem--) {
+ if (pageidx > PAGE_SIZE-size) {
+ struct page *page = alloc_page(GFP_KERNEL);
+
+ ret = -ENOMEM;
+ if (page == NULL)
+ goto fail;
+
+ pagedata = page_address(page);
+
+ list_add_tail(&page->lru, pagelist);
+ pageidx = 0;
+ }
+
+ ret = -EFAULT;
+ if (copy_from_user(pagedata + pageidx, data, size))
+ goto fail;
+
+ data += size;
+ pageidx += size;
+ }
+
+ ret = 0;
+
+fail:
+ return ret;
+}
+
+/*
+ * Call function "fn" on each element of the array fragmented
+ * over a list of pages.
+ */ +static int traverse_pages(unsigned nelem, size_t size, + struct list_head *pos, + int (*fn)(void *data, void *state), + void *state) +{ + void *pagedata; + unsigned pageidx; + int ret = 0; + + BUG_ON(size > PAGE_SIZE); + + pageidx = PAGE_SIZE; + pagedata = NULL; /* hush, gcc */ + + while (nelem--) { + if (pageidx > PAGE_SIZE-size) { + struct page *page; + pos = pos->next; + page = list_entry(pos, struct page, lru); + pagedata = page_address(page); + pageidx = 0; + } + + ret = (*fn)(pagedata + pageidx, state); + if (ret) + break; + pageidx += size; + } + + return ret; +} + +struct mmap_mfn_state { + unsigned long va; + struct vm_area_struct *vma; + domid_t domain; +}; + +static int mmap_mfn_range(void *data, void *state) +{ + struct privcmd_mmap_entry *msg = data; + struct mmap_mfn_state *st = state; + struct vm_area_struct *vma = st->vma; + int rc; + + /* Do not allow range to wrap the address space. */ + if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) || + ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va)) + return -EINVAL; + + /* Range chunks must be contiguous in va space. */ + if ((msg->va != st->va) || + ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end)) + return -EINVAL; + + rc = xen_remap_domain_mfn_range(vma, + msg->va & PAGE_MASK, + msg->mfn, msg->npages, + vma->vm_page_prot, + st->domain); + if (rc < 0) + return rc; + + st->va += msg->npages << PAGE_SHIFT; + + return 0; +} + +static long privcmd_ioctl_mmap(void __user *udata) +{ + struct privcmd_mmap mmapcmd; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int rc; + LIST_HEAD(pagelist); + struct mmap_mfn_state state; + + if (!xen_initial_domain()) + return -EPERM; + + if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) + return -EFAULT; + + rc = gather_array(&pagelist, + mmapcmd.num, sizeof(struct privcmd_mmap_entry), + mmapcmd.entry); + + if (rc || list_empty(&pagelist)) + goto out; + + down_write(&mm->mmap_sem); + + { + struct page *page = list_first_entry(&pagelist, + struct page, lru); + struct privcmd_mmap_entry *msg = page_address(page); + + vma = find_vma(mm, msg->va); + rc = -EINVAL; + + if (!vma || (msg->va != vma->vm_start) || + !privcmd_enforce_singleshot_mapping(vma)) + goto out_up; + } + + state.va = vma->vm_start; + state.vma = vma; + state.domain = mmapcmd.dom; + + rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry), + &pagelist, + mmap_mfn_range, &state); + + +out_up: + up_write(&mm->mmap_sem); + +out: + free_page_list(&pagelist); + + return rc; +} + +struct mmap_batch_state { + domid_t domain; + unsigned long va; + struct vm_area_struct *vma; + int err; + + xen_pfn_t __user *user; +}; + +static int mmap_batch_fn(void *data, void *state) +{ + xen_pfn_t *mfnp = data; + struct mmap_batch_state *st = state; + + if (xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1, + st->vma->vm_page_prot, st->domain) < 0) { + *mfnp |= 0xf0000000U; + st->err++; + } + st->va += PAGE_SIZE; + + return 0; +} + +static int mmap_return_errors(void *data, void *state) +{ + xen_pfn_t *mfnp = data; + struct mmap_batch_state *st = state; + + put_user(*mfnp, st->user++); + + return 0; +} + +static struct vm_operations_struct privcmd_vm_ops; + +static long privcmd_ioctl_mmap_batch(void __user *udata) +{ + int ret; + struct privcmd_mmapbatch m; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long nr_pages; + LIST_HEAD(pagelist); + struct mmap_batch_state state; + + if (!xen_initial_domain()) + return -EPERM; + + if (copy_from_user(&m, udata, 
sizeof(m))) + return -EFAULT; + + nr_pages = m.num; + if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) + return -EINVAL; + + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), + m.arr); + + if (ret || list_empty(&pagelist)) + goto out; + + down_write(&mm->mmap_sem); + + vma = find_vma(mm, m.addr); + ret = -EINVAL; + if (!vma || + vma->vm_ops != &privcmd_vm_ops || + (m.addr != vma->vm_start) || + ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) || + !privcmd_enforce_singleshot_mapping(vma)) { + up_write(&mm->mmap_sem); + goto out; + } + + state.domain = m.dom; + state.vma = vma; + state.va = m.addr; + state.err = 0; + + ret = traverse_pages(m.num, sizeof(xen_pfn_t), + &pagelist, mmap_batch_fn, &state); + + up_write(&mm->mmap_sem); + + if (state.err > 0) { + ret = 0; + + state.user = m.arr; + traverse_pages(m.num, sizeof(xen_pfn_t), + &pagelist, + mmap_return_errors, &state); + } + +out: + free_page_list(&pagelist); + + return ret; +} + +static long privcmd_ioctl(struct file *file, + unsigned int cmd, unsigned long data) +{ + int ret = -ENOSYS; + void __user *udata = (void __user *) data; + + switch (cmd) { + case IOCTL_PRIVCMD_HYPERCALL: + ret = privcmd_ioctl_hypercall(udata); + break; + + case IOCTL_PRIVCMD_MMAP: + ret = privcmd_ioctl_mmap(udata); + break; + + case IOCTL_PRIVCMD_MMAPBATCH: + ret = privcmd_ioctl_mmap_batch(udata); + break; + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +#ifndef HAVE_ARCH_PRIVCMD_MMAP +static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", + vma, vma->vm_start, vma->vm_end, + vmf->pgoff, vmf->virtual_address); + + return VM_FAULT_SIGBUS; +} + +static struct vm_operations_struct privcmd_vm_ops = { + .fault = privcmd_fault +}; + +static int privcmd_mmap(struct file *file, struct vm_area_struct *vma) +{ + /* Unsupported for auto-translate guests. */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return -ENOSYS; + + /* DONTCOPY is essential for Xen as copy_page_range is broken. 
*/ + vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; + vma->vm_ops = &privcmd_vm_ops; + vma->vm_private_data = NULL; + + return 0; +} + +static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma) +{ + return (xchg(&vma->vm_private_data, (void *)1) == NULL); +} +#endif + +const struct file_operations privcmd_file_ops = { + .unlocked_ioctl = privcmd_ioctl, + .mmap = privcmd_mmap, +}; diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 6559e0c752ce..f6339d11d59c 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -12,6 +12,10 @@ #include <linux/module.h> #include <linux/fs.h> #include <linux/magic.h> +#include <linux/mm.h> +#include <linux/backing-dev.h> + +#include <xen/xen.h> #include "xenfs.h" @@ -20,6 +24,62 @@ MODULE_DESCRIPTION("Xen filesystem"); MODULE_LICENSE("GPL"); +static int xenfs_set_page_dirty(struct page *page) +{ + return !TestSetPageDirty(page); +} + +static const struct address_space_operations xenfs_aops = { + .set_page_dirty = xenfs_set_page_dirty, +}; + +static struct backing_dev_info xenfs_backing_dev_info = { + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, +}; + +static struct inode *xenfs_make_inode(struct super_block *sb, int mode) +{ + struct inode *ret = new_inode(sb); + + if (ret) { + ret->i_mode = mode; + ret->i_mapping->a_ops = &xenfs_aops; + ret->i_mapping->backing_dev_info = &xenfs_backing_dev_info; + ret->i_uid = ret->i_gid = 0; + ret->i_blocks = 0; + ret->i_atime = ret->i_mtime = ret->i_ctime = CURRENT_TIME; + } + return ret; +} + +static struct dentry *xenfs_create_file(struct super_block *sb, + struct dentry *parent, + const char *name, + const struct file_operations *fops, + void *data, + int mode) +{ + struct dentry *dentry; + struct inode *inode; + + dentry = d_alloc_name(parent, name); + if (!dentry) + return NULL; + + inode = xenfs_make_inode(sb, S_IFREG | mode); + if (!inode) { + dput(dentry); + return NULL; + } + + inode->i_fop = fops; + inode->i_private = data; + + d_add(dentry, inode); + return dentry; +} + static ssize_t capabilities_read(struct file *file, char __user *buf, size_t size, loff_t *off) { @@ -33,6 +93,7 @@ static ssize_t capabilities_read(struct file *file, char __user *buf, static const struct file_operations capabilities_file_ops = { .read = capabilities_read, + .llseek = default_llseek, }; static int xenfs_fill_super(struct super_block *sb, void *data, int silent) @@ -41,38 +102,65 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent) [1] = {}, { "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR }, { "capabilities", &capabilities_file_ops, S_IRUGO }, + { "privcmd", &privcmd_file_ops, S_IRUSR|S_IWUSR }, {""}, }; + int rc; + + rc = simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files); + if (rc < 0) + return rc; + + if (xen_initial_domain()) { + xenfs_create_file(sb, sb->s_root, "xsd_kva", + &xsd_kva_file_ops, NULL, S_IRUSR|S_IWUSR); + xenfs_create_file(sb, sb->s_root, "xsd_port", + &xsd_port_file_ops, NULL, S_IRUSR|S_IWUSR); + } - return simple_fill_super(sb, XENFS_SUPER_MAGIC, xenfs_files); + return rc; } -static int xenfs_get_sb(struct file_system_type *fs_type, +static int xenfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data, struct vfsmount *mnt) + void *data) { - return get_sb_single(fs_type, flags, data, xenfs_fill_super, mnt); + return mount_single(fs_type, flags, data, xenfs_fill_super); } static struct file_system_type xenfs_type = { .owner = THIS_MODULE, .name = 
"xenfs", - .get_sb = xenfs_get_sb, + .mount = xenfs_mount, .kill_sb = kill_litter_super, }; static int __init xenfs_init(void) { - if (xen_pv_domain()) - return register_filesystem(&xenfs_type); + int err; + if (!xen_domain()) { + printk(KERN_INFO "xenfs: not registering filesystem on non-xen platform\n"); + return 0; + } + + err = register_filesystem(&xenfs_type); + if (err) { + printk(KERN_ERR "xenfs: Unable to register filesystem!\n"); + goto out; + } + + err = bdi_init(&xenfs_backing_dev_info); + if (err) + unregister_filesystem(&xenfs_type); + + out: - printk(KERN_INFO "XENFS: not registering filesystem on non-xen platform\n"); - return 0; + return err; } static void __exit xenfs_exit(void) { - if (xen_pv_domain()) + if (xen_domain()) unregister_filesystem(&xenfs_type); } diff --git a/drivers/xen/xenfs/xenbus.c b/drivers/xen/xenfs/xenbus.c index a9592d981b10..1c1236087f78 100644 --- a/drivers/xen/xenfs/xenbus.c +++ b/drivers/xen/xenfs/xenbus.c @@ -43,6 +43,7 @@ #include <linux/fs.h> #include <linux/poll.h> #include <linux/mutex.h> +#include <linux/sched.h> #include <linux/spinlock.h> #include <linux/mount.h> #include <linux/pagemap.h> @@ -50,6 +51,7 @@ #include <linux/init.h> #include <linux/namei.h> #include <linux/string.h> +#include <linux/slab.h> #include "xenfs.h" #include "../xenbus/xenbus_comms.h" @@ -122,6 +124,9 @@ static ssize_t xenbus_file_read(struct file *filp, mutex_lock(&u->reply_mutex); while (list_empty(&u->read_buffers)) { mutex_unlock(&u->reply_mutex); + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + ret = wait_event_interruptible(u->read_waitq, !list_empty(&u->read_buffers)); if (ret) @@ -589,4 +594,5 @@ const struct file_operations xenbus_file_ops = { .open = xenbus_file_open, .release = xenbus_file_release, .poll = xenbus_file_poll, + .llseek = no_llseek, }; diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h index 51f08b2d0bf1..b68aa6200003 100644 --- a/drivers/xen/xenfs/xenfs.h +++ b/drivers/xen/xenfs/xenfs.h @@ -2,5 +2,8 @@ #define _XENFS_XENBUS_H extern const struct file_operations xenbus_file_ops; +extern const struct file_operations privcmd_file_ops; +extern const struct file_operations xsd_kva_file_ops; +extern const struct file_operations xsd_port_file_ops; #endif /* _XENFS_XENBUS_H */ diff --git a/drivers/xen/xenfs/xenstored.c b/drivers/xen/xenfs/xenstored.c new file mode 100644 index 000000000000..fef20dbc6a5c --- /dev/null +++ b/drivers/xen/xenfs/xenstored.c @@ -0,0 +1,68 @@ +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/fs.h> + +#include <xen/page.h> + +#include "xenfs.h" +#include "../xenbus/xenbus_comms.h" + +static ssize_t xsd_read(struct file *file, char __user *buf, + size_t size, loff_t *off) +{ + const char *str = (const char *)file->private_data; + return simple_read_from_buffer(buf, size, off, str, strlen(str)); +} + +static int xsd_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static int xsd_kva_open(struct inode *inode, struct file *file) +{ + file->private_data = (void *)kasprintf(GFP_KERNEL, "0x%p", + xen_store_interface); + if (!file->private_data) + return -ENOMEM; + return 0; +} + +static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma) +{ + size_t size = vma->vm_end - vma->vm_start; + + if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0)) + return -EINVAL; + + if (remap_pfn_range(vma, vma->vm_start, + virt_to_pfn(xen_store_interface), + size, vma->vm_page_prot)) + return -EAGAIN; + + return 0; +} + +const 
struct file_operations xsd_kva_file_ops = { + .open = xsd_kva_open, + .mmap = xsd_kva_mmap, + .read = xsd_read, + .release = xsd_release, +}; + +static int xsd_port_open(struct inode *inode, struct file *file) +{ + file->private_data = (void *)kasprintf(GFP_KERNEL, "%d", + xen_store_evtchn); + if (!file->private_data) + return -ENOMEM; + return 0; +} + +const struct file_operations xsd_port_file_ops = { + .open = xsd_port_open, + .read = xsd_read, + .release = xsd_release, +}; |
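
The xsd_kva and xsd_port nodes added above are meant for the user-space xenstore daemon running in dom0: xsd_port reports the xenstore event-channel number as decimal text, and xsd_kva can be mmap'ed to reach the shared xenstore page. As a rough illustrative sketch (not part of the patch), a dom0 process could consume them as below; the /proc/xen path assumes the XEN_COMPAT_XENFS compatibility mount point mentioned earlier in the series, and error handling is kept minimal.

/* Illustrative user-space sketch only -- not part of the patch above. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	char buf[32];
	ssize_t n;
	int fd, port;
	void *ring;

	/* xsd_port returns the xenstore event-channel number as text. */
	fd = open("/proc/xen/xsd_port", O_RDONLY);
	if (fd < 0)
		return 1;
	n = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (n <= 0)
		return 1;
	buf[n] = '\0';
	port = atoi(buf);

	/* xsd_kva allows a single page to be mapped: the shared xenstore ring. */
	fd = open("/proc/xen/xsd_kva", O_RDWR);
	if (fd < 0)
		return 1;
	ring = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED)
		return 1;

	printf("xenstore evtchn %d, ring mapped at %p\n", port, ring);
	return 0;
}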