diff options
Diffstat (limited to 'drivers/xen')
-rw-r--r-- | drivers/xen/Kconfig | 10 | ||||
-rw-r--r-- | drivers/xen/Makefile | 6 | ||||
-rw-r--r-- | drivers/xen/balloon.c | 359 | ||||
-rw-r--r-- | drivers/xen/events.c | 479 | ||||
-rw-r--r-- | drivers/xen/gntalloc.c | 545 | ||||
-rw-r--r-- | drivers/xen/gntdev.c | 382 | ||||
-rw-r--r-- | drivers/xen/grant-table.c | 10 | ||||
-rw-r--r-- | drivers/xen/xen-balloon.c | 256 |
8 files changed, 1484 insertions, 563 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 07bec09d1dad..a59638b37c1a 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -76,10 +76,20 @@ config XEN_XENBUS_FRONTEND config XEN_GNTDEV tristate "userspace grant access device driver" depends on XEN + default m select MMU_NOTIFIER help Allows userspace processes to use grants. +config XEN_GRANT_DEV_ALLOC + tristate "User-space grant reference allocator driver" + depends on XEN + default m + help + Allows userspace processes to create pages with access granted + to other domains. This can be used to implement frontend drivers + or as part of an inter-domain shared memory channel. + config XEN_PLATFORM_PCI tristate "xen platform pci device driver" depends on XEN_PVHVM && PCI diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 5088cc2e6fe2..f420f1ff7f13 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,4 +1,4 @@ -obj-y += grant-table.o features.o events.o manage.o +obj-y += grant-table.o features.o events.o manage.o balloon.o obj-y += xenbus/ nostackp := $(call cc-option, -fno-stack-protector) @@ -7,9 +7,10 @@ CFLAGS_features.o := $(nostackp) obj-$(CONFIG_BLOCK) += biomerge.o obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_XENCOMM) += xencomm.o -obj-$(CONFIG_XEN_BALLOON) += balloon.o +obj-$(CONFIG_XEN_BALLOON) += xen-balloon.o obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o +obj-$(CONFIG_XEN_GRANT_DEV_ALLOC) += xen-gntalloc.o obj-$(CONFIG_XENFS) += xenfs/ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o @@ -18,5 +19,6 @@ obj-$(CONFIG_XEN_DOM0) += pci.o xen-evtchn-y := evtchn.o xen-gntdev-y := gntdev.o +xen-gntalloc-y := gntalloc.o xen-platform-pci-y := platform-pci.o diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 718050ace08f..043af8ad6b60 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -1,6 +1,4 @@ 
/****************************************************************************** - * balloon.c - * * Xen balloon driver - enables returning/claiming memory to/from Xen. * * Copyright (c) 2003, B Dragovic @@ -33,7 +31,6 @@ */ #include <linux/kernel.h> -#include <linux/module.h> #include <linux/sched.h> #include <linux/errno.h> #include <linux/mm.h> @@ -42,13 +39,11 @@ #include <linux/highmem.h> #include <linux/mutex.h> #include <linux/list.h> -#include <linux/sysdev.h> #include <linux/gfp.h> #include <asm/page.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> -#include <asm/uaccess.h> #include <asm/tlb.h> #include <asm/e820.h> @@ -58,35 +53,29 @@ #include <xen/xen.h> #include <xen/interface/xen.h> #include <xen/interface/memory.h> -#include <xen/xenbus.h> +#include <xen/balloon.h> #include <xen/features.h> #include <xen/page.h> -#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) - -#define BALLOON_CLASS_NAME "xen_memory" +/* + * balloon_process() state: + * + * BP_DONE: done or nothing to do, + * BP_EAGAIN: error, go to sleep, + * BP_ECANCELED: error, balloon operation canceled. + */ -struct balloon_stats { - /* We aim for 'current allocation' == 'target allocation'. */ - unsigned long current_pages; - unsigned long target_pages; - /* - * Drivers may alter the memory reservation independently, but they - * must inform the balloon driver so we avoid hitting the hard limit. - */ - unsigned long driver_pages; - /* Number of pages in high- and low-memory balloons. 
*/ - unsigned long balloon_low; - unsigned long balloon_high; +enum bp_state { + BP_DONE, + BP_EAGAIN, + BP_ECANCELED }; -static DEFINE_MUTEX(balloon_mutex); - -static struct sys_device balloon_sysdev; -static int register_balloon(struct sys_device *sysdev); +static DEFINE_MUTEX(balloon_mutex); -static struct balloon_stats balloon_stats; +struct balloon_stats balloon_stats; +EXPORT_SYMBOL_GPL(balloon_stats); /* We increase/decrease in batches which fit in a page */ static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; @@ -104,8 +93,7 @@ static LIST_HEAD(ballooned_pages); /* Main work function, always executed in process context. */ static void balloon_process(struct work_struct *work); -static DECLARE_WORK(balloon_worker, balloon_process); -static struct timer_list balloon_timer; +static DECLARE_DELAYED_WORK(balloon_worker, balloon_process); /* When ballooning out (allocating memory to return to Xen) we don't really want the kernel to try too hard since that can trigger the oom killer. */ @@ -140,14 +128,17 @@ static void balloon_append(struct page *page) } /* balloon_retrieve: rescue a page from the balloon, if it is not empty. 
*/ -static struct page *balloon_retrieve(void) +static struct page *balloon_retrieve(bool prefer_highmem) { struct page *page; if (list_empty(&ballooned_pages)) return NULL; - page = list_entry(ballooned_pages.next, struct page, lru); + if (prefer_highmem) + page = list_entry(ballooned_pages.prev, struct page, lru); + else + page = list_entry(ballooned_pages.next, struct page, lru); list_del(&page->lru); if (PageHighMem(page)) { @@ -177,9 +168,29 @@ static struct page *balloon_next_page(struct page *page) return list_entry(next, struct page, lru); } -static void balloon_alarm(unsigned long unused) +static enum bp_state update_schedule(enum bp_state state) { - schedule_work(&balloon_worker); + if (state == BP_DONE) { + balloon_stats.schedule_delay = 1; + balloon_stats.retry_count = 1; + return BP_DONE; + } + + ++balloon_stats.retry_count; + + if (balloon_stats.max_retry_count != RETRY_UNLIMITED && + balloon_stats.retry_count > balloon_stats.max_retry_count) { + balloon_stats.schedule_delay = 1; + balloon_stats.retry_count = 1; + return BP_ECANCELED; + } + + balloon_stats.schedule_delay <<= 1; + + if (balloon_stats.schedule_delay > balloon_stats.max_schedule_delay) + balloon_stats.schedule_delay = balloon_stats.max_schedule_delay; + + return BP_EAGAIN; } static unsigned long current_target(void) @@ -194,11 +205,11 @@ static unsigned long current_target(void) return target; } -static int increase_reservation(unsigned long nr_pages) +static enum bp_state increase_reservation(unsigned long nr_pages) { + int rc; unsigned long pfn, i; struct page *page; - long rc; struct xen_memory_reservation reservation = { .address_bits = 0, .extent_order = 0, @@ -210,7 +221,10 @@ static int increase_reservation(unsigned long nr_pages) page = balloon_first_page(); for (i = 0; i < nr_pages; i++) { - BUG_ON(page == NULL); + if (!page) { + nr_pages = i; + break; + } frame_list[i] = page_to_pfn(page); page = balloon_next_page(page); } @@ -218,11 +232,11 @@ static int 
increase_reservation(unsigned long nr_pages) set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); - if (rc < 0) - goto out; + if (rc <= 0) + return BP_EAGAIN; for (i = 0; i < rc; i++) { - page = balloon_retrieve(); + page = balloon_retrieve(false); BUG_ON(page == NULL); pfn = page_to_pfn(page); @@ -249,15 +263,14 @@ static int increase_reservation(unsigned long nr_pages) balloon_stats.current_pages += rc; - out: - return rc < 0 ? rc : rc != nr_pages; + return BP_DONE; } -static int decrease_reservation(unsigned long nr_pages) +static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) { + enum bp_state state = BP_DONE; unsigned long pfn, i; struct page *page; - int need_sleep = 0; int ret; struct xen_memory_reservation reservation = { .address_bits = 0, @@ -269,9 +282,9 @@ static int decrease_reservation(unsigned long nr_pages) nr_pages = ARRAY_SIZE(frame_list); for (i = 0; i < nr_pages; i++) { - if ((page = alloc_page(GFP_BALLOON)) == NULL) { + if ((page = alloc_page(gfp)) == NULL) { nr_pages = i; - need_sleep = 1; + state = BP_EAGAIN; break; } @@ -307,7 +320,7 @@ static int decrease_reservation(unsigned long nr_pages) balloon_stats.current_pages -= nr_pages; - return need_sleep; + return state; } /* @@ -318,77 +331,101 @@ static int decrease_reservation(unsigned long nr_pages) */ static void balloon_process(struct work_struct *work) { - int need_sleep = 0; + enum bp_state state = BP_DONE; long credit; mutex_lock(&balloon_mutex); do { credit = current_target() - balloon_stats.current_pages; + if (credit > 0) - need_sleep = (increase_reservation(credit) != 0); + state = increase_reservation(credit); + if (credit < 0) - need_sleep = (decrease_reservation(-credit) != 0); + state = decrease_reservation(-credit, GFP_BALLOON); + + state = update_schedule(state); #ifndef CONFIG_PREEMPT if (need_resched()) schedule(); #endif - } while ((credit != 
0) && !need_sleep); + } while (credit && state == BP_DONE); /* Schedule more work if there is some still to be done. */ - if (current_target() != balloon_stats.current_pages) - mod_timer(&balloon_timer, jiffies + HZ); + if (state == BP_EAGAIN) + schedule_delayed_work(&balloon_worker, balloon_stats.schedule_delay * HZ); mutex_unlock(&balloon_mutex); } /* Resets the Xen limit, sets new target, and kicks off processing. */ -static void balloon_set_new_target(unsigned long target) +void balloon_set_new_target(unsigned long target) { /* No need for lock. Not read-modify-write updates. */ balloon_stats.target_pages = target; - schedule_work(&balloon_worker); + schedule_delayed_work(&balloon_worker, 0); } +EXPORT_SYMBOL_GPL(balloon_set_new_target); -static struct xenbus_watch target_watch = -{ - .node = "memory/target" -}; - -/* React to a change in the target key */ -static void watch_target(struct xenbus_watch *watch, - const char **vec, unsigned int len) +/** + * alloc_xenballooned_pages - get pages that have been ballooned out + * @nr_pages: Number of pages to get + * @pages: pages returned + * @return 0 on success, error otherwise + */ +int alloc_xenballooned_pages(int nr_pages, struct page** pages) { - unsigned long long new_target; - int err; - - err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target); - if (err != 1) { - /* This is ok (for domain0 at least) - so just return */ - return; + int pgno = 0; + struct page* page; + mutex_lock(&balloon_mutex); + while (pgno < nr_pages) { + page = balloon_retrieve(true); + if (page) { + pages[pgno++] = page; + } else { + enum bp_state st; + st = decrease_reservation(nr_pages - pgno, GFP_HIGHUSER); + if (st != BP_DONE) + goto out_undo; + } } - - /* The given memory/target value is in KiB, so it needs converting to - * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. 
- */ - balloon_set_new_target(new_target >> (PAGE_SHIFT - 10)); + mutex_unlock(&balloon_mutex); + return 0; + out_undo: + while (pgno) + balloon_append(pages[--pgno]); + /* Free the memory back to the kernel soon */ + schedule_delayed_work(&balloon_worker, 0); + mutex_unlock(&balloon_mutex); + return -ENOMEM; } +EXPORT_SYMBOL(alloc_xenballooned_pages); -static int balloon_init_watcher(struct notifier_block *notifier, - unsigned long event, - void *data) +/** + * free_xenballooned_pages - return pages retrieved with get_ballooned_pages + * @nr_pages: Number of pages + * @pages: pages to return + */ +void free_xenballooned_pages(int nr_pages, struct page** pages) { - int err; + int i; - err = register_xenbus_watch(&target_watch); - if (err) - printk(KERN_ERR "Failed to set balloon watcher\n"); + mutex_lock(&balloon_mutex); - return NOTIFY_DONE; -} + for (i = 0; i < nr_pages; i++) { + if (pages[i]) + balloon_append(pages[i]); + } + + /* The balloon may be too large now. Shrink it if needed. 
*/ + if (current_target() != balloon_stats.current_pages) + schedule_delayed_work(&balloon_worker, 0); -static struct notifier_block xenstore_notifier; + mutex_unlock(&balloon_mutex); +} +EXPORT_SYMBOL(free_xenballooned_pages); static int __init balloon_init(void) { @@ -398,7 +435,7 @@ static int __init balloon_init(void) if (!xen_domain()) return -ENODEV; - pr_info("xen_balloon: Initialising balloon driver.\n"); + pr_info("xen/balloon: Initialising balloon driver.\n"); if (xen_pv_domain()) nr_pages = xen_start_info->nr_pages; @@ -408,13 +445,11 @@ static int __init balloon_init(void) balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; - balloon_stats.driver_pages = 0UL; - - init_timer(&balloon_timer); - balloon_timer.data = 0; - balloon_timer.function = balloon_alarm; - register_balloon(&balloon_sysdev); + balloon_stats.schedule_delay = 1; + balloon_stats.max_schedule_delay = 32; + balloon_stats.retry_count = 1; + balloon_stats.max_retry_count = RETRY_UNLIMITED; /* * Initialise the balloon with excess memory space. We need @@ -436,153 +471,9 @@ static int __init balloon_init(void) __balloon_append(page); } - target_watch.callback = watch_target; - xenstore_notifier.notifier_call = balloon_init_watcher; - - register_xenstore_notifier(&xenstore_notifier); - return 0; } subsys_initcall(balloon_init); -static void balloon_exit(void) -{ - /* XXX - release balloon here */ - return; -} - -module_exit(balloon_exit); - -#define BALLOON_SHOW(name, format, args...) 
\ - static ssize_t show_##name(struct sys_device *dev, \ - struct sysdev_attribute *attr, \ - char *buf) \ - { \ - return sprintf(buf, format, ##args); \ - } \ - static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL) - -BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages)); -BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low)); -BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high)); -BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(balloon_stats.driver_pages)); - -static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr, - char *buf) -{ - return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages)); -} - -static ssize_t store_target_kb(struct sys_device *dev, - struct sysdev_attribute *attr, - const char *buf, - size_t count) -{ - char *endchar; - unsigned long long target_bytes; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - target_bytes = simple_strtoull(buf, &endchar, 0) * 1024; - - balloon_set_new_target(target_bytes >> PAGE_SHIFT); - - return count; -} - -static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR, - show_target_kb, store_target_kb); - - -static ssize_t show_target(struct sys_device *dev, struct sysdev_attribute *attr, - char *buf) -{ - return sprintf(buf, "%llu\n", - (unsigned long long)balloon_stats.target_pages - << PAGE_SHIFT); -} - -static ssize_t store_target(struct sys_device *dev, - struct sysdev_attribute *attr, - const char *buf, - size_t count) -{ - char *endchar; - unsigned long long target_bytes; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - target_bytes = memparse(buf, &endchar); - - balloon_set_new_target(target_bytes >> PAGE_SHIFT); - - return count; -} - -static SYSDEV_ATTR(target, S_IRUGO | S_IWUSR, - show_target, store_target); - - -static struct sysdev_attribute *balloon_attrs[] = { - &attr_target_kb, - &attr_target, -}; - -static struct attribute *balloon_info_attrs[] = { - &attr_current_kb.attr, - &attr_low_kb.attr, - &attr_high_kb.attr, - 
&attr_driver_kb.attr, - NULL -}; - -static struct attribute_group balloon_info_group = { - .name = "info", - .attrs = balloon_info_attrs, -}; - -static struct sysdev_class balloon_sysdev_class = { - .name = BALLOON_CLASS_NAME, -}; - -static int register_balloon(struct sys_device *sysdev) -{ - int i, error; - - error = sysdev_class_register(&balloon_sysdev_class); - if (error) - return error; - - sysdev->id = 0; - sysdev->cls = &balloon_sysdev_class; - - error = sysdev_register(sysdev); - if (error) { - sysdev_class_unregister(&balloon_sysdev_class); - return error; - } - - for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) { - error = sysdev_create_file(sysdev, balloon_attrs[i]); - if (error) - goto fail; - } - - error = sysfs_create_group(&sysdev->kobj, &balloon_info_group); - if (error) - goto fail; - - return 0; - - fail: - while (--i >= 0) - sysdev_remove_file(sysdev, balloon_attrs[i]); - sysdev_unregister(sysdev); - sysdev_class_unregister(&balloon_sysdev_class); - return error; -} - MODULE_LICENSE("GPL"); diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 149fa875e396..02b5a9c05cfa 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -56,6 +56,8 @@ */ static DEFINE_SPINLOCK(irq_mapping_update_lock); +static LIST_HEAD(xen_irq_list_head); + /* IRQ <-> VIRQ mapping. */ static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; @@ -85,7 +87,9 @@ enum xen_irq_type { */ struct irq_info { + struct list_head list; enum xen_irq_type type; /* type */ + unsigned irq; unsigned short evtchn; /* event channel */ unsigned short cpu; /* cpu bound */ @@ -103,23 +107,10 @@ struct irq_info #define PIRQ_NEEDS_EOI (1 << 0) #define PIRQ_SHAREABLE (1 << 1) -static struct irq_info *irq_info; -static int *pirq_to_irq; - static int *evtchn_to_irq; -struct cpu_evtchn_s { - unsigned long bits[NR_EVENT_CHANNELS/BITS_PER_LONG]; -}; -static __initdata struct cpu_evtchn_s init_evtchn_mask = { - .bits[0 ... 
(NR_EVENT_CHANNELS/BITS_PER_LONG)-1] = ~0ul, -}; -static struct cpu_evtchn_s __refdata *cpu_evtchn_mask_p = &init_evtchn_mask; - -static inline unsigned long *cpu_evtchn_mask(int cpu) -{ - return cpu_evtchn_mask_p[cpu].bits; -} +static DEFINE_PER_CPU(unsigned long [NR_EVENT_CHANNELS/BITS_PER_LONG], + cpu_evtchn_mask); /* Xen will never allocate port zero for any purpose. */ #define VALID_EVTCHN(chn) ((chn) != 0) @@ -128,46 +119,86 @@ static struct irq_chip xen_dynamic_chip; static struct irq_chip xen_percpu_chip; static struct irq_chip xen_pirq_chip; -/* Constructor for packed IRQ information. */ -static struct irq_info mk_unbound_info(void) +/* Get info for IRQ */ +static struct irq_info *info_for_irq(unsigned irq) { - return (struct irq_info) { .type = IRQT_UNBOUND }; + return get_irq_data(irq); } -static struct irq_info mk_evtchn_info(unsigned short evtchn) +/* Constructors for packed IRQ information. */ +static void xen_irq_info_common_init(struct irq_info *info, + unsigned irq, + enum xen_irq_type type, + unsigned short evtchn, + unsigned short cpu) { - return (struct irq_info) { .type = IRQT_EVTCHN, .evtchn = evtchn, - .cpu = 0 }; + + BUG_ON(info->type != IRQT_UNBOUND && info->type != type); + + info->type = type; + info->irq = irq; + info->evtchn = evtchn; + info->cpu = cpu; + + evtchn_to_irq[evtchn] = irq; } -static struct irq_info mk_ipi_info(unsigned short evtchn, enum ipi_vector ipi) +static void xen_irq_info_evtchn_init(unsigned irq, + unsigned short evtchn) { - return (struct irq_info) { .type = IRQT_IPI, .evtchn = evtchn, - .cpu = 0, .u.ipi = ipi }; + struct irq_info *info = info_for_irq(irq); + + xen_irq_info_common_init(info, irq, IRQT_EVTCHN, evtchn, 0); } -static struct irq_info mk_virq_info(unsigned short evtchn, unsigned short virq) +static void xen_irq_info_ipi_init(unsigned cpu, + unsigned irq, + unsigned short evtchn, + enum ipi_vector ipi) { - return (struct irq_info) { .type = IRQT_VIRQ, .evtchn = evtchn, - .cpu = 0, .u.virq = virq }; + 
struct irq_info *info = info_for_irq(irq); + + xen_irq_info_common_init(info, irq, IRQT_IPI, evtchn, 0); + + info->u.ipi = ipi; + + per_cpu(ipi_to_irq, cpu)[ipi] = irq; } -static struct irq_info mk_pirq_info(unsigned short evtchn, unsigned short pirq, - unsigned short gsi, unsigned short vector) +static void xen_irq_info_virq_init(unsigned cpu, + unsigned irq, + unsigned short evtchn, + unsigned short virq) { - return (struct irq_info) { .type = IRQT_PIRQ, .evtchn = evtchn, - .cpu = 0, - .u.pirq = { .pirq = pirq, .gsi = gsi, .vector = vector } }; + struct irq_info *info = info_for_irq(irq); + + xen_irq_info_common_init(info, irq, IRQT_VIRQ, evtchn, 0); + + info->u.virq = virq; + + per_cpu(virq_to_irq, cpu)[virq] = irq; } -/* - * Accessors for packed IRQ information. - */ -static struct irq_info *info_for_irq(unsigned irq) +static void xen_irq_info_pirq_init(unsigned irq, + unsigned short evtchn, + unsigned short pirq, + unsigned short gsi, + unsigned short vector, + unsigned char flags) { - return &irq_info[irq]; + struct irq_info *info = info_for_irq(irq); + + xen_irq_info_common_init(info, irq, IRQT_PIRQ, evtchn, 0); + + info->u.pirq.pirq = pirq; + info->u.pirq.gsi = gsi; + info->u.pirq.vector = vector; + info->u.pirq.flags = flags; } +/* + * Accessors for packed IRQ information. 
+ */ static unsigned int evtchn_from_irq(unsigned irq) { if (unlikely(WARN(irq < 0 || irq >= nr_irqs, "Invalid irq %d!\n", irq))) @@ -212,26 +243,6 @@ static unsigned pirq_from_irq(unsigned irq) return info->u.pirq.pirq; } -static unsigned gsi_from_irq(unsigned irq) -{ - struct irq_info *info = info_for_irq(irq); - - BUG_ON(info == NULL); - BUG_ON(info->type != IRQT_PIRQ); - - return info->u.pirq.gsi; -} - -static unsigned vector_from_irq(unsigned irq) -{ - struct irq_info *info = info_for_irq(irq); - - BUG_ON(info == NULL); - BUG_ON(info->type != IRQT_PIRQ); - - return info->u.pirq.vector; -} - static enum xen_irq_type type_from_irq(unsigned irq) { return info_for_irq(irq)->type; @@ -267,7 +278,7 @@ static inline unsigned long active_evtchns(unsigned int cpu, unsigned int idx) { return (sh->evtchn_pending[idx] & - cpu_evtchn_mask(cpu)[idx] & + per_cpu(cpu_evtchn_mask, cpu)[idx] & ~sh->evtchn_mask[idx]); } @@ -280,28 +291,28 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) cpumask_copy(irq_to_desc(irq)->irq_data.affinity, cpumask_of(cpu)); #endif - clear_bit(chn, cpu_evtchn_mask(cpu_from_irq(irq))); - set_bit(chn, cpu_evtchn_mask(cpu)); + clear_bit(chn, per_cpu(cpu_evtchn_mask, cpu_from_irq(irq))); + set_bit(chn, per_cpu(cpu_evtchn_mask, cpu)); - irq_info[irq].cpu = cpu; + info_for_irq(irq)->cpu = cpu; } static void init_evtchn_cpu_bindings(void) { int i; #ifdef CONFIG_SMP - struct irq_desc *desc; + struct irq_info *info; /* By default all event channels notify CPU#0. */ - for_each_irq_desc(i, desc) { + list_for_each_entry(info, &xen_irq_list_head, list) { + struct irq_desc *desc = irq_to_desc(info->irq); cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); } #endif for_each_possible_cpu(i) - memset(cpu_evtchn_mask(i), - (i == 0) ? ~0 : 0, sizeof(struct cpu_evtchn_s)); - + memset(per_cpu(cpu_evtchn_mask, i), + (i == 0) ? 
~0 : 0, sizeof(*per_cpu(cpu_evtchn_mask, i))); } static inline void clear_evtchn(int port) @@ -376,7 +387,28 @@ static void unmask_evtchn(int port) put_cpu(); } -static int xen_allocate_irq_dynamic(void) +static void xen_irq_init(unsigned irq) +{ + struct irq_info *info; + struct irq_desc *desc = irq_to_desc(irq); + +#ifdef CONFIG_SMP + /* By default all event channels notify CPU#0. */ + cpumask_copy(desc->irq_data.affinity, cpumask_of(0)); +#endif + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (info == NULL) + panic("Unable to allocate metadata for IRQ%d\n", irq); + + info->type = IRQT_UNBOUND; + + set_irq_data(irq, info); + + list_add_tail(&info->list, &xen_irq_list_head); +} + +static int __must_check xen_allocate_irq_dynamic(void) { int first = 0; int irq; @@ -393,22 +425,14 @@ static int xen_allocate_irq_dynamic(void) first = get_nr_irqs_gsi(); #endif -retry: irq = irq_alloc_desc_from(first, -1); - if (irq == -ENOMEM && first > NR_IRQS_LEGACY) { - printk(KERN_ERR "Out of dynamic IRQ space and eating into GSI space. You should increase nr_irqs\n"); - first = max(NR_IRQS_LEGACY, first - NR_IRQS_LEGACY); - goto retry; - } - - if (irq < 0) - panic("No available IRQ to bind to: increase nr_irqs!\n"); + xen_irq_init(irq); return irq; } -static int xen_allocate_irq_gsi(unsigned gsi) +static int __must_check xen_allocate_irq_gsi(unsigned gsi) { int irq; @@ -423,17 +447,25 @@ static int xen_allocate_irq_gsi(unsigned gsi) /* Legacy IRQ descriptors are already allocated by the arch. */ if (gsi < NR_IRQS_LEGACY) - return gsi; + irq = gsi; + else + irq = irq_alloc_desc_at(gsi, -1); - irq = irq_alloc_desc_at(gsi, -1); - if (irq < 0) - panic("Unable to allocate to IRQ%d (%d)\n", gsi, irq); + xen_irq_init(irq); return irq; } static void xen_free_irq(unsigned irq) { + struct irq_info *info = get_irq_data(irq); + + list_del(&info->list); + + set_irq_data(irq, NULL); + + kfree(info); + /* Legacy IRQ descriptors are managed by the arch. 
*/ if (irq < NR_IRQS_LEGACY) return; @@ -563,51 +595,39 @@ static void ack_pirq(struct irq_data *data) static int find_irq_by_gsi(unsigned gsi) { - int irq; + struct irq_info *info; - for (irq = 0; irq < nr_irqs; irq++) { - struct irq_info *info = info_for_irq(irq); - - if (info == NULL || info->type != IRQT_PIRQ) + list_for_each_entry(info, &xen_irq_list_head, list) { + if (info->type != IRQT_PIRQ) continue; - if (gsi_from_irq(irq) == gsi) - return irq; + if (info->u.pirq.gsi == gsi) + return info->irq; } return -1; } -int xen_allocate_pirq(unsigned gsi, int shareable, char *name) +int xen_allocate_pirq_gsi(unsigned gsi) { - return xen_map_pirq_gsi(gsi, gsi, shareable, name); + return gsi; } -/* xen_map_pirq_gsi might allocate irqs from the top down, as a - * consequence don't assume that the irq number returned has a low value - * or can be used as a pirq number unless you know otherwise. - * - * One notable exception is when xen_map_pirq_gsi is called passing an - * hardware gsi as argument, in that case the irq number returned - * matches the gsi number passed as second argument. +/* + * Do not make any assumptions regarding the relationship between the + * IRQ number returned here and the Xen pirq argument. * * Note: We don't assign an event channel until the irq actually started * up. Return an existing irq if we've already got one for the gsi. */ -int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name) +int xen_bind_pirq_gsi_to_irq(unsigned gsi, + unsigned pirq, int shareable, char *name) { - int irq = 0; + int irq = -1; struct physdev_irq irq_op; spin_lock(&irq_mapping_update_lock); - if ((pirq > nr_irqs) || (gsi > nr_irqs)) { - printk(KERN_WARNING "xen_map_pirq_gsi: %s %s is incorrect!\n", - pirq > nr_irqs ? "pirq" :"", - gsi > nr_irqs ? 
"gsi" : ""); - goto out; - } - irq = find_irq_by_gsi(gsi); if (irq != -1) { printk(KERN_INFO "xen_map_pirq_gsi: returning irq %d for gsi %u\n", @@ -616,6 +636,8 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name) } irq = xen_allocate_irq_gsi(gsi); + if (irq < 0) + goto out; set_irq_chip_and_handler_name(irq, &xen_pirq_chip, handle_level_irq, name); @@ -633,9 +655,8 @@ int xen_map_pirq_gsi(unsigned pirq, unsigned gsi, int shareable, char *name) goto out; } - irq_info[irq] = mk_pirq_info(0, pirq, gsi, irq_op.vector); - irq_info[irq].u.pirq.flags |= shareable ? PIRQ_SHAREABLE : 0; - pirq_to_irq[pirq] = irq; + xen_irq_info_pirq_init(irq, 0, pirq, gsi, irq_op.vector, + shareable ? PIRQ_SHAREABLE : 0); out: spin_unlock(&irq_mapping_update_lock); @@ -672,9 +693,8 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, set_irq_chip_and_handler_name(irq, &xen_pirq_chip, handle_level_irq, name); - irq_info[irq] = mk_pirq_info(0, pirq, 0, vector); - pirq_to_irq[pirq] = irq; - ret = set_irq_msi(irq, msidesc); + xen_irq_info_pirq_init(irq, 0, pirq, 0, vector, 0); + ret = irq_set_msi_desc(irq, msidesc); if (ret < 0) goto error_irq; out: @@ -709,9 +729,6 @@ int xen_destroy_irq(int irq) goto out; } } - pirq_to_irq[info->u.pirq.pirq] = -1; - - irq_info[irq] = mk_unbound_info(); xen_free_irq(irq); @@ -720,19 +737,26 @@ out: return rc; } -int xen_vector_from_irq(unsigned irq) +int xen_irq_from_pirq(unsigned pirq) { - return vector_from_irq(irq); -} + int irq; -int xen_gsi_from_irq(unsigned irq) -{ - return gsi_from_irq(irq); -} + struct irq_info *info; -int xen_irq_from_pirq(unsigned pirq) -{ - return pirq_to_irq[pirq]; + spin_lock(&irq_mapping_update_lock); + + list_for_each_entry(info, &xen_irq_list_head, list) { + if (info == NULL || info->type != IRQT_PIRQ) + continue; + irq = info->irq; + if (info->u.pirq.pirq == pirq) + goto out; + } + irq = -1; +out: + spin_unlock(&irq_mapping_update_lock); + + return irq; } int 
bind_evtchn_to_irq(unsigned int evtchn) @@ -745,14 +769,16 @@ int bind_evtchn_to_irq(unsigned int evtchn) if (irq == -1) { irq = xen_allocate_irq_dynamic(); + if (irq == -1) + goto out; set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, handle_fasteoi_irq, "event"); - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_evtchn_info(evtchn); + xen_irq_info_evtchn_init(irq, evtchn); } +out: spin_unlock(&irq_mapping_update_lock); return irq; @@ -782,9 +808,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) BUG(); evtchn = bind_ipi.port; - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_ipi_info(evtchn, ipi); - per_cpu(ipi_to_irq, cpu)[ipi] = irq; + xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); bind_evtchn_to_cpu(evtchn, cpu); } @@ -794,6 +818,21 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) return irq; } +static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, + unsigned int remote_port) +{ + struct evtchn_bind_interdomain bind_interdomain; + int err; + + bind_interdomain.remote_dom = remote_domain; + bind_interdomain.remote_port = remote_port; + + err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, + &bind_interdomain); + + return err ? 
: bind_evtchn_to_irq(bind_interdomain.local_port); +} + int bind_virq_to_irq(unsigned int virq, unsigned int cpu) { @@ -806,6 +845,8 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu) if (irq == -1) { irq = xen_allocate_irq_dynamic(); + if (irq == -1) + goto out; set_irq_chip_and_handler_name(irq, &xen_percpu_chip, handle_percpu_irq, "virq"); @@ -817,14 +858,12 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu) BUG(); evtchn = bind_virq.port; - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_virq_info(evtchn, virq); - - per_cpu(virq_to_irq, cpu)[virq] = irq; + xen_irq_info_virq_init(cpu, irq, evtchn, virq); bind_evtchn_to_cpu(evtchn, cpu); } +out: spin_unlock(&irq_mapping_update_lock); return irq; @@ -861,11 +900,9 @@ static void unbind_from_irq(unsigned int irq) evtchn_to_irq[evtchn] = -1; } - if (irq_info[irq].type != IRQT_UNBOUND) { - irq_info[irq] = mk_unbound_info(); + BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); - xen_free_irq(irq); - } + xen_free_irq(irq); spin_unlock(&irq_mapping_update_lock); } @@ -879,6 +916,8 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn, int retval; irq = bind_evtchn_to_irq(evtchn); + if (irq < 0) + return irq; retval = request_irq(irq, handler, irqflags, devname, dev_id); if (retval != 0) { unbind_from_irq(irq); @@ -889,6 +928,29 @@ int bind_evtchn_to_irqhandler(unsigned int evtchn, } EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); +int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, + unsigned int remote_port, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + int irq, retval; + + irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port); + if (irq < 0) + return irq; + + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler); + int bind_virq_to_irqhandler(unsigned int virq, unsigned int 
cpu, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) @@ -897,6 +959,8 @@ int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, int retval; irq = bind_virq_to_irq(virq, cpu); + if (irq < 0) + return irq; retval = request_irq(irq, handler, irqflags, devname, dev_id); if (retval != 0) { unbind_from_irq(irq); @@ -948,7 +1012,7 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) { struct shared_info *sh = HYPERVISOR_shared_info; int cpu = smp_processor_id(); - unsigned long *cpu_evtchn = cpu_evtchn_mask(cpu); + unsigned long *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); int i; unsigned long flags; static DEFINE_SPINLOCK(debug_lock); @@ -1026,6 +1090,13 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) } static DEFINE_PER_CPU(unsigned, xed_nesting_count); +static DEFINE_PER_CPU(unsigned int, current_word_idx); +static DEFINE_PER_CPU(unsigned int, current_bit_idx); + +/* + * Mask out the i least significant bits of w + */ +#define MASK_LSBS(w, i) (w & ((~0UL) << i)) /* * Search the CPUs pending events bitmasks. 
For each one found, map @@ -1038,6 +1109,9 @@ static DEFINE_PER_CPU(unsigned, xed_nesting_count); */ static void __xen_evtchn_do_upcall(void) { + int start_word_idx, start_bit_idx; + int word_idx, bit_idx; + int i; int cpu = get_cpu(); struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); @@ -1056,17 +1130,57 @@ static void __xen_evtchn_do_upcall(void) wmb(); #endif pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); - while (pending_words != 0) { + + start_word_idx = __this_cpu_read(current_word_idx); + start_bit_idx = __this_cpu_read(current_bit_idx); + + word_idx = start_word_idx; + + for (i = 0; pending_words != 0; i++) { unsigned long pending_bits; - int word_idx = __ffs(pending_words); - pending_words &= ~(1UL << word_idx); + unsigned long words; + + words = MASK_LSBS(pending_words, word_idx); - while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) { - int bit_idx = __ffs(pending_bits); - int port = (word_idx * BITS_PER_LONG) + bit_idx; - int irq = evtchn_to_irq[port]; + /* + * If we masked out all events, wrap to beginning. + */ + if (words == 0) { + word_idx = 0; + bit_idx = 0; + continue; + } + word_idx = __ffs(words); + + pending_bits = active_evtchns(cpu, s, word_idx); + bit_idx = 0; /* usually scan entire word from start */ + if (word_idx == start_word_idx) { + /* We scan the starting word in two parts */ + if (i == 0) + /* 1st time: start in the middle */ + bit_idx = start_bit_idx; + else + /* 2nd time: mask bits done already */ + bit_idx &= (1UL << start_bit_idx) - 1; + } + + do { + unsigned long bits; + int port, irq; struct irq_desc *desc; + bits = MASK_LSBS(pending_bits, bit_idx); + + /* If we masked out all events, move on. */ + if (bits == 0) + break; + + bit_idx = __ffs(bits); + + /* Process port. 
*/ + port = (word_idx * BITS_PER_LONG) + bit_idx; + irq = evtchn_to_irq[port]; + mask_evtchn(port); clear_evtchn(port); @@ -1075,7 +1189,21 @@ static void __xen_evtchn_do_upcall(void) if (desc) generic_handle_irq_desc(irq, desc); } - } + + bit_idx = (bit_idx + 1) % BITS_PER_LONG; + + /* Next caller starts at last processed + 1 */ + __this_cpu_write(current_word_idx, + bit_idx ? word_idx : + (word_idx+1) % BITS_PER_LONG); + __this_cpu_write(current_bit_idx, bit_idx); + } while (bit_idx != 0); + + /* Scan start_l1i twice; all others once. */ + if ((word_idx != start_word_idx) || (i != 0)) + pending_words &= ~(1UL << word_idx); + + word_idx = (word_idx + 1) % BITS_PER_LONG; } BUG_ON(!irqs_disabled()); @@ -1125,8 +1253,7 @@ void rebind_evtchn_irq(int evtchn, int irq) so there should be a proper type */ BUG_ON(info->type == IRQT_UNBOUND); - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_evtchn_info(evtchn); + xen_irq_info_evtchn_init(irq, evtchn); spin_unlock(&irq_mapping_update_lock); @@ -1143,10 +1270,14 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) struct evtchn_bind_vcpu bind_vcpu; int evtchn = evtchn_from_irq(irq); - /* events delivered via platform PCI interrupts are always - * routed to vcpu 0 */ - if (!VALID_EVTCHN(evtchn) || - (xen_hvm_domain() && !xen_have_vector_callback)) + if (!VALID_EVTCHN(evtchn)) + return -1; + + /* + * Events delivered via platform PCI interrupts are always + * routed to vcpu 0 and hence cannot be rebound. + */ + if (xen_hvm_domain() && !xen_have_vector_callback) return -1; /* Send future instances of this interrupt to other vcpu. 
*/ @@ -1233,19 +1364,22 @@ static int retrigger_dynirq(struct irq_data *data) return ret; } -static void restore_cpu_pirqs(void) +static void restore_pirqs(void) { int pirq, rc, irq, gsi; struct physdev_map_pirq map_irq; + struct irq_info *info; - for (pirq = 0; pirq < nr_irqs; pirq++) { - irq = pirq_to_irq[pirq]; - if (irq == -1) + list_for_each_entry(info, &xen_irq_list_head, list) { + if (info->type != IRQT_PIRQ) continue; + pirq = info->u.pirq.pirq; + gsi = info->u.pirq.gsi; + irq = info->irq; + /* save/restore of PT devices doesn't work, so at this point the * only devices present are GSI based emulated devices */ - gsi = gsi_from_irq(irq); if (!gsi) continue; @@ -1258,8 +1392,7 @@ static void restore_cpu_pirqs(void) if (rc) { printk(KERN_WARNING "xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n", gsi, irq, pirq, rc); - irq_info[irq] = mk_unbound_info(); - pirq_to_irq[pirq] = -1; + xen_free_irq(irq); continue; } @@ -1289,8 +1422,7 @@ static void restore_cpu_virqs(unsigned int cpu) evtchn = bind_virq.port; /* Record the new mapping. */ - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_virq_info(evtchn, virq); + xen_irq_info_virq_init(cpu, irq, evtchn, virq); bind_evtchn_to_cpu(evtchn, cpu); } } @@ -1314,8 +1446,7 @@ static void restore_cpu_ipis(unsigned int cpu) evtchn = bind_ipi.port; /* Record the new mapping. */ - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_ipi_info(evtchn, ipi); + xen_irq_info_ipi_init(cpu, irq, evtchn, ipi); bind_evtchn_to_cpu(evtchn, cpu); } } @@ -1375,7 +1506,8 @@ void xen_poll_irq(int irq) void xen_irq_resume(void) { - unsigned int cpu, irq, evtchn; + unsigned int cpu, evtchn; + struct irq_info *info; init_evtchn_cpu_bindings(); @@ -1384,8 +1516,8 @@ void xen_irq_resume(void) mask_evtchn(evtchn); /* No IRQ <-> event-channel mappings. 
*/ - for (irq = 0; irq < nr_irqs; irq++) - irq_info[irq].evtchn = 0; /* zap event-channel binding */ + list_for_each_entry(info, &xen_irq_list_head, list) + info->evtchn = 0; /* zap event-channel binding */ for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) evtchn_to_irq[evtchn] = -1; @@ -1395,7 +1527,7 @@ void xen_irq_resume(void) restore_cpu_ipis(cpu); } - restore_cpu_pirqs(); + restore_pirqs(); } static struct irq_chip xen_dynamic_chip __read_mostly = { @@ -1481,17 +1613,6 @@ void __init xen_init_IRQ(void) { int i; - cpu_evtchn_mask_p = kcalloc(nr_cpu_ids, sizeof(struct cpu_evtchn_s), - GFP_KERNEL); - irq_info = kcalloc(nr_irqs, sizeof(*irq_info), GFP_KERNEL); - - /* We are using nr_irqs as the maximum number of pirq available but - * that number is actually chosen by Xen and we don't know exactly - * what it is. Be careful choosing high pirq numbers. */ - pirq_to_irq = kcalloc(nr_irqs, sizeof(*pirq_to_irq), GFP_KERNEL); - for (i = 0; i < nr_irqs; i++) - pirq_to_irq[i] = -1; - evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), GFP_KERNEL); for (i = 0; i < NR_EVENT_CHANNELS; i++) diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c new file mode 100644 index 000000000000..a7ffdfe19fc9 --- /dev/null +++ b/drivers/xen/gntalloc.c @@ -0,0 +1,545 @@ +/****************************************************************************** + * gntalloc.c + * + * Device for creating grant references (in user-space) that may be shared + * with other domains. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * This driver exists to allow userspace programs in Linux to allocate kernel + * memory that will later be shared with another domain. Without this device, + * Linux userspace programs cannot create grant references. + * + * How this stuff works: + * X -> granting a page to Y + * Y -> mapping the grant from X + * + * 1. X uses the gntalloc device to allocate a page of kernel memory, P. + * 2. X creates an entry in the grant table that says domid(Y) can access P. + * This is done without a hypercall unless the grant table needs expansion. + * 3. X gives the grant reference identifier, GREF, to Y. + * 4. Y maps the page, either directly into kernel memory for use in a backend + * driver, or via the gntdev device to map into the address space of an + * application running in Y. This is the first point at which Xen does any + * tracking of the page. + * 5. A program in X mmap()s a segment of the gntalloc device that corresponds + * to the shared page, and can now communicate with Y over the shared page. + * + * + * NOTE TO USERSPACE LIBRARIES: + * The grant allocation and mmap()ing are, naturally, two separate operations. + * You set up the sharing by calling the create ioctl() and then the mmap(). + * Teardown requires munmap() and either close() or ioctl(). + * + * WARNING: Since Xen does not allow a guest to forcibly end the use of a grant + * reference, this device can be used to consume kernel memory by leaving grant + * references mapped by another domain when an application exits. Therefore, + * there is a global limit on the number of pages that can be allocated. When + * all references to the page are unmapped, it will be freed during the next + * grant operation. 
+ */ + +#include <linux/atomic.h> +#include <linux/module.h> +#include <linux/miscdevice.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/device.h> +#include <linux/mm.h> +#include <linux/uaccess.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/highmem.h> + +#include <xen/xen.h> +#include <xen/page.h> +#include <xen/grant_table.h> +#include <xen/gntalloc.h> +#include <xen/events.h> + +static int limit = 1024; +module_param(limit, int, 0644); +MODULE_PARM_DESC(limit, "Maximum number of grants that may be allocated by " + "the gntalloc device"); + +static LIST_HEAD(gref_list); +static DEFINE_SPINLOCK(gref_lock); +static int gref_size; + +struct notify_info { + uint16_t pgoff:12; /* Bits 0-11: Offset of the byte to clear */ + uint16_t flags:2; /* Bits 12-13: Unmap notification flags */ + int event; /* Port (event channel) to notify */ +}; + +/* Metadata on a grant reference. */ +struct gntalloc_gref { + struct list_head next_gref; /* list entry gref_list */ + struct list_head next_file; /* list entry file->list, if open */ + struct page *page; /* The shared page */ + uint64_t file_index; /* File offset for mmap() */ + unsigned int users; /* Use count - when zero, waiting on Xen */ + grant_ref_t gref_id; /* The grant reference number */ + struct notify_info notify; /* Unmap notification */ +}; + +struct gntalloc_file_private_data { + struct list_head list; + uint64_t index; +}; + +static void __del_gref(struct gntalloc_gref *gref); + +static void do_cleanup(void) +{ + struct gntalloc_gref *gref, *n; + list_for_each_entry_safe(gref, n, &gref_list, next_gref) { + if (!gref->users) + __del_gref(gref); + } +} + +static int add_grefs(struct ioctl_gntalloc_alloc_gref *op, + uint32_t *gref_ids, struct gntalloc_file_private_data *priv) +{ + int i, rc, readonly; + LIST_HEAD(queue_gref); + LIST_HEAD(queue_file); + struct gntalloc_gref *gref; + + readonly = !(op->flags & 
GNTALLOC_FLAG_WRITABLE); + rc = -ENOMEM; + for (i = 0; i < op->count; i++) { + gref = kzalloc(sizeof(*gref), GFP_KERNEL); + if (!gref) + goto undo; + list_add_tail(&gref->next_gref, &queue_gref); + list_add_tail(&gref->next_file, &queue_file); + gref->users = 1; + gref->file_index = op->index + i * PAGE_SIZE; + gref->page = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (!gref->page) + goto undo; + + /* Grant foreign access to the page. */ + gref->gref_id = gnttab_grant_foreign_access(op->domid, + pfn_to_mfn(page_to_pfn(gref->page)), readonly); + if (gref->gref_id < 0) { + rc = gref->gref_id; + goto undo; + } + gref_ids[i] = gref->gref_id; + } + + /* Add to gref lists. */ + spin_lock(&gref_lock); + list_splice_tail(&queue_gref, &gref_list); + list_splice_tail(&queue_file, &priv->list); + spin_unlock(&gref_lock); + + return 0; + +undo: + spin_lock(&gref_lock); + gref_size -= (op->count - i); + + list_for_each_entry(gref, &queue_file, next_file) { + /* __del_gref does not remove from queue_file */ + __del_gref(gref); + } + + /* It's possible for the target domain to map the just-allocated grant + * references by blindly guessing their IDs; if this is done, then + * __del_gref will leave them in the queue_gref list. They need to be + * added to the global list so that we can free them when they are no + * longer referenced. 
+ */ + if (unlikely(!list_empty(&queue_gref))) + list_splice_tail(&queue_gref, &gref_list); + spin_unlock(&gref_lock); + return rc; +} + +static void __del_gref(struct gntalloc_gref *gref) +{ + if (gref->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { + uint8_t *tmp = kmap(gref->page); + tmp[gref->notify.pgoff] = 0; + kunmap(gref->page); + } + if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT) + notify_remote_via_evtchn(gref->notify.event); + + gref->notify.flags = 0; + + if (gref->gref_id > 0) { + if (gnttab_query_foreign_access(gref->gref_id)) + return; + + if (!gnttab_end_foreign_access_ref(gref->gref_id, 0)) + return; + } + + gref_size--; + list_del(&gref->next_gref); + + if (gref->page) + __free_page(gref->page); + + kfree(gref); +} + +/* finds contiguous grant references in a file, returns the first */ +static struct gntalloc_gref *find_grefs(struct gntalloc_file_private_data *priv, + uint64_t index, uint32_t count) +{ + struct gntalloc_gref *rv = NULL, *gref; + list_for_each_entry(gref, &priv->list, next_file) { + if (gref->file_index == index && !rv) + rv = gref; + if (rv) { + if (gref->file_index != index) + return NULL; + index += PAGE_SIZE; + count--; + if (count == 0) + return rv; + } + } + return NULL; +} + +/* + * ------------------------------------- + * File operations. 
+ * ------------------------------------- + */ +static int gntalloc_open(struct inode *inode, struct file *filp) +{ + struct gntalloc_file_private_data *priv; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + goto out_nomem; + INIT_LIST_HEAD(&priv->list); + + filp->private_data = priv; + + pr_debug("%s: priv %p\n", __func__, priv); + + return 0; + +out_nomem: + return -ENOMEM; +} + +static int gntalloc_release(struct inode *inode, struct file *filp) +{ + struct gntalloc_file_private_data *priv = filp->private_data; + struct gntalloc_gref *gref; + + pr_debug("%s: priv %p\n", __func__, priv); + + spin_lock(&gref_lock); + while (!list_empty(&priv->list)) { + gref = list_entry(priv->list.next, + struct gntalloc_gref, next_file); + list_del(&gref->next_file); + gref->users--; + if (gref->users == 0) + __del_gref(gref); + } + kfree(priv); + spin_unlock(&gref_lock); + + return 0; +} + +static long gntalloc_ioctl_alloc(struct gntalloc_file_private_data *priv, + struct ioctl_gntalloc_alloc_gref __user *arg) +{ + int rc = 0; + struct ioctl_gntalloc_alloc_gref op; + uint32_t *gref_ids; + + pr_debug("%s: priv %p\n", __func__, priv); + + if (copy_from_user(&op, arg, sizeof(op))) { + rc = -EFAULT; + goto out; + } + + gref_ids = kzalloc(sizeof(gref_ids[0]) * op.count, GFP_TEMPORARY); + if (!gref_ids) { + rc = -ENOMEM; + goto out; + } + + spin_lock(&gref_lock); + /* Clean up pages that were at zero (local) users but were still mapped + * by remote domains. Since those pages count towards the limit that we + * are about to enforce, removing them here is a good idea. 
+ */ + do_cleanup(); + if (gref_size + op.count > limit) { + spin_unlock(&gref_lock); + rc = -ENOSPC; + goto out_free; + } + gref_size += op.count; + op.index = priv->index; + priv->index += op.count * PAGE_SIZE; + spin_unlock(&gref_lock); + + rc = add_grefs(&op, gref_ids, priv); + if (rc < 0) + goto out_free; + + /* Once we finish add_grefs, it is unsafe to touch the new reference, + * since it is possible for a concurrent ioctl to remove it (by guessing + * its index). If the userspace application doesn't provide valid memory + * to write the IDs to, then it will need to close the file in order to + * release - which it will do by segfaulting when it tries to access the + * IDs to close them. + */ + if (copy_to_user(arg, &op, sizeof(op))) { + rc = -EFAULT; + goto out_free; + } + if (copy_to_user(arg->gref_ids, gref_ids, + sizeof(gref_ids[0]) * op.count)) { + rc = -EFAULT; + goto out_free; + } + +out_free: + kfree(gref_ids); +out: + return rc; +} + +static long gntalloc_ioctl_dealloc(struct gntalloc_file_private_data *priv, + void __user *arg) +{ + int i, rc = 0; + struct ioctl_gntalloc_dealloc_gref op; + struct gntalloc_gref *gref, *n; + + pr_debug("%s: priv %p\n", __func__, priv); + + if (copy_from_user(&op, arg, sizeof(op))) { + rc = -EFAULT; + goto dealloc_grant_out; + } + + spin_lock(&gref_lock); + gref = find_grefs(priv, op.index, op.count); + if (gref) { + /* Remove from the file list only, and decrease reference count. + * The later call to do_cleanup() will remove from gref_list and + * free the memory if the pages aren't mapped anywhere. 
+ */ + for (i = 0; i < op.count; i++) { + n = list_entry(gref->next_file.next, + struct gntalloc_gref, next_file); + list_del(&gref->next_file); + gref->users--; + gref = n; + } + } else { + rc = -EINVAL; + } + + do_cleanup(); + + spin_unlock(&gref_lock); +dealloc_grant_out: + return rc; +} + +static long gntalloc_ioctl_unmap_notify(struct gntalloc_file_private_data *priv, + void __user *arg) +{ + struct ioctl_gntalloc_unmap_notify op; + struct gntalloc_gref *gref; + uint64_t index; + int pgoff; + int rc; + + if (copy_from_user(&op, arg, sizeof(op))) + return -EFAULT; + + index = op.index & ~(PAGE_SIZE - 1); + pgoff = op.index & (PAGE_SIZE - 1); + + spin_lock(&gref_lock); + + gref = find_grefs(priv, index, 1); + if (!gref) { + rc = -ENOENT; + goto unlock_out; + } + + if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT)) { + rc = -EINVAL; + goto unlock_out; + } + + gref->notify.flags = op.action; + gref->notify.pgoff = pgoff; + gref->notify.event = op.event_channel_port; + rc = 0; + unlock_out: + spin_unlock(&gref_lock); + return rc; +} + +static long gntalloc_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct gntalloc_file_private_data *priv = filp->private_data; + + switch (cmd) { + case IOCTL_GNTALLOC_ALLOC_GREF: + return gntalloc_ioctl_alloc(priv, (void __user *)arg); + + case IOCTL_GNTALLOC_DEALLOC_GREF: + return gntalloc_ioctl_dealloc(priv, (void __user *)arg); + + case IOCTL_GNTALLOC_SET_UNMAP_NOTIFY: + return gntalloc_ioctl_unmap_notify(priv, (void __user *)arg); + + default: + return -ENOIOCTLCMD; + } + + return 0; +} + +static void gntalloc_vma_close(struct vm_area_struct *vma) +{ + struct gntalloc_gref *gref = vma->vm_private_data; + if (!gref) + return; + + spin_lock(&gref_lock); + gref->users--; + if (gref->users == 0) + __del_gref(gref); + spin_unlock(&gref_lock); +} + +static struct vm_operations_struct gntalloc_vmops = { + .close = gntalloc_vma_close, +}; + +static int gntalloc_mmap(struct file *filp, struct 
vm_area_struct *vma) +{ + struct gntalloc_file_private_data *priv = filp->private_data; + struct gntalloc_gref *gref; + int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + int rv, i; + + pr_debug("%s: priv %p, page %lu+%d\n", __func__, + priv, vma->vm_pgoff, count); + + if (!(vma->vm_flags & VM_SHARED)) { + printk(KERN_ERR "%s: Mapping must be shared.\n", __func__); + return -EINVAL; + } + + spin_lock(&gref_lock); + gref = find_grefs(priv, vma->vm_pgoff << PAGE_SHIFT, count); + if (gref == NULL) { + rv = -ENOENT; + pr_debug("%s: Could not find grant reference", + __func__); + goto out_unlock; + } + + vma->vm_private_data = gref; + + vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTCOPY; + vma->vm_flags |= VM_PFNMAP | VM_PFN_AT_MMAP; + + vma->vm_ops = &gntalloc_vmops; + + for (i = 0; i < count; i++) { + gref->users++; + rv = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE, + gref->page); + if (rv) + goto out_unlock; + + gref = list_entry(gref->next_file.next, + struct gntalloc_gref, next_file); + } + rv = 0; + +out_unlock: + spin_unlock(&gref_lock); + return rv; +} + +static const struct file_operations gntalloc_fops = { + .owner = THIS_MODULE, + .open = gntalloc_open, + .release = gntalloc_release, + .unlocked_ioctl = gntalloc_ioctl, + .mmap = gntalloc_mmap +}; + +/* + * ------------------------------------- + * Module creation/destruction. 
+ * ------------------------------------- + */ +static struct miscdevice gntalloc_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "xen/gntalloc", + .fops = &gntalloc_fops, +}; + +static int __init gntalloc_init(void) +{ + int err; + + if (!xen_domain()) + return -ENODEV; + + err = misc_register(&gntalloc_miscdev); + if (err != 0) { + printk(KERN_ERR "Could not register misc gntalloc device\n"); + return err; + } + + pr_debug("Created grant allocation device at %d,%d\n", + MISC_MAJOR, gntalloc_miscdev.minor); + + return 0; +} + +static void __exit gntalloc_exit(void) +{ + misc_deregister(&gntalloc_miscdev); +} + +module_init(gntalloc_init); +module_exit(gntalloc_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Carter Weatherly <carter.weatherly@jhuapl.edu>, " + "Daniel De Graaf <dgdegra@tycho.nsa.gov>"); +MODULE_DESCRIPTION("User-space grant reference allocator driver"); diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 1e31cdcdae1e..017ce600fbc6 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -32,10 +32,13 @@ #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/slab.h> +#include <linux/highmem.h> #include <xen/xen.h> #include <xen/grant_table.h> +#include <xen/balloon.h> #include <xen/gntdev.h> +#include <xen/events.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> #include <asm/xen/page.h> @@ -45,35 +48,46 @@ MODULE_AUTHOR("Derek G. 
Murray <Derek.Murray@cl.cam.ac.uk>, " "Gerd Hoffmann <kraxel@redhat.com>"); MODULE_DESCRIPTION("User-space granted page access driver"); -static int limit = 1024; +static int limit = 1024*1024; module_param(limit, int, 0644); -MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped at " - "once by a gntdev instance"); +MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by " + "the gntdev device"); + +static atomic_t pages_mapped = ATOMIC_INIT(0); + +static int use_ptemod; struct gntdev_priv { struct list_head maps; - uint32_t used; - uint32_t limit; /* lock protects maps from concurrent changes */ spinlock_t lock; struct mm_struct *mm; struct mmu_notifier mn; }; +struct unmap_notify { + int flags; + /* Address relative to the start of the grant_map */ + int addr; + int event; +}; + struct grant_map { struct list_head next; - struct gntdev_priv *priv; struct vm_area_struct *vma; int index; int count; int flags; - int is_mapped; + atomic_t users; + struct unmap_notify notify; struct ioctl_gntdev_grant_ref *grants; struct gnttab_map_grant_ref *map_ops; struct gnttab_unmap_grant_ref *unmap_ops; struct page **pages; }; +static int unmap_grant_pages(struct grant_map *map, int offset, int pages); + /* ------------------------------------------------------------------ */ static void gntdev_print_maps(struct gntdev_priv *priv, @@ -82,9 +96,7 @@ static void gntdev_print_maps(struct gntdev_priv *priv, #ifdef DEBUG struct grant_map *map; - pr_debug("maps list (priv %p, usage %d/%d)\n", - priv, priv->used, priv->limit); - + pr_debug("%s: maps list (priv %p)\n", __func__, priv); list_for_each_entry(map, &priv->maps, next) pr_debug(" index %2d, count %2d %s\n", map->index, map->count, @@ -111,27 +123,21 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) NULL == add->pages) goto err; + if (alloc_xenballooned_pages(count, add->pages)) + goto err; + for (i = 0; i < count; i++) { - add->pages[i] = alloc_page(GFP_KERNEL | 
__GFP_HIGHMEM); - if (add->pages[i] == NULL) - goto err; + add->map_ops[i].handle = -1; + add->unmap_ops[i].handle = -1; } add->index = 0; add->count = count; - add->priv = priv; - - if (add->count + priv->used > priv->limit) - goto err; + atomic_set(&add->users, 1); return add; err: - if (add->pages) - for (i = 0; i < count; i++) { - if (add->pages[i]) - __free_page(add->pages[i]); - } kfree(add->pages); kfree(add->grants); kfree(add->map_ops); @@ -154,7 +160,6 @@ static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add) list_add_tail(&add->next, &priv->maps); done: - priv->used += add->count; gntdev_print_maps(priv, "[new]", add->index); } @@ -166,57 +171,33 @@ static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, list_for_each_entry(map, &priv->maps, next) { if (map->index != index) continue; - if (map->count != count) + if (count && map->count != count) continue; return map; } return NULL; } -static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv, - unsigned long vaddr) +static void gntdev_put_map(struct grant_map *map) { - struct grant_map *map; - - list_for_each_entry(map, &priv->maps, next) { - if (!map->vma) - continue; - if (vaddr < map->vma->vm_start) - continue; - if (vaddr >= map->vma->vm_end) - continue; - return map; - } - return NULL; -} - -static int gntdev_del_map(struct grant_map *map) -{ - int i; + if (!map) + return; - if (map->vma) - return -EBUSY; - for (i = 0; i < map->count; i++) - if (map->unmap_ops[i].handle) - return -EBUSY; + if (!atomic_dec_and_test(&map->users)) + return; - map->priv->used -= map->count; - list_del(&map->next); - return 0; -} + atomic_sub(map->count, &pages_mapped); -static void gntdev_free_map(struct grant_map *map) -{ - int i; + if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) { + notify_remote_via_evtchn(map->notify.event); + } - if (!map) - return; + if (map->pages) { + if (!use_ptemod) + unmap_grant_pages(map, 0, map->count); - if (map->pages) - for (i = 0; i < 
map->count; i++) { - if (map->pages[i]) - __free_page(map->pages[i]); - } + free_xenballooned_pages(map->count, map->pages); + } kfree(map->pages); kfree(map->grants); kfree(map->map_ops); @@ -231,18 +212,17 @@ static int find_grant_ptes(pte_t *pte, pgtable_t token, { struct grant_map *map = data; unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT; + int flags = map->flags | GNTMAP_application_map | GNTMAP_contains_pte; u64 pte_maddr; BUG_ON(pgnr >= map->count); pte_maddr = arbitrary_virt_to_machine(pte).maddr; - gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, - GNTMAP_contains_pte | map->flags, + gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, flags, map->grants[pgnr].ref, map->grants[pgnr].domid); - gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, - GNTMAP_contains_pte | map->flags, - 0 /* handle */); + gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, flags, + -1 /* handle */); return 0; } @@ -250,6 +230,21 @@ static int map_grant_pages(struct grant_map *map) { int i, err = 0; + if (!use_ptemod) { + /* Note: it could already be mapped */ + if (map->map_ops[0].handle != -1) + return 0; + for (i = 0; i < map->count; i++) { + unsigned long addr = (unsigned long) + pfn_to_kaddr(page_to_pfn(map->pages[i])); + gnttab_set_map_op(&map->map_ops[i], addr, map->flags, + map->grants[i].ref, + map->grants[i].domid); + gnttab_set_unmap_op(&map->unmap_ops[i], addr, + map->flags, -1 /* handle */); + } + } + pr_debug("map %d+%d\n", map->index, map->count); err = gnttab_map_refs(map->map_ops, map->pages, map->count); if (err) @@ -258,28 +253,81 @@ static int map_grant_pages(struct grant_map *map) for (i = 0; i < map->count; i++) { if (map->map_ops[i].status) err = -EINVAL; - map->unmap_ops[i].handle = map->map_ops[i].handle; + else { + BUG_ON(map->map_ops[i].handle == -1); + map->unmap_ops[i].handle = map->map_ops[i].handle; + pr_debug("map handle=%d\n", map->map_ops[i].handle); + } } return err; } -static int unmap_grant_pages(struct grant_map *map, int 
offset, int pages) +static int __unmap_grant_pages(struct grant_map *map, int offset, int pages) { int i, err = 0; - pr_debug("map %d+%d [%d+%d]\n", map->index, map->count, offset, pages); - err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages, pages); + if (map->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { + int pgno = (map->notify.addr >> PAGE_SHIFT); + if (pgno >= offset && pgno < offset + pages && use_ptemod) { + void __user *tmp = (void __user *) + map->vma->vm_start + map->notify.addr; + err = copy_to_user(tmp, &err, 1); + if (err) + return err; + map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE; + } else if (pgno >= offset && pgno < offset + pages) { + uint8_t *tmp = kmap(map->pages[pgno]); + tmp[map->notify.addr & (PAGE_SIZE-1)] = 0; + kunmap(map->pages[pgno]); + map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE; + } + } + + err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages + offset, pages); if (err) return err; for (i = 0; i < pages; i++) { if (map->unmap_ops[offset+i].status) err = -EINVAL; - map->unmap_ops[offset+i].handle = 0; + pr_debug("unmap handle=%d st=%d\n", + map->unmap_ops[offset+i].handle, + map->unmap_ops[offset+i].status); + map->unmap_ops[offset+i].handle = -1; } return err; } +static int unmap_grant_pages(struct grant_map *map, int offset, int pages) +{ + int range, err = 0; + + pr_debug("unmap %d+%d [%d+%d]\n", map->index, map->count, offset, pages); + + /* It is possible the requested range will have a "hole" where we + * already unmapped some of the grants. Only unmap valid ranges. 
+ */ + while (pages && !err) { + while (pages && map->unmap_ops[offset].handle == -1) { + offset++; + pages--; + } + range = 0; + while (range < pages) { + if (map->unmap_ops[offset+range].handle == -1) { + range--; + break; + } + range++; + } + err = __unmap_grant_pages(map, offset, range); + offset += range; + pages -= range; + } + + return err; +} + /* ------------------------------------------------------------------ */ static void gntdev_vma_close(struct vm_area_struct *vma) @@ -287,22 +335,13 @@ static void gntdev_vma_close(struct vm_area_struct *vma) struct grant_map *map = vma->vm_private_data; pr_debug("close %p\n", vma); - map->is_mapped = 0; map->vma = NULL; vma->vm_private_data = NULL; -} - -static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - pr_debug("vaddr %p, pgoff %ld (shouldn't happen)\n", - vmf->virtual_address, vmf->pgoff); - vmf->flags = VM_FAULT_ERROR; - return 0; + gntdev_put_map(map); } static struct vm_operations_struct gntdev_vmops = { .close = gntdev_vma_close, - .fault = gntdev_vma_fault, }; /* ------------------------------------------------------------------ */ @@ -320,8 +359,6 @@ static void mn_invl_range_start(struct mmu_notifier *mn, list_for_each_entry(map, &priv->maps, next) { if (!map->vma) continue; - if (!map->is_mapped) - continue; if (map->vma->vm_start >= end) continue; if (map->vma->vm_end <= start) @@ -386,16 +423,17 @@ static int gntdev_open(struct inode *inode, struct file *flip) INIT_LIST_HEAD(&priv->maps); spin_lock_init(&priv->lock); - priv->limit = limit; - priv->mm = get_task_mm(current); - if (!priv->mm) { - kfree(priv); - return -ENOMEM; + if (use_ptemod) { + priv->mm = get_task_mm(current); + if (!priv->mm) { + kfree(priv); + return -ENOMEM; + } + priv->mn.ops = &gntdev_mmu_ops; + ret = mmu_notifier_register(&priv->mn, priv->mm); + mmput(priv->mm); } - priv->mn.ops = &gntdev_mmu_ops; - ret = mmu_notifier_register(&priv->mn, priv->mm); - mmput(priv->mm); if (ret) { kfree(priv); @@ 
-412,21 +450,19 @@ static int gntdev_release(struct inode *inode, struct file *flip) { struct gntdev_priv *priv = flip->private_data; struct grant_map *map; - int err; pr_debug("priv %p\n", priv); spin_lock(&priv->lock); while (!list_empty(&priv->maps)) { map = list_entry(priv->maps.next, struct grant_map, next); - err = gntdev_del_map(map); - if (WARN_ON(err)) - gntdev_free_map(map); - + list_del(&map->next); + gntdev_put_map(map); } spin_unlock(&priv->lock); - mmu_notifier_unregister(&priv->mn, priv->mm); + if (use_ptemod) + mmu_notifier_unregister(&priv->mn, priv->mm); kfree(priv); return 0; } @@ -443,16 +479,21 @@ static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv, pr_debug("priv %p, add %d\n", priv, op.count); if (unlikely(op.count <= 0)) return -EINVAL; - if (unlikely(op.count > priv->limit)) - return -EINVAL; err = -ENOMEM; map = gntdev_alloc_map(priv, op.count); if (!map) return err; + + if (unlikely(atomic_add_return(op.count, &pages_mapped) > limit)) { + pr_debug("can't map: over limit\n"); + gntdev_put_map(map); + return err; + } + if (copy_from_user(map->grants, &u->refs, sizeof(map->grants[0]) * op.count) != 0) { - gntdev_free_map(map); + gntdev_put_map(map); return err; } @@ -461,13 +502,9 @@ static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv, op.index = map->index << PAGE_SHIFT; spin_unlock(&priv->lock); - if (copy_to_user(u, &op, sizeof(op)) != 0) { - spin_lock(&priv->lock); - gntdev_del_map(map); - spin_unlock(&priv->lock); - gntdev_free_map(map); - return err; - } + if (copy_to_user(u, &op, sizeof(op)) != 0) + return -EFAULT; + return 0; } @@ -484,11 +521,12 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, spin_lock(&priv->lock); map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); - if (map) - err = gntdev_del_map(map); + if (map) { + list_del(&map->next); + gntdev_put_map(map); + err = 0; + } spin_unlock(&priv->lock); - if (!err) - gntdev_free_map(map); return err; } @@ -496,43 
+534,66 @@ static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv, struct ioctl_gntdev_get_offset_for_vaddr __user *u) { struct ioctl_gntdev_get_offset_for_vaddr op; + struct vm_area_struct *vma; struct grant_map *map; if (copy_from_user(&op, u, sizeof(op)) != 0) return -EFAULT; pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr); - spin_lock(&priv->lock); - map = gntdev_find_map_vaddr(priv, op.vaddr); - if (map == NULL || - map->vma->vm_start != op.vaddr) { - spin_unlock(&priv->lock); + vma = find_vma(current->mm, op.vaddr); + if (!vma || vma->vm_ops != &gntdev_vmops) return -EINVAL; - } + + map = vma->vm_private_data; + if (!map) + return -EINVAL; + op.offset = map->index << PAGE_SHIFT; op.count = map->count; - spin_unlock(&priv->lock); if (copy_to_user(u, &op, sizeof(op)) != 0) return -EFAULT; return 0; } -static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv, - struct ioctl_gntdev_set_max_grants __user *u) +static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u) { - struct ioctl_gntdev_set_max_grants op; + struct ioctl_gntdev_unmap_notify op; + struct grant_map *map; + int rc; - if (copy_from_user(&op, u, sizeof(op)) != 0) + if (copy_from_user(&op, u, sizeof(op))) return -EFAULT; - pr_debug("priv %p, limit %d\n", priv, op.count); - if (op.count > limit) - return -E2BIG; + + if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT)) + return -EINVAL; spin_lock(&priv->lock); - priv->limit = op.count; + + list_for_each_entry(map, &priv->maps, next) { + uint64_t begin = map->index << PAGE_SHIFT; + uint64_t end = (map->index + map->count) << PAGE_SHIFT; + if (op.index >= begin && op.index < end) + goto found; + } + rc = -ENOENT; + goto unlock_out; + + found: + if ((op.action & UNMAP_NOTIFY_CLEAR_BYTE) && + (map->flags & GNTMAP_readonly)) { + rc = -EINVAL; + goto unlock_out; + } + + map->notify.flags = op.action; + map->notify.addr = op.index - (map->index << PAGE_SHIFT); + map->notify.event 
= op.event_channel_port; + rc = 0; + unlock_out: spin_unlock(&priv->lock); - return 0; + return rc; } static long gntdev_ioctl(struct file *flip, @@ -551,8 +612,8 @@ static long gntdev_ioctl(struct file *flip, case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR: return gntdev_ioctl_get_offset_for_vaddr(priv, ptr); - case IOCTL_GNTDEV_SET_MAX_GRANTS: - return gntdev_ioctl_set_max_grants(priv, ptr); + case IOCTL_GNTDEV_SET_UNMAP_NOTIFY: + return gntdev_ioctl_notify(priv, ptr); default: pr_debug("priv %p, unknown cmd %x\n", priv, cmd); @@ -568,7 +629,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) int index = vma->vm_pgoff; int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; struct grant_map *map; - int err = -EINVAL; + int i, err = -EINVAL; if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) return -EINVAL; @@ -580,47 +641,70 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) map = gntdev_find_map_index(priv, index, count); if (!map) goto unlock_out; - if (map->vma) + if (use_ptemod && map->vma) goto unlock_out; - if (priv->mm != vma->vm_mm) { + if (use_ptemod && priv->mm != vma->vm_mm) { printk(KERN_WARNING "Huh? 
Other mm?\n"); goto unlock_out; } + atomic_inc(&map->users); + vma->vm_ops = &gntdev_vmops; vma->vm_flags |= VM_RESERVED|VM_DONTCOPY|VM_DONTEXPAND|VM_PFNMAP; vma->vm_private_data = map; - map->vma = vma; - map->flags = GNTMAP_host_map | GNTMAP_application_map; - if (!(vma->vm_flags & VM_WRITE)) - map->flags |= GNTMAP_readonly; + if (use_ptemod) + map->vma = vma; + + if (map->flags) { + if ((vma->vm_flags & VM_WRITE) && + (map->flags & GNTMAP_readonly)) + return -EINVAL; + } else { + map->flags = GNTMAP_host_map; + if (!(vma->vm_flags & VM_WRITE)) + map->flags |= GNTMAP_readonly; + } spin_unlock(&priv->lock); - err = apply_to_page_range(vma->vm_mm, vma->vm_start, - vma->vm_end - vma->vm_start, - find_grant_ptes, map); - if (err) { - printk(KERN_WARNING "find_grant_ptes() failure.\n"); - return err; + if (use_ptemod) { + err = apply_to_page_range(vma->vm_mm, vma->vm_start, + vma->vm_end - vma->vm_start, + find_grant_ptes, map); + if (err) { + printk(KERN_WARNING "find_grant_ptes() failure.\n"); + goto out_put_map; + } } err = map_grant_pages(map); - if (err) { - printk(KERN_WARNING "map_grant_pages() failure.\n"); - return err; - } + if (err) + goto out_put_map; - map->is_mapped = 1; + if (!use_ptemod) { + for (i = 0; i < count; i++) { + err = vm_insert_page(vma, vma->vm_start + i*PAGE_SIZE, + map->pages[i]); + if (err) + goto out_put_map; + } + } return 0; unlock_out: spin_unlock(&priv->lock); return err; + +out_put_map: + if (use_ptemod) + map->vma = NULL; + gntdev_put_map(map); + return err; } static const struct file_operations gntdev_fops = { @@ -646,6 +730,8 @@ static int __init gntdev_init(void) if (!xen_domain()) return -ENODEV; + use_ptemod = xen_pv_domain(); + err = misc_register(&gntdev_miscdev); if (err != 0) { printk(KERN_ERR "Could not register gntdev device\n"); diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 9ef54ebc1194..3745a318defc 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -458,7 +458,14 @@ 
int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, if (ret) return ret; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return ret; + for (i = 0; i < count; i++) { + /* Do not add to override if the map failed. */ + if (map_ops[i].status) + continue; + /* m2p override only supported for GNTMAP_contains_pte mappings */ if (!(map_ops[i].flags & GNTMAP_contains_pte)) continue; @@ -483,6 +490,9 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, if (ret) return ret; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return ret; + for (i = 0; i < count; i++) { ret = m2p_remove_override(pages[i]); if (ret) diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c new file mode 100644 index 000000000000..a4ff225ee868 --- /dev/null +++ b/drivers/xen/xen-balloon.c @@ -0,0 +1,256 @@ +/****************************************************************************** + * Xen balloon driver - enables returning/claiming memory to/from Xen. + * + * Copyright (c) 2003, B Dragovic + * Copyright (c) 2003-2004, M Williamson, K Fraser + * Copyright (c) 2005 Dan M. 
Smith, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sysdev.h> +#include <linux/capability.h> + +#include <xen/xen.h> +#include <xen/interface/xen.h> +#include <xen/balloon.h> +#include <xen/xenbus.h> +#include <xen/features.h> +#include <xen/page.h> + +#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) + +#define BALLOON_CLASS_NAME "xen_memory" + +static struct sys_device balloon_sysdev; + +static int register_balloon(struct sys_device *sysdev); + +static struct xenbus_watch target_watch = +{ + .node = "memory/target" +}; + +/* React to a change in the target key */ +static void watch_target(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + unsigned long long new_target; + int err; + + err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target); + if (err != 1) { + /* This is ok (for domain0 at least) - so just return */ + return; + } + + /* The given memory/target value is in KiB, so it needs converting to + * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. + */ + balloon_set_new_target(new_target >> (PAGE_SHIFT - 10)); +} + +static int balloon_init_watcher(struct notifier_block *notifier, + unsigned long event, + void *data) +{ + int err; + + err = register_xenbus_watch(&target_watch); + if (err) + printk(KERN_ERR "Failed to set balloon watcher\n"); + + return NOTIFY_DONE; +} + +static struct notifier_block xenstore_notifier; + +static int __init balloon_init(void) +{ + if (!xen_domain()) + return -ENODEV; + + pr_info("xen-balloon: Initialising balloon driver.\n"); + + register_balloon(&balloon_sysdev); + + target_watch.callback = watch_target; + xenstore_notifier.notifier_call = balloon_init_watcher; + + register_xenstore_notifier(&xenstore_notifier); + + return 0; +} +subsys_initcall(balloon_init); + +static void balloon_exit(void) +{ + /* XXX - release balloon here */ + return; +} + +module_exit(balloon_exit); + +#define BALLOON_SHOW(name, format, args...) 
\ + static ssize_t show_##name(struct sys_device *dev, \ + struct sysdev_attribute *attr, \ + char *buf) \ + { \ + return sprintf(buf, format, ##args); \ + } \ + static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL) + +BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages)); +BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low)); +BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high)); + +static SYSDEV_ULONG_ATTR(schedule_delay, 0444, balloon_stats.schedule_delay); +static SYSDEV_ULONG_ATTR(max_schedule_delay, 0644, balloon_stats.max_schedule_delay); +static SYSDEV_ULONG_ATTR(retry_count, 0444, balloon_stats.retry_count); +static SYSDEV_ULONG_ATTR(max_retry_count, 0644, balloon_stats.max_retry_count); + +static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages)); +} + +static ssize_t store_target_kb(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + char *endchar; + unsigned long long target_bytes; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + target_bytes = simple_strtoull(buf, &endchar, 0) * 1024; + + balloon_set_new_target(target_bytes >> PAGE_SHIFT); + + return count; +} + +static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR, + show_target_kb, store_target_kb); + + +static ssize_t show_target(struct sys_device *dev, struct sysdev_attribute *attr, + char *buf) +{ + return sprintf(buf, "%llu\n", + (unsigned long long)balloon_stats.target_pages + << PAGE_SHIFT); +} + +static ssize_t store_target(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + char *endchar; + unsigned long long target_bytes; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + target_bytes = memparse(buf, &endchar); + + balloon_set_new_target(target_bytes >> PAGE_SHIFT); + + return count; +} + +static SYSDEV_ATTR(target, S_IRUGO | S_IWUSR, + 
show_target, store_target); + + +static struct sysdev_attribute *balloon_attrs[] = { + &attr_target_kb, + &attr_target, + &attr_schedule_delay.attr, + &attr_max_schedule_delay.attr, + &attr_retry_count.attr, + &attr_max_retry_count.attr +}; + +static struct attribute *balloon_info_attrs[] = { + &attr_current_kb.attr, + &attr_low_kb.attr, + &attr_high_kb.attr, + NULL +}; + +static struct attribute_group balloon_info_group = { + .name = "info", + .attrs = balloon_info_attrs +}; + +static struct sysdev_class balloon_sysdev_class = { + .name = BALLOON_CLASS_NAME +}; + +static int register_balloon(struct sys_device *sysdev) +{ + int i, error; + + error = sysdev_class_register(&balloon_sysdev_class); + if (error) + return error; + + sysdev->id = 0; + sysdev->cls = &balloon_sysdev_class; + + error = sysdev_register(sysdev); + if (error) { + sysdev_class_unregister(&balloon_sysdev_class); + return error; + } + + for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) { + error = sysdev_create_file(sysdev, balloon_attrs[i]); + if (error) + goto fail; + } + + error = sysfs_create_group(&sysdev->kobj, &balloon_info_group); + if (error) + goto fail; + + return 0; + + fail: + while (--i >= 0) + sysdev_remove_file(sysdev, balloon_attrs[i]); + sysdev_unregister(sysdev); + sysdev_class_unregister(&balloon_sysdev_class); + return error; +} + +MODULE_LICENSE("GPL"); |