From 693394b8c3dcee1a3baa52e30fdc3323d88cd579 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 29 Sep 2011 11:57:55 +0100 Subject: xen: add an "highmem" parameter to alloc_xenballooned_pages Add an highmem parameter to alloc_xenballooned_pages, to allow callers to request lowmem or highmem pages. Fix the code style of free_xenballooned_pages' prototype. Signed-off-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/balloon.c | 12 ++++++++---- drivers/xen/gntdev.c | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 5dfd8f8ff07..cd8b4704ef4 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -501,20 +501,24 @@ EXPORT_SYMBOL_GPL(balloon_set_new_target); * alloc_xenballooned_pages - get pages that have been ballooned out * @nr_pages: Number of pages to get * @pages: pages returned + * @highmem: highmem or lowmem pages * @return 0 on success, error otherwise */ -int alloc_xenballooned_pages(int nr_pages, struct page** pages) +int alloc_xenballooned_pages(int nr_pages, struct page **pages, bool highmem) { int pgno = 0; struct page* page; mutex_lock(&balloon_mutex); while (pgno < nr_pages) { - page = balloon_retrieve(true); - if (page) { + page = balloon_retrieve(highmem); + if (page && PageHighMem(page) == highmem) { pages[pgno++] = page; } else { enum bp_state st; - st = decrease_reservation(nr_pages - pgno, GFP_HIGHUSER); + if (page) + balloon_append(page); + st = decrease_reservation(nr_pages - pgno, + highmem ? GFP_HIGHUSER : GFP_USER); if (st != BP_DONE) goto out_undo; } diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index f914b26cf0c..7b9b1d1b75a 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -123,7 +123,7 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) NULL == add->pages) goto err; - if (alloc_xenballooned_pages(count, add->pages)) + if (alloc_xenballooned_pages(count, add->pages, false /* lowmem */)) goto err; for (i = 0; i < count; i++) { -- cgit v1.2.3 From 0930bba674e248b921ea659b036ff02564e5a5f4 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Thu, 29 Sep 2011 11:57:56 +0100 Subject: xen: modify kernel mappings corresponding to granted pages If we want to use granted pages for AIO, changing the mappings of a user vma and the corresponding p2m is not enough, we also need to update the kernel mappings accordingly. Currently this is only needed for pages that are created for user usages through /dev/xen/gntdev. As in, pages that have been in use by the kernel and use the P2M will not need this special mapping. However there are no guarantees that in the future the kernel won't start accessing pages through the 1:1 even for internal usage. In order to avoid the complexity of dealing with highmem, we allocated the pages lowmem. We issue a HYPERVISOR_grant_table_op right away in m2p_add_override and we remove the mappings using another HYPERVISOR_grant_table_op in m2p_remove_override. Considering that m2p_add_override and m2p_remove_override are called once per page we use multicalls and hypercall batching. Use the kmap_op pointer directly as argument to do the mapping as it is guaranteed to be present up until the unmapping is done. Before issuing any unmapping multicalls, we need to make sure that the mapping has already being done, because we need the kmap->handle to be set correctly. Signed-off-by: Stefano Stabellini [v1: Removed GRANT_FRAME_BIT usage] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 2 +- drivers/xen/gntdev.c | 32 +++++++++++++++++++++++++++++++- drivers/xen/grant-table.c | 6 +++--- 3 files changed, 35 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 2330a9ad5e9..1540792b1e5 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -396,7 +396,7 @@ static int xen_blkbk_map(struct blkif_request *req, continue; ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr), - blkbk->pending_page(pending_req, i), false); + blkbk->pending_page(pending_req, i), NULL); if (ret) { pr_alert(DRV_PFX "Failed to install M2P override for %lx (ret: %d)\n", (unsigned long)map[i].dev_bus_addr, ret); diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 7b9b1d1b75a..3e3603f3524 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -83,6 +83,7 @@ struct grant_map { struct ioctl_gntdev_grant_ref *grants; struct gnttab_map_grant_ref *map_ops; struct gnttab_unmap_grant_ref *unmap_ops; + struct gnttab_map_grant_ref *kmap_ops; struct page **pages; }; @@ -116,10 +117,12 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL); add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL); add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL); + add->kmap_ops = kzalloc(sizeof(add->kmap_ops[0]) * count, GFP_KERNEL); add->pages = kzalloc(sizeof(add->pages[0]) * count, GFP_KERNEL); if (NULL == add->grants || NULL == add->map_ops || NULL == add->unmap_ops || + NULL == add->kmap_ops || NULL == add->pages) goto err; @@ -129,6 +132,7 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) for (i = 0; i < count; i++) { add->map_ops[i].handle = -1; add->unmap_ops[i].handle = -1; + add->kmap_ops[i].handle = -1; } add->index = 0; @@ -142,6 +146,7 @@ err: kfree(add->grants); kfree(add->map_ops); kfree(add->unmap_ops); + kfree(add->kmap_ops); kfree(add); return NULL; } @@ -243,10 +248,35 @@ static int map_grant_pages(struct grant_map *map) gnttab_set_unmap_op(&map->unmap_ops[i], addr, map->flags, -1 /* handle */); } + } else { + /* + * Setup the map_ops corresponding to the pte entries pointing + * to the kernel linear addresses of the struct pages. + * These ptes are completely different from the user ptes dealt + * with find_grant_ptes. + */ + for (i = 0; i < map->count; i++) { + unsigned level; + unsigned long address = (unsigned long) + pfn_to_kaddr(page_to_pfn(map->pages[i])); + pte_t *ptep; + u64 pte_maddr = 0; + BUG_ON(PageHighMem(map->pages[i])); + + ptep = lookup_address(address, &level); + pte_maddr = arbitrary_virt_to_machine(ptep).maddr; + gnttab_set_map_op(&map->kmap_ops[i], pte_maddr, + map->flags | + GNTMAP_host_map | + GNTMAP_contains_pte, + map->grants[i].ref, + map->grants[i].domid); + } } pr_debug("map %d+%d\n", map->index, map->count); - err = gnttab_map_refs(map->map_ops, map->pages, map->count); + err = gnttab_map_refs(map->map_ops, use_ptemod ? map->kmap_ops : NULL, + map->pages, map->count); if (err) return err; diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 4f44b347b24..8c71ab80175 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -448,7 +448,8 @@ unsigned int gnttab_max_grant_frames(void) EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, - struct page **pages, unsigned int count) + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count) { int i, ret; pte_t *pte; @@ -488,8 +489,7 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, */ return -EOPNOTSUPP; } - ret = m2p_add_override(mfn, pages[i], - map_ops[i].flags & GNTMAP_contains_pte); + ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]); if (ret) return ret; } -- cgit v1.2.3 From 1f1503ba096d3a394d1454dac77467092ca996e6 Mon Sep 17 00:00:00 2001 From: Daniel De Graaf Date: Tue, 11 Oct 2011 15:16:06 -0400 Subject: xen/gntdev: Fix sleep-inside-spinlock BUG: sleeping function called from invalid context at /local/scratch/dariof/linux/kernel/mutex.c:271 in_atomic(): 1, irqs_disabled(): 0, pid: 3256, name: qemu-dm 1 lock held by qemu-dm/3256: #0: (&(&priv->lock)->rlock){......}, at: [] gntdev_ioctl+0x2bd/0x4d5 Pid: 3256, comm: qemu-dm Tainted: G W 3.1.0-rc8+ #5 Call Trace: [] __might_sleep+0x131/0x135 [] mutex_lock_nested+0x25/0x45 [] free_xenballooned_pages+0x20/0xb1 [] gntdev_put_map+0xa8/0xdb [] ? _raw_spin_lock+0x71/0x7a [] ? gntdev_ioctl+0x2bd/0x4d5 [] gntdev_ioctl+0x31f/0x4d5 [] ? check_events+0x12/0x20 [] do_vfs_ioctl+0x488/0x4d7 [] ? xen_restore_fl_direct_reloc+0x4/0x4 [] ? lock_release+0x21c/0x229 [] ? rcu_read_unlock+0x21/0x32 [] sys_ioctl+0x47/0x6a [] system_call_fastpath+0x16/0x1b gntdev_put_map tries to acquire a mutex when freeing pages back to the xenballoon pool, so it cannot be called with a spinlock held. In gntdev_release, the spinlock is not needed as we are freeing the structure later; in the ioctl, only the list manipulation needs to be under the lock. Reported-and-Tested-By: Dario Faggioli Signed-off-by: Daniel De Graaf Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/gntdev.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 3e3603f3524..880798aae2f 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -492,13 +492,11 @@ static int gntdev_release(struct inode *inode, struct file *flip) pr_debug("priv %p\n", priv); - spin_lock(&priv->lock); while (!list_empty(&priv->maps)) { map = list_entry(priv->maps.next, struct grant_map, next); list_del(&map->next); gntdev_put_map(map); } - spin_unlock(&priv->lock); if (use_ptemod) mmu_notifier_unregister(&priv->mn, priv->mm); @@ -562,10 +560,11 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); if (map) { list_del(&map->next); - gntdev_put_map(map); err = 0; } spin_unlock(&priv->lock); + if (map) + gntdev_put_map(map); return err; } -- cgit v1.2.3 From 38a1ed4f039db32b418007ac365076cf53647ebd Mon Sep 17 00:00:00 2001 From: Dan Magenheimer Date: Tue, 27 Sep 2011 08:47:58 -0600 Subject: xen: Fix selfballooning and ensure it doesn't go too far The balloon driver's "current_pages" is very different from totalram_pages. Self-ballooning needs to be driven by the latter. Also, Committed_AS doesn't account for pages used by the kernel so: 1) Add totalreserve_pages to Committed_AS for the normal target. 2) Enforce a floor for when there are little or no user-space threads using memory (e.g. single-user mode) to avoid OOMs. The floor function includes a "min_usable_mb" tuneable in case we discover later that the floor function is still too aggressive in some workloads, though likely it will not be needed. Changes since version 4: - change floor calculation so that it is not as aggressive; this version uses a piecewise linear function similar to minimum_target in the 2.6.18 balloon driver, but modified to add to totalreserve_pages instead of subtract from max_pfn, the 2.6.18 version causes OOMs on recent kernels because the kernel has expanded over time - change safety_margin to min_usable_mb and comment on its use - since committed_as does NOT include kernel space (and other reserved pages), totalreserve_pages is now added to committed_as. The result is less aggressive self-ballooning, but theoretically more appropriate. Changes since version 3: - missing include causes compile problem when CONFIG_FRONTSWAP is disabled - add comments after includes Changes since version 2: - missing include causes compile problem only on 32-bit Changes since version 1: - tuneable safety margin added [v5: avi.miller@oracle.com: still too aggressive, seeing some OOMs] [v4: konrad.wilk@oracle.com: fix compile when CONFIG_FRONTSWAP is disabled] [v3: guru.anbalagane@oracle.com: fix 32-bit compile] [v2: konrad.wilk@oracle.com: make safety margin tuneable] Signed-off-by: Dan Magenheimer [v1: Altered description and added an extra include] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/xen-selfballoon.c | 67 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c index 1b4afd81f87..ff3f2e423af 100644 --- a/drivers/xen/xen-selfballoon.c +++ b/drivers/xen/xen-selfballoon.c @@ -68,6 +68,8 @@ */ #include +#include +#include #include #include #include @@ -92,6 +94,15 @@ static unsigned int selfballoon_uphysteresis __read_mostly = 1; /* In HZ, controls frequency of worker invocation. */ static unsigned int selfballoon_interval __read_mostly = 5; +/* + * Minimum usable RAM in MB for selfballooning target for balloon. + * If non-zero, it is added to totalreserve_pages and self-ballooning + * will not balloon below the sum. If zero, a piecewise linear function + * is calculated as a minimum and added to totalreserve_pages. Note that + * setting this value indiscriminately may cause OOMs and crashes. + */ +static unsigned int selfballoon_min_usable_mb; + static void selfballoon_process(struct work_struct *work); static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process); @@ -188,20 +199,23 @@ static int __init xen_selfballooning_setup(char *s) __setup("selfballooning", xen_selfballooning_setup); #endif /* CONFIG_FRONTSWAP */ +#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) + /* * Use current balloon size, the goal (vm_committed_as), and hysteresis * parameters to set a new target balloon size */ static void selfballoon_process(struct work_struct *work) { - unsigned long cur_pages, goal_pages, tgt_pages; + unsigned long cur_pages, goal_pages, tgt_pages, floor_pages; + unsigned long useful_pages; bool reset_timer = false; if (xen_selfballooning_enabled) { - cur_pages = balloon_stats.current_pages; + cur_pages = totalram_pages; tgt_pages = cur_pages; /* default is no change */ goal_pages = percpu_counter_read_positive(&vm_committed_as) + - balloon_stats.current_pages - totalram_pages; + totalreserve_pages; #ifdef CONFIG_FRONTSWAP /* allow space for frontswap pages to be repatriated */ if (frontswap_selfshrinking && frontswap_enabled) @@ -216,7 +230,26 @@ static void selfballoon_process(struct work_struct *work) ((goal_pages - cur_pages) / selfballoon_uphysteresis); /* else if cur_pages == goal_pages, no change */ - balloon_set_new_target(tgt_pages); + useful_pages = max_pfn - totalreserve_pages; + if (selfballoon_min_usable_mb != 0) + floor_pages = totalreserve_pages + + MB2PAGES(selfballoon_min_usable_mb); + /* piecewise linear function ending in ~3% slope */ + else if (useful_pages < MB2PAGES(16)) + floor_pages = max_pfn; /* not worth ballooning */ + else if (useful_pages < MB2PAGES(64)) + floor_pages = totalreserve_pages + MB2PAGES(16) + + ((useful_pages - MB2PAGES(16)) >> 1); + else if (useful_pages < MB2PAGES(512)) + floor_pages = totalreserve_pages + MB2PAGES(40) + + ((useful_pages - MB2PAGES(40)) >> 3); + else /* useful_pages >= MB2PAGES(512) */ + floor_pages = totalreserve_pages + MB2PAGES(99) + + ((useful_pages - MB2PAGES(99)) >> 5); + if (tgt_pages < floor_pages) + tgt_pages = floor_pages; + balloon_set_new_target(tgt_pages + + balloon_stats.current_pages - totalram_pages); reset_timer = true; } #ifdef CONFIG_FRONTSWAP @@ -339,6 +372,31 @@ static ssize_t store_selfballoon_uphys(struct sys_device *dev, static SYSDEV_ATTR(selfballoon_uphysteresis, S_IRUGO | S_IWUSR, show_selfballoon_uphys, store_selfballoon_uphys); +SELFBALLOON_SHOW(selfballoon_min_usable_mb, "%d\n", + selfballoon_min_usable_mb); + +static ssize_t store_selfballoon_min_usable_mb(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = strict_strtoul(buf, 10, &val); + if (err || val == 0) + return -EINVAL; + selfballoon_min_usable_mb = val; + return count; +} + +static SYSDEV_ATTR(selfballoon_min_usable_mb, S_IRUGO | S_IWUSR, + show_selfballoon_min_usable_mb, + store_selfballoon_min_usable_mb); + + #ifdef CONFIG_FRONTSWAP SELFBALLOON_SHOW(frontswap_selfshrinking, "%d\n", frontswap_selfshrinking); @@ -420,6 +478,7 @@ static struct attribute *selfballoon_attrs[] = { &attr_selfballoon_interval.attr, &attr_selfballoon_downhysteresis.attr, &attr_selfballoon_uphysteresis.attr, + &attr_selfballoon_min_usable_mb.attr, #ifdef CONFIG_FRONTSWAP &attr_frontswap_selfshrinking.attr, &attr_frontswap_hysteresis.attr, -- cgit v1.2.3 From 9d093e2958baf76154d1008339f594f798a52790 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 29 Sep 2011 13:31:21 -0400 Subject: xen/events: BUG() when we can't allocate our event->irq array. In case we can't allocate we are doomed. We should BUG_ON instead of trying to dereference it later on. Acked-by: Ian Campbell [v1: Use BUG_ON instead of BUG] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 7523719bf8a..6b002cca1f5 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -1670,6 +1670,7 @@ void __init xen_init_IRQ(void) evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), GFP_KERNEL); + BUG_ON(!evtchn_to_irq); for (i = 0; i < NR_EVENT_CHANNELS; i++) evtchn_to_irq[i] = -1; -- cgit v1.2.3 From 9bb9efe4bab8a877cdde5c6bfbfa202645517571 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 29 Sep 2011 13:13:30 -0400 Subject: xen/events: Don't check the info for NULL as it is already done. The list operation checks whether the 'info' structure that is retrieved from the list is NULL (otherwise it would not been able to retrieve it). This check is not neccessary. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 6b002cca1f5..503614f2c12 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -779,7 +779,7 @@ int xen_irq_from_pirq(unsigned pirq) mutex_lock(&irq_mapping_update_lock); list_for_each_entry(info, &xen_irq_list_head, list) { - if (info == NULL || info->type != IRQT_PIRQ) + if (info->type != IRQT_PIRQ) continue; irq = info->irq; if (info->u.pirq.pirq == pirq) -- cgit v1.2.3 From e6599225db36bbdc991d1cc8fbfcacb24f86cdb5 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 29 Sep 2011 13:26:45 -0400 Subject: xen/irq: If we fail during msi_capability_init return proper error code. There are three different modes: PV, HVM, and initial domain 0. In all the cases we would return -1 for failure instead of a proper error code. Fix this by propagating the error code from the generic IRQ code. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 503614f2c12..212a5c871bf 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -432,7 +432,8 @@ static int __must_check xen_allocate_irq_dynamic(void) irq = irq_alloc_desc_from(first, -1); - xen_irq_init(irq); + if (irq >= 0) + xen_irq_init(irq); return irq; } @@ -713,7 +714,7 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, mutex_lock(&irq_mapping_update_lock); irq = xen_allocate_irq_dynamic(); - if (irq == -1) + if (irq < 0) goto out; irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_edge_irq, @@ -729,7 +730,7 @@ out: error_irq: mutex_unlock(&irq_mapping_update_lock); xen_free_irq(irq); - return -1; + return ret; } #endif -- cgit v1.2.3 From d98b15db376b9cc35f74fd2bd432b9fc287a5999 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 29 Sep 2011 13:16:17 -0400 Subject: xen/xenbus: Remove the unnecessary check. .. we check whether 'xdev' is NULL - but there is no need for it as the 'dev' check is done before. The 'dev' is embedded in the 'xdev' so having xdev != NULL with dev being being checked is not going to happen. Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/xenbus/xenbus_probe_backend.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers') diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index 60adf919d78..32417b5064f 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -104,8 +104,6 @@ static int xenbus_uevent_backend(struct device *dev, xdev = to_xenbus_device(dev); bus = container_of(xdev->dev.bus, struct xen_bus_type, bus); - if (xdev == NULL) - return -ENODEV; if (add_uevent_var(env, "MODALIAS=xen-backend:%s", xdev->devicetype)) return -ENOMEM; -- cgit v1.2.3