mm/migrate: new memory migration helper for use with device memory

This patch add a new memory migration helpers, which migrate memory backing a range of virtual address of a process to different memory (which can be allocated through special allocator). It differs from numa migration by working on a range of virtual address and thus by doing migration in chunk that can be large enough to use DMA engine or special copy offloading engine. Expected users are any one with heterogeneous memory where different memory have different characteristics (latency, bandwidth, ...). As an example IBM platform with CAPI bus can make use of this feature to migrate between regular memory and CAPI device memory. New CPU architecture with a pool of high performance memory not manage as cache but presented as regular memory (while being faster and with lower latency than DDR) will also be prime user of this patch. Migration to private device memory will be useful for device that have large pool of such like GPU, NVidia plans to use HMM for that. Link: http://lkml.kernel.org/r/20170817000548.32038-15-jglisse@redhat.com Signed-off-by: Jérôme Glisse <jglisse@redhat.com> Signed-off-by: Evgeny Baskakov <ebaskakov@nvidia.com> Signed-off-by: John Hubbard <jhubbard@nvidia.com> Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com> Signed-off-by: Sherry Cheung <SCheung@nvidia.com> Signed-off-by: Subhash Gutti <sgutti@nvidia.com> Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com> Cc: Balbir Singh <bsingharora@gmail.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Dan Williams <dan.j.williams@intel.com> Cc: David Nellans <dnellans@nvidia.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Ross Zwisler <ross.zwisler@linux.intel.com> Cc: Vladimir Davydov <vdavydov.dev@gmail.com> Cc: Bob Liu <liubo95@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Jérôme Glisse <jglisse@redhat.com> 2017-09-09 02:12:09 +0300
committer: Linus Torvalds <torvalds@linux-foundation.org> 2017-09-09 04:26:46 +0300
commit: 8763cb45ab967a92a5ee49e9c544c0f0ea90e2d6 (patch)
tree: a2b5041d068fd69ee8a60c6c3ec8adb004ad0ced /mm
parent: 2916ecc0f9d435d849c98f4da50e453124c87531 (diff)
download: linux-8763cb45ab967a92a5ee49e9c544c0f0ea90e2d6.tar.xz
1 files changed, 492 insertions, 0 deletions
diff --git a/mm/migrate.c b/mm/migrate.c
index 71de36cfb673..991e8886093f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -428,6 +428,14 @@ int migrate_page_move_mapping(struct address_space *mapping,
 	int expected_count = 1 + extra_count;
 	void **pslot;
 
+	/*
+	 * ZONE_DEVICE pages have 1 refcount always held by their device
+	 *
+	 * Note that DAX memory will never reach that point as it does not have
+	 * the MEMORY_DEVICE_ALLOW_MIGRATE flag set (see memory_hotplug.h).
+	 */
+	expected_count += is_zone_device_page(page);
+
 	if (!mapping) {
 		/* Anonymous page without mapping */
 		if (page_count(page) != expected_count)
@@ -2106,3 +2114,487 @@ out_unlock:
 #endif /* CONFIG_NUMA_BALANCING */
 
 #endif /* CONFIG_NUMA */
+
+
+struct migrate_vma {
+	struct vm_area_struct	*vma;
+	unsigned long		*dst;
+	unsigned long		*src;
+	unsigned long		cpages;
+	unsigned long		npages;
+	unsigned long		start;
+	unsigned long		end;
+};
+
+static int migrate_vma_collect_hole(unsigned long start,
+				    unsigned long end,
+				    struct mm_walk *walk)
+{
+	struct migrate_vma *migrate = walk->private;
+	unsigned long addr;
+
+	for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+		migrate->dst[migrate->npages] = 0;
+		migrate->src[migrate->npages++] = 0;
+	}
+
+	return 0;
+}
+
+static int migrate_vma_collect_pmd(pmd_t *pmdp,
+				   unsigned long start,
+				   unsigned long end,
+				   struct mm_walk *walk)
+{
+	struct migrate_vma *migrate = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long addr = start;
+	spinlock_t *ptl;
+	pte_t *ptep;
+
+again:
+	if (pmd_none(*pmdp))
+		return migrate_vma_collect_hole(start, end, walk);
+
+	if (pmd_trans_huge(*pmdp)) {
+		struct page *page;
+
+		ptl = pmd_lock(mm, pmdp);
+		if (unlikely(!pmd_trans_huge(*pmdp))) {
+			spin_unlock(ptl);
+			goto again;
+		}
+
+		page = pmd_page(*pmdp);
+		if (is_huge_zero_page(page)) {
+			spin_unlock(ptl);
+			split_huge_pmd(vma, pmdp, addr);
+			if (pmd_trans_unstable(pmdp))
+				return migrate_vma_collect_hole(start, end,
+								walk);
+		} else {
+			int ret;
+
+			get_page(page);
+			spin_unlock(ptl);
+			if (unlikely(!trylock_page(page)))
+				return migrate_vma_collect_hole(start, end,
+								walk);
+			ret = split_huge_page(page);
+			unlock_page(page);
+			put_page(page);
+			if (ret || pmd_none(*pmdp))
+				return migrate_vma_collect_hole(start, end,
+								walk);
+		}
+	}
+
+	if (unlikely(pmd_bad(*pmdp)))
+		return migrate_vma_collect_hole(start, end, walk);
+
+	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+	for (; addr < end; addr += PAGE_SIZE, ptep++) {
+		unsigned long mpfn, pfn;
+		struct page *page;
+		pte_t pte;
+
+		pte = *ptep;
+		pfn = pte_pfn(pte);
+
+		if (!pte_present(pte)) {
+			mpfn = pfn = 0;
+			goto next;
+		}
+
+		/* FIXME support THP */
+		page = vm_normal_page(migrate->vma, addr, pte);
+		if (!page || !page->mapping || PageTransCompound(page)) {
+			mpfn = pfn = 0;
+			goto next;
+		}
+
+		/*
+		 * By getting a reference on the page we pin it and that blocks
+		 * any kind of migration. Side effect is that it "freezes" the
+		 * pte.
+		 *
+		 * We drop this reference after isolating the page from the lru
+		 * for non device page (device page are not on the lru and thus
+		 * can't be dropped from it).
+		 */
+		get_page(page);
+		migrate->cpages++;
+		mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+		mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
+
+next:
+		migrate->src[migrate->npages++] = mpfn;
+	}
+	pte_unmap_unlock(ptep - 1, ptl);
+
+	return 0;
+}
+
+/*
+ * migrate_vma_collect() - collect pages over a range of virtual addresses
+ * @migrate: migrate struct containing all migration information
+ *
+ * This will walk the CPU page table. For each virtual address backed by a
+ * valid page, it updates the src array and takes a reference on the page, in
+ * order to pin the page until we lock it and unmap it.
+ */
+static void migrate_vma_collect(struct migrate_vma *migrate)
+{
+	struct mm_walk mm_walk;
+
+	mm_walk.pmd_entry = migrate_vma_collect_pmd;
+	mm_walk.pte_entry = NULL;
+	mm_walk.pte_hole = migrate_vma_collect_hole;
+	mm_walk.hugetlb_entry = NULL;
+	mm_walk.test_walk = NULL;
+	mm_walk.vma = migrate->vma;
+	mm_walk.mm = migrate->vma->vm_mm;
+	mm_walk.private = migrate;
+
+	walk_page_range(migrate->start, migrate->end, &mm_walk);
+
+	migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
+}
+
+/*
+ * migrate_vma_check_page() - check if page is pinned or not
+ * @page: struct page to check
+ *
+ * Pinned pages cannot be migrated. This is the same test as in
+ * migrate_page_move_mapping(), except that here we allow migration of a
+ * ZONE_DEVICE page.
+ */
+static bool migrate_vma_check_page(struct page *page)
+{
+	/*
+	 * One extra ref because caller holds an extra reference, either from
+	 * isolate_lru_page() for a regular page, or migrate_vma_collect() for
+	 * a device page.
+	 */
+	int extra = 1;
+
+	/*
+	 * FIXME support THP (transparent huge page), it is bit more complex to
+	 * check them than regular pages, because they can be mapped with a pmd
+	 * or with a pte (split pte mapping).
+	 */
+	if (PageCompound(page))
+		return false;
+
+	if ((page_count(page) - extra) > page_mapcount(page))
+		return false;
+
+	return true;
+}
+
+/*
+ * migrate_vma_prepare() - lock pages and isolate them from the lru
+ * @migrate: migrate struct containing all migration information
+ *
+ * This locks pages that have been collected by migrate_vma_collect(). Once each
+ * page is locked it is isolated from the lru (for non-device pages). Finally,
+ * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be
+ * migrated by concurrent kernel threads.
+ */
+static void migrate_vma_prepare(struct migrate_vma *migrate)
+{
+	const unsigned long npages = migrate->npages;
+	bool allow_drain = true;
+	unsigned long i;
+
+	lru_add_drain();
+
+	for (i = 0; (i < npages) && migrate->cpages; i++) {
+		struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+		if (!page)
+			continue;
+
+		/*
+		 * Because we are migrating several pages there can be
+		 * a deadlock between 2 concurrent migration where each
+		 * are waiting on each other page lock.
+		 *
+		 * Make migrate_vma() a best effort thing and backoff
+		 * for any page we can not lock right away.
+		 */
+		if (!trylock_page(page)) {
+			migrate->src[i] = 0;
+			migrate->cpages--;
+			put_page(page);
+			continue;
+		}
+		migrate->src[i] |= MIGRATE_PFN_LOCKED;
+
+		if (!PageLRU(page) && allow_drain) {
+			/* Drain CPU's pagevec */
+			lru_add_drain_all();
+			allow_drain = false;
+		}
+
+		if (isolate_lru_page(page)) {
+			migrate->src[i] = 0;
+			unlock_page(page);
+			migrate->cpages--;
+			put_page(page);
+			continue;
+		}
+
+		if (!migrate_vma_check_page(page)) {
+			migrate->src[i] = 0;
+			unlock_page(page);
+			migrate->cpages--;
+
+			putback_lru_page(page);
+		}
+	}
+}
+
+/*
+ * migrate_vma_unmap() - replace page mapping with special migration pte entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * Replace page mapping (CPU page table pte) with a special migration pte entry
+ * and check again if it has been pinned. Pinned pages are restored because we
+ * cannot migrate them.
+ *
+ * This is the last step before we call the device driver callback to allocate
+ * destination memory and copy contents of original page over to new page.
+ */
+static void migrate_vma_unmap(struct migrate_vma *migrate)
+{
+	int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+	const unsigned long npages = migrate->npages;
+	const unsigned long start = migrate->start;
+	unsigned long addr, i, restore = 0;
+
+	for (i = 0; i < npages; i++) {
+		struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+		if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
+			continue;
+
+		try_to_unmap(page, flags);
+		if (page_mapped(page) || !migrate_vma_check_page(page)) {
+			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+			migrate->cpages--;
+			restore++;
+		}
+	}
+
+	for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
+		struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+		if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
+			continue;
+
+		remove_migration_ptes(page, page, false);
+
+		migrate->src[i] = 0;
+		unlock_page(page);
+		restore--;
+
+		putback_lru_page(page);
+	}
+}
+
+/*
+ * migrate_vma_pages() - migrate meta-data from src page to dst page
+ * @migrate: migrate struct containing all migration information
+ *
+ * This migrates struct page meta-data from source struct page to destination
+ * struct page. This effectively finishes the migration from source page to the
+ * destination page.
+ */
+static void migrate_vma_pages(struct migrate_vma *migrate)
+{
+	const unsigned long npages = migrate->npages;
+	const unsigned long start = migrate->start;
+	unsigned long addr, i;
+
+	for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
+		struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
+		struct page *page = migrate_pfn_to_page(migrate->src[i]);
+		struct address_space *mapping;
+		int r;
+
+		if (!page || !newpage)
+			continue;
+		if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
+			continue;
+
+		mapping = page_mapping(page);
+
+		r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
+		if (r != MIGRATEPAGE_SUCCESS)
+			migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+	}
+}
+
+/*
+ * migrate_vma_finalize() - restore CPU page table entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * This replaces the special migration pte entry with either a mapping to the
+ * new page if migration was successful for that page, or to the original page
+ * otherwise.
+ *
+ * This also unlocks the pages and puts them back on the lru, or drops the extra
+ * refcount, for device pages.
+ */
+static void migrate_vma_finalize(struct migrate_vma *migrate)
+{
+	const unsigned long npages = migrate->npages;
+	unsigned long i;
+
+	for (i = 0; i < npages; i++) {
+		struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
+		struct page *page = migrate_pfn_to_page(migrate->src[i]);
+
+		if (!page)
+			continue;
+		if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
+			if (newpage) {
+				unlock_page(newpage);
+				put_page(newpage);
+			}
+			newpage = page;
+		}
+
+		remove_migration_ptes(page, newpage, false);
+		unlock_page(page);
+		migrate->cpages--;
+
+		putback_lru_page(page);
+
+		if (newpage != page) {
+			unlock_page(newpage);
+			putback_lru_page(newpage);
+		}
+	}
+}
+
+/*
+ * migrate_vma() - migrate a range of memory inside vma
+ *
+ * @ops: migration callback for allocating destination memory and copying
+ * @vma: virtual memory area containing the range to be migrated
+ * @start: start address of the range to migrate (inclusive)
+ * @end: end address of the range to migrate (exclusive)
+ * @src: array of hmm_pfn_t containing source pfns
+ * @dst: array of hmm_pfn_t containing destination pfns
+ * @private: pointer passed back to each of the callback
+ * Returns: 0 on success, error code otherwise
+ *
+ * This function tries to migrate a range of memory virtual address range, using
+ * callbacks to allocate and copy memory from source to destination. First it
+ * collects all the pages backing each virtual address in the range, saving this
+ * inside the src array. Then it locks those pages and unmaps them. Once the pages
+ * are locked and unmapped, it checks whether each page is pinned or not. Pages
+ * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
+ * in the corresponding src array entry. It then restores any pages that are
+ * pinned, by remapping and unlocking those pages.
+ *
+ * At this point it calls the alloc_and_copy() callback. For documentation on
+ * what is expected from that callback, see struct migrate_vma_ops comments in
+ * include/linux/migrate.h
+ *
+ * After the alloc_and_copy() callback, this function goes over each entry in
+ * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
+ * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
+ * then the function tries to migrate struct page information from the source
+ * struct page to the destination struct page. If it fails to migrate the struct
+ * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
+ * array.
+ *
+ * At this point all successfully migrated pages have an entry in the src
+ * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
+ * array entry with MIGRATE_PFN_VALID flag set.
+ *
+ * It then calls the finalize_and_map() callback. See comments for "struct
+ * migrate_vma_ops", in include/linux/migrate.h for details about
+ * finalize_and_map() behavior.
+ *
+ * After the finalize_and_map() callback, for successfully migrated pages, this
+ * function updates the CPU page table to point to new pages, otherwise it
+ * restores the CPU page table to point to the original source pages.
+ *
+ * Function returns 0 after the above steps, even if no pages were migrated
+ * (The function only returns an error if any of the arguments are invalid.)
+ *
+ * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT
+ * unsigned long entries.
+ */
+int migrate_vma(const struct migrate_vma_ops *ops,
+		struct vm_area_struct *vma,
+		unsigned long start,
+		unsigned long end,
+		unsigned long *src,
+		unsigned long *dst,
+		void *private)
+{
+	struct migrate_vma migrate;
+
+	/* Sanity check the arguments */
+	start &= PAGE_MASK;
+	end &= PAGE_MASK;
+	if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL))
+		return -EINVAL;
+	if (start < vma->vm_start || start >= vma->vm_end)
+		return -EINVAL;
+	if (end <= vma->vm_start || end > vma->vm_end)
+		return -EINVAL;
+	if (!ops || !src || !dst || start >= end)
+		return -EINVAL;
+
+	memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
+	migrate.src = src;
+	migrate.dst = dst;
+	migrate.start = start;
+	migrate.npages = 0;
+	migrate.cpages = 0;
+	migrate.end = end;
+	migrate.vma = vma;
+
+	/* Collect, and try to unmap source pages */
+	migrate_vma_collect(&migrate);
+	if (!migrate.cpages)
+		return 0;
+
+	/* Lock and isolate page */
+	migrate_vma_prepare(&migrate);
+	if (!migrate.cpages)
+		return 0;
+
+	/* Unmap pages */
+	migrate_vma_unmap(&migrate);
+	if (!migrate.cpages)
+		return 0;
+
+	/*
+	 * At this point pages are locked and unmapped, and thus they have
+	 * stable content and can safely be copied to destination memory that
+	 * is allocated by the callback.
+	 *
+	 * Note that migration can fail in migrate_vma_struct_page() for each
+	 * individual page.
+	 */
+	ops->alloc_and_copy(vma, src, dst, start, end, private);
+
+	/* This does the real migration of struct page */
+	migrate_vma_pages(&migrate);
+
+	ops->finalize_and_map(vma, src, dst, start, end, private);
+
+	/* Unlock and remap pages */
+	migrate_vma_finalize(&migrate);
+
+	return 0;
+}
+EXPORT_SYMBOL(migrate_vma);
author	Jérôme Glisse <jglisse@redhat.com>	2017-09-09 02:12:09 +0300
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-09-09 04:26:46 +0300
commit	8763cb45ab967a92a5ee49e9c544c0f0ea90e2d6 (patch)
tree	a2b5041d068fd69ee8a60c6c3ec8adb004ad0ced /mm
parent	2916ecc0f9d435d849c98f4da50e453124c87531 (diff)
download	linux-8763cb45ab967a92a5ee49e9c544c0f0ea90e2d6.tar.xz