From e4b866ed197cef9989348e0479fed8d864ea465b Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 9 Jan 2009 16:13:11 -0800 Subject: x86 PAT: change track_pfn_vma_new to take pgprot_t pointer param Impact: cleanup Change the protection parameter for track_pfn_vma_new() into a pgprot_t pointer. Subsequent patch changes the x86 PAT handling to return a compatible memtype in pgprot_t, if what was requested cannot be allowed due to conflicts. No fuctionality change in this patch. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm/pat.c') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 85cbd3cd3723..f88ac80530c0 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -741,7 +741,7 @@ cleanup_ret: * Note that this function can be called with caller trying to map only a * subrange/page inside the vma. */ -int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, +int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long size) { int retval = 0; @@ -758,14 +758,14 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t prot, if (is_linear_pfn_mapping(vma)) { /* reserve the whole chunk starting from vm_pgoff */ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; - return reserve_pfn_range(paddr, vma_size, prot); + return reserve_pfn_range(paddr, vma_size, *prot); } /* reserve page by page using pfn and size */ base_paddr = (resource_size_t)pfn << PAGE_SHIFT; for (i = 0; i < size; i += PAGE_SIZE) { paddr = base_paddr + i; - retval = reserve_pfn_range(paddr, PAGE_SIZE, prot); + retval = reserve_pfn_range(paddr, PAGE_SIZE, *prot); if (retval) goto cleanup_ret; } -- cgit v1.2.3 From cdecff6864a1cd352a41d44a65e7451b8ef5cee2 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 9 Jan 2009 16:13:12 -0800 Subject: x86 PAT: return compatible mapping to remap_pfn_range callers Impact: avoid warning message, potentially solve 3D performance regression Change x86 PAT code to return compatible memtype if the exact memtype that was requested in remap_pfn_rage and friends is not available due to some conflict. This is done by returning the compatible type in pgprot parameter of track_pfn_vma_new(), and the caller uses that memtype for page table. Note that track_pfn_vma_copy() which is basically called during fork gets the prot from existing page table and should not have any conflict. Hence we use strict memtype check there and do not allow compatible memtypes. This patch fixes the bug reported here: http://marc.info/?l=linux-kernel&m=123108883716357&w=2 Specifically the error message: X:5010 map pfn expected mapping type write-back for d0000000-d0101000, got write-combining Should go away. Reported-and-bisected-by: Kevin Winchester Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) (limited to 'arch/x86/mm/pat.c') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index f88ac80530c0..8b08fb955274 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -601,12 +601,13 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) * Reserved non RAM regions only and after successful reserve_memtype, * this func also keeps identity mapping (if any) in sync with this new prot. */ -static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t vma_prot) +static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, + int strict_prot) { int is_ram = 0; int id_sz, ret; unsigned long flags; - unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); + unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); is_ram = pagerange_is_ram(paddr, paddr + size); @@ -625,15 +626,24 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t vma_prot) return ret; if (flags != want_flags) { - free_memtype(paddr, paddr + size); - printk(KERN_ERR - "%s:%d map pfn expected mapping type %s for %Lx-%Lx, got %s\n", - current->comm, current->pid, - cattr_name(want_flags), - (unsigned long long)paddr, - (unsigned long long)(paddr + size), - cattr_name(flags)); - return -EINVAL; + if (strict_prot || !is_new_memtype_allowed(want_flags, flags)) { + free_memtype(paddr, paddr + size); + printk(KERN_ERR "%s:%d map pfn expected mapping type %s" + " for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size), + cattr_name(flags)); + return -EINVAL; + } + /* + * We allow returning different type than the one requested in + * non strict case. + */ + *vma_prot = __pgprot((pgprot_val(*vma_prot) & + (~_PAGE_CACHE_MASK)) | + flags); } /* Need to keep identity mapping in sync */ @@ -689,6 +699,7 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) unsigned long vma_start = vma->vm_start; unsigned long vma_end = vma->vm_end; unsigned long vma_size = vma_end - vma_start; + pgprot_t pgprot; if (!pat_enabled) return 0; @@ -702,7 +713,8 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) WARN_ON_ONCE(1); return -EINVAL; } - return reserve_pfn_range(paddr, vma_size, __pgprot(prot)); + pgprot = __pgprot(prot); + return reserve_pfn_range(paddr, vma_size, &pgprot, 1); } /* reserve entire vma page by page, using pfn and prot from pte */ @@ -710,7 +722,8 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) if (follow_phys(vma, vma_start + i, 0, &prot, &paddr)) continue; - retval = reserve_pfn_range(paddr, PAGE_SIZE, __pgprot(prot)); + pgprot = __pgprot(prot); + retval = reserve_pfn_range(paddr, PAGE_SIZE, &pgprot, 1); if (retval) goto cleanup_ret; } @@ -758,14 +771,14 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, if (is_linear_pfn_mapping(vma)) { /* reserve the whole chunk starting from vm_pgoff */ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; - return reserve_pfn_range(paddr, vma_size, *prot); + return reserve_pfn_range(paddr, vma_size, prot, 0); } /* reserve page by page using pfn and size */ base_paddr = (resource_size_t)pfn << PAGE_SHIFT; for (i = 0; i < size; i += PAGE_SIZE) { paddr = base_paddr + i; - retval = reserve_pfn_range(paddr, PAGE_SIZE, *prot); + retval = reserve_pfn_range(paddr, PAGE_SIZE, prot, 0); if (retval) goto cleanup_ret; } -- cgit v1.2.3 From 58dab916dfb57328d50deb0aa9b3fc92efa248ff Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 9 Jan 2009 16:13:14 -0800 Subject: x86 PAT: remove CPA WARN_ON for zero pte Impact: reduce scope of debug check - avoid warnings The logic to find whether identity map exists or not using high_memory or max_low_pfn_mapped/max_pfn_mapped are not complete as the memory withing the range may not be mapped if there is a unusable hole in e820. Specifically, on my test system I started seeing these warnings with tools like hwinfo, acpidump trying to map ACPI region. [ 27.400018] ------------[ cut here ]------------ [ 27.400344] WARNING: at /home/venkip/src/linus/linux-2.6/arch/x86/mm/pageattr.c:560 __change_page_attr_set_clr+0xf3/0x8b8() [ 27.400821] Hardware name: X7DB8 [ 27.401070] CPA: called for zero pte. vaddr = ffff8800cff6a000 cpa->vaddr = ffff8800cff6a000 [ 27.401569] Modules linked in: [ 27.401882] Pid: 4913, comm: dmidecode Not tainted 2.6.28-05716-gfe0bdec #586 [ 27.402141] Call Trace: [ 27.402488] [] warn_slowpath+0xd3/0x10f [ 27.402749] [] ? find_get_page+0xb3/0xc9 [ 27.403028] [] ? find_get_page+0x0/0xc9 [ 27.403333] [] __change_page_attr_set_clr+0xf3/0x8b8 [ 27.403628] [] ? __purge_vmap_area_lazy+0x192/0x1a1 [ 27.403883] [] ? __purge_vmap_area_lazy+0x4b/0x1a1 [ 27.404172] [] ? vm_unmap_aliases+0x1ab/0x1bb [ 27.404512] [] ? vm_unmap_aliases+0x48/0x1bb [ 27.404766] [] change_page_attr_set_clr+0x13e/0x2e6 [ 27.405026] [] ? _spin_unlock+0x26/0x2a [ 27.405292] [] ? reserve_memtype+0x19b/0x4e3 [ 27.405590] [] _set_memory_wb+0x22/0x24 [ 27.405844] [] ioremap_change_attr+0x26/0x28 [ 27.406097] [] reserve_pfn_range+0x1a3/0x235 [ 27.406427] [] track_pfn_vma_new+0x49/0xb3 [ 27.406686] [] remap_pfn_range+0x94/0x32c [ 27.406940] [] ? phys_mem_access_prot_allowed+0xb5/0x1a8 [ 27.407209] [] mmap_mem+0x75/0x9d [ 27.407523] [] mmap_region+0x2cf/0x53e [ 27.407776] [] do_mmap_pgoff+0x2a9/0x30d [ 27.408034] [] sys_mmap+0x92/0xce [ 27.408339] [] system_call_fastpath+0x16/0x1b [ 27.408614] ---[ end trace 4b16ad70c09a602d ]--- [ 27.408871] dmidecode:4913 reserve_pfn_range ioremap_change_attr failed write-back for cff6a000-cff6b000 This is wih track_pfn_vma_new trying to keep identity map in sync. The address cff6a000 is the ACPI region according to e820. [ 0.000000] BIOS-provided physical RAM map: [ 0.000000] BIOS-e820: 0000000000000000 - 000000000009c000 (usable) [ 0.000000] BIOS-e820: 000000000009c000 - 00000000000a0000 (reserved) [ 0.000000] BIOS-e820: 00000000000cc000 - 00000000000d0000 (reserved) [ 0.000000] BIOS-e820: 00000000000e4000 - 0000000000100000 (reserved) [ 0.000000] BIOS-e820: 0000000000100000 - 00000000cff60000 (usable) [ 0.000000] BIOS-e820: 00000000cff60000 - 00000000cff69000 (ACPI data) [ 0.000000] BIOS-e820: 00000000cff69000 - 00000000cff80000 (ACPI NVS) [ 0.000000] BIOS-e820: 00000000cff80000 - 00000000d0000000 (reserved) [ 0.000000] BIOS-e820: 00000000e0000000 - 00000000f0000000 (reserved) [ 0.000000] BIOS-e820: 00000000fec00000 - 00000000fec10000 (reserved) [ 0.000000] BIOS-e820: 00000000fee00000 - 00000000fee01000 (reserved) [ 0.000000] BIOS-e820: 00000000ff000000 - 0000000100000000 (reserved) [ 0.000000] BIOS-e820: 0000000100000000 - 0000000230000000 (usable) And is not mapped as per init_memory_mapping. [ 0.000000] init_memory_mapping: 0000000000000000-00000000cff60000 [ 0.000000] init_memory_mapping: 0000000100000000-0000000230000000 We can add logic to check for this. But, there can also be other holes in identity map when we have 1GB of aligned reserved space in e820. This patch handles it by removing the WARN_ON and returning a specific error value (EFAULT) to indicate that the address does not have any identity mapping. The code that tries to keep identity map in sync can ignore this error, with other callers of cpa still getting error here. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 10 ++++++---- arch/x86/mm/pat.c | 45 ++++++++++++++++++++++++++++++++------------- 2 files changed, 38 insertions(+), 17 deletions(-) (limited to 'arch/x86/mm/pat.c') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e89d24815f26..4cf30dee8161 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -555,10 +555,12 @@ repeat: if (!pte_val(old_pte)) { if (!primary) return 0; - WARN(1, KERN_WARNING "CPA: called for zero pte. " - "vaddr = %lx cpa->vaddr = %lx\n", address, - *cpa->vaddr); - return -EINVAL; + + /* + * Special error value returned, indicating that the mapping + * did not exist at this address. + */ + return -EFAULT; } if (level == PG_LEVEL_4K) { diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 8b08fb955274..160c42d3eb8f 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -505,6 +505,35 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) } #endif /* CONFIG_STRICT_DEVMEM */ +/* + * Change the memory type for the physial address range in kernel identity + * mapping space if that range is a part of identity map. + */ +static int kernel_map_sync_memtype(u64 base, unsigned long size, + unsigned long flags) +{ + unsigned long id_sz; + int ret; + + if (!pat_enabled || base >= __pa(high_memory)) + return 0; + + id_sz = (__pa(high_memory) < base + size) ? + __pa(high_memory) - base : + size; + + ret = ioremap_change_attr((unsigned long)__va(base), id_sz, flags); + /* + * -EFAULT return means that the addr was not valid and did not have + * any identity mapping. That case is a success for + * kernel_map_sync_memtype. + */ + if (ret == -EFAULT) + ret = 0; + + return ret; +} + int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) { @@ -555,9 +584,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, if (retval < 0) return 0; - if (((pfn < max_low_pfn_mapped) || - (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) && - ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { + if (kernel_map_sync_memtype(offset, size, flags)) { free_memtype(offset, offset + size); printk(KERN_INFO "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", @@ -605,7 +632,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, int strict_prot) { int is_ram = 0; - int id_sz, ret; + int ret; unsigned long flags; unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); @@ -646,15 +673,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, flags); } - /* Need to keep identity mapping in sync */ - if (paddr >= __pa(high_memory)) - return 0; - - id_sz = (__pa(high_memory) < paddr + size) ? - __pa(high_memory) - paddr : - size; - - if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) { + if (kernel_map_sync_memtype(paddr, size, flags)) { free_memtype(paddr, paddr + size); printk(KERN_ERR "%s:%d reserve_pfn_range ioremap_change_attr failed %s " -- cgit v1.2.3 From b5db0e38653bfada34a92f360b4111566ede3842 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 15 Jan 2009 15:32:12 -0800 Subject: Revert "x86 PAT: remove CPA WARN_ON for zero pte" This reverts commit 58dab916dfb57328d50deb0aa9b3fc92efa248ff, which makes my Nehalem come to a nasty crawling almost-halt. It looks like it turns off caching of regular kernel RAM, with the understandable slowdown of a few orders of magnitude as a result. Acked-by: Ingo Molnar Cc: Yinghai Lu Cc: Peter Anvin Cc: Venkatesh Pallipadi Cc: Suresh Siddha Signed-off-by: Linus Torvalds --- arch/x86/mm/pageattr.c | 10 ++++------ arch/x86/mm/pat.c | 45 +++++++++++++-------------------------------- 2 files changed, 17 insertions(+), 38 deletions(-) (limited to 'arch/x86/mm/pat.c') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 4cf30dee8161..e89d24815f26 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -555,12 +555,10 @@ repeat: if (!pte_val(old_pte)) { if (!primary) return 0; - - /* - * Special error value returned, indicating that the mapping - * did not exist at this address. - */ - return -EFAULT; + WARN(1, KERN_WARNING "CPA: called for zero pte. " + "vaddr = %lx cpa->vaddr = %lx\n", address, + *cpa->vaddr); + return -EINVAL; } if (level == PG_LEVEL_4K) { diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 160c42d3eb8f..8b08fb955274 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -505,35 +505,6 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) } #endif /* CONFIG_STRICT_DEVMEM */ -/* - * Change the memory type for the physial address range in kernel identity - * mapping space if that range is a part of identity map. - */ -static int kernel_map_sync_memtype(u64 base, unsigned long size, - unsigned long flags) -{ - unsigned long id_sz; - int ret; - - if (!pat_enabled || base >= __pa(high_memory)) - return 0; - - id_sz = (__pa(high_memory) < base + size) ? - __pa(high_memory) - base : - size; - - ret = ioremap_change_attr((unsigned long)__va(base), id_sz, flags); - /* - * -EFAULT return means that the addr was not valid and did not have - * any identity mapping. That case is a success for - * kernel_map_sync_memtype. - */ - if (ret == -EFAULT) - ret = 0; - - return ret; -} - int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) { @@ -584,7 +555,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, if (retval < 0) return 0; - if (kernel_map_sync_memtype(offset, size, flags)) { + if (((pfn < max_low_pfn_mapped) || + (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) && + ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { free_memtype(offset, offset + size); printk(KERN_INFO "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", @@ -632,7 +605,7 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, int strict_prot) { int is_ram = 0; - int ret; + int id_sz, ret; unsigned long flags; unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK); @@ -673,7 +646,15 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, flags); } - if (kernel_map_sync_memtype(paddr, size, flags)) { + /* Need to keep identity mapping in sync */ + if (paddr >= __pa(high_memory)) + return 0; + + id_sz = (__pa(high_memory) < paddr + size) ? + __pa(high_memory) - paddr : + size; + + if (ioremap_change_attr((unsigned long)__va(paddr), id_sz, flags) < 0) { free_memtype(paddr, paddr + size); printk(KERN_ERR "%s:%d reserve_pfn_range ioremap_change_attr failed %s " -- cgit v1.2.3