author     Gerald Schaefer <gerald.schaefer@de.ibm.com>    2016-04-15 17:38:40 +0300
committer  Martin Schwidefsky <schwidefsky@de.ibm.com>     2016-04-21 10:50:09 +0300
commit     723cacbd9dc79582e562c123a0bacf8bfc69e72a (patch)
tree       cae93823c3e8a37ef012b85b38d48e9e1bc19762 /arch/s390/include
parent     dba599091c191d209b1499511a524ad9657c0e5a (diff)
download   linux-723cacbd9dc79582e562c123a0bacf8bfc69e72a.tar.xz
s390/mm: fix asce_bits handling with dynamic pagetable levels
There is a race with multi-threaded applications between context switch and
pagetable upgrade. In switch_mm() a new user_asce is built from mm->pgd and
mm->context.asce_bits, w/o holding any locks. A concurrent mmap with a
pagetable upgrade on another thread in crst_table_upgrade() could already
have set new asce_bits, but not yet the new mm->pgd. This would result in a
corrupt user_asce in switch_mm(), and eventually in a kernel panic from a
translation exception.
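To make the window concrete, here is a minimal userspace sketch of the old
two-field pattern (not kernel code; the field names mirror the description
above, and the bit values are invented purely for illustration). A reader
that composes the ASCE between the writer's two stores gets a value that
matches neither the old nor the new state:

#include <stdio.h>

/* stand-in for mm->context: two fields updated one after the other */
struct mm_context {
	unsigned long asce_bits;	/* table type/length bits */
	unsigned long pgd;		/* page table origin */
};

/* old switch_mm() pattern: compose the ASCE from both fields */
static unsigned long build_user_asce(const struct mm_context *ctx)
{
	return ctx->asce_bits | ctx->pgd;
}

int main(void)
{
	/* hypothetical values, chosen only to make the mixture visible */
	struct mm_context ctx = { .asce_bits = 0x00b, .pgd = 0x1000 };

	/* crst_table_upgrade() on thread B, store 1: new asce_bits */
	ctx.asce_bits = 0x007;

	/* a context switch on thread A lands here and combines the
	 * new bits with the OLD pgd: a corrupt user_asce */
	printf("torn asce:       %#lx\n", build_user_asce(&ctx));

	/* store 2: new pgd -- only now is the pair consistent again */
	ctx.pgd = 0x2000;
	printf("consistent asce: %#lx\n", build_user_asce(&ctx));
	return 0;
}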
Fix this by storing the complete asce instead of just the asce_bits, which
can then be read atomically from switch_mm(), so that it either sees the
old value or the new value, but no mixture. Both cases are OK. Having the
old value would result in a page fault on access to the higher level memory,
but the fault handler would see the new mm->pgd, if it was a valid access
after the mmap on the other thread has completed. So as worst-case scenario
we would have a page fault loop for the racing thread until the next time
slice.
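The fixed pattern, in the same toy model (again a sketch with made-up
values, not the kernel source): the writer builds the complete word first
and publishes it with one store, and since an aligned unsigned long store
and load are single accesses, the reader sees either the old or the new
ASCE, never a mixture:

#include <stdio.h>

struct mm_context {
	unsigned long asce;	/* pgd origin and asce bits in one word */
};

int main(void)
{
	struct mm_context ctx = { .asce = 0x1000 | 0x00b };

	/* crst_table_upgrade(): build the complete new value first,
	 * then publish it with a single aligned store */
	ctx.asce = 0x2000 | 0x007;

	/* switch_mm(): a single aligned load -- old value or new
	 * value, but never a mixture of the two */
	unsigned long user_asce = ctx.asce;
	printf("user_asce: %#lx\n", user_asce);
	return 0;
}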
Also remove dead code and simplify the upgrade/downgrade path: there are no
upgrades from 2 levels, and only downgrades from 3 levels for compat tasks.
There are also no concurrent upgrades, because the mmap_sem is held with
down_write() in do_mmap, so the flush and table checks during upgrade can
be removed.
Reported-by: Michael Munday <munday@ca.ibm.com>
Reviewed-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
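The upgrade-path simplification itself lands in arch/s390/mm, outside the
include/ diffstat shown below. As a rough, hypothetical sketch of what the
simplified 3-to-4-level upgrade could look like after this patch (assuming
helpers such as crst_table_alloc(), crst_table_init(), pgd_populate() and
an __crst_table_upgrade() flush callback, as in the surrounding s390 code):

/* sketch only -- the real change lives outside arch/s390/include */
int crst_table_upgrade(struct mm_struct *mm)
{
	unsigned long *table;

	/* with this patch, upgrades only go from 3 to 4 levels */
	BUG_ON(mm->context.asce_limit != (1UL << 42));

	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;

	spin_lock_bh(&mm->page_table_lock);
	crst_table_init(table, _REGION2_ENTRY_EMPTY);
	pgd_populate(mm, (pgd_t *) table, (pud_t *) mm->pgd);
	mm->pgd = (pgd_t *) table;
	mm->context.asce_limit = 1UL << 53;
	/* the complete asce is built under the lock and published
	 * as one word, which switch_mm() can read atomically */
	mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			   _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
	spin_unlock_bh(&mm->page_table_lock);

	on_each_cpu(__crst_table_upgrade, mm, 0);
	return 0;
}

No flush or table re-checks are needed here, since mmap_sem held with
down_write() rules out concurrent upgrades, as noted above.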
Diffstat (limited to 'arch/s390/include')
-rw-r--r--  arch/s390/include/asm/mmu.h          |  2
-rw-r--r--  arch/s390/include/asm/mmu_context.h  | 28
-rw-r--r--  arch/s390/include/asm/pgalloc.h      |  4
-rw-r--r--  arch/s390/include/asm/processor.h    |  2
-rw-r--r--  arch/s390/include/asm/tlbflush.h     |  9
5 files changed, 29 insertions, 16 deletions
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index d29ad9545b41..081b2ad99d73 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -11,7 +11,7 @@ typedef struct {
 	spinlock_t list_lock;
 	struct list_head pgtable_list;
 	struct list_head gmap_list;
-	unsigned long asce_bits;
+	unsigned long asce;
 	unsigned long asce_limit;
 	unsigned long vdso_base;
 	/* The mmu context allocates 4K page tables. */
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index d321469eeda7..c837b79b455d 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -26,12 +26,28 @@ static inline int init_new_context(struct task_struct *tsk,
 	mm->context.has_pgste = 0;
 	mm->context.use_skey = 0;
 #endif
-	if (mm->context.asce_limit == 0) {
+	switch (mm->context.asce_limit) {
+	case 1UL << 42:
+		/*
+		 * forked 3-level task, fall through to set new asce with new
+		 * mm->pgd
+		 */
+	case 0:
 		/* context created by exec, set asce limit to 4TB */
-		mm->context.asce_bits = _ASCE_TABLE_LENGTH |
-			_ASCE_USER_BITS | _ASCE_TYPE_REGION3;
 		mm->context.asce_limit = STACK_TOP_MAX;
-	} else if (mm->context.asce_limit == (1UL << 31)) {
+		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+				   _ASCE_USER_BITS | _ASCE_TYPE_REGION3;
+		break;
+	case 1UL << 53:
+		/* forked 4-level task, set new asce with new mm->pgd */
+		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+				   _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
+		break;
+	case 1UL << 31:
+		/* forked 2-level compat task, set new asce with new mm->pgd */
+		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+				   _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
+		/* pgd_alloc() did not increase mm->nr_pmds */
 		mm_inc_nr_pmds(mm);
 	}
 	crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
@@ -42,7 +58,7 @@ static inline int init_new_context(struct task_struct *tsk,
 
 static inline void set_user_asce(struct mm_struct *mm)
 {
-	S390_lowcore.user_asce = mm->context.asce_bits | __pa(mm->pgd);
+	S390_lowcore.user_asce = mm->context.asce;
 	if (current->thread.mm_segment.ar4)
 		__ctl_load(S390_lowcore.user_asce, 7, 7);
 	set_cpu_flag(CIF_ASCE);
@@ -71,7 +87,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 {
 	int cpu = smp_processor_id();
 
-	S390_lowcore.user_asce = next->context.asce_bits | __pa(next->pgd);
+	S390_lowcore.user_asce = next->context.asce;
 	if (prev == next)
 		return;
 	if (MACHINE_HAS_TLB_LC)
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 9b3d9b6099f2..da34cb6b1f3b 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -52,8 +52,8 @@ static inline unsigned long pgd_entry_type(struct mm_struct *mm)
 	return _REGION2_ENTRY_EMPTY;
 }
 
-int crst_table_upgrade(struct mm_struct *, unsigned long limit);
-void crst_table_downgrade(struct mm_struct *, unsigned long limit);
+int crst_table_upgrade(struct mm_struct *);
+void crst_table_downgrade(struct mm_struct *);
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address)
 {
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index d6fd22ea270d..18cdede1aeda 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -175,7 +175,7 @@ extern __vector128 init_task_fpu_regs[__NUM_VXRS];
 	regs->psw.mask	= PSW_USER_BITS | PSW_MASK_BA;	\
 	regs->psw.addr	= new_psw;			\
 	regs->gprs[15]	= new_stackp;			\
-	crst_table_downgrade(current->mm, 1UL << 31);	\
+	crst_table_downgrade(current->mm);		\
 	execve_tail();					\
 } while (0)
 
diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h
index ca148f7c3eaa..a2e6ef32e054 100644
--- a/arch/s390/include/asm/tlbflush.h
+++ b/arch/s390/include/asm/tlbflush.h
@@ -110,8 +110,7 @@ static inline void __tlb_flush_asce(struct mm_struct *mm, unsigned long asce)
 static inline void __tlb_flush_kernel(void)
 {
 	if (MACHINE_HAS_IDTE)
-		__tlb_flush_idte((unsigned long) init_mm.pgd |
-				 init_mm.context.asce_bits);
+		__tlb_flush_idte(init_mm.context.asce);
 	else
 		__tlb_flush_global();
 }
@@ -133,8 +132,7 @@ static inline void __tlb_flush_asce(struct mm_struct *mm, unsigned long asce)
 static inline void __tlb_flush_kernel(void)
 {
 	if (MACHINE_HAS_TLB_LC)
-		__tlb_flush_idte_local((unsigned long) init_mm.pgd |
-				       init_mm.context.asce_bits);
+		__tlb_flush_idte_local(init_mm.context.asce);
 	else
 		__tlb_flush_local();
 }
@@ -148,8 +146,7 @@ static inline void __tlb_flush_mm(struct mm_struct * mm)
 	 * only ran on the local cpu.
 	 */
 	if (MACHINE_HAS_IDTE && list_empty(&mm->context.gmap_list))
-		__tlb_flush_asce(mm, (unsigned long) mm->pgd |
-				 mm->context.asce_bits);
+		__tlb_flush_asce(mm, mm->context.asce);
 	else
 		__tlb_flush_full(mm);
 }