diff options
Diffstat (limited to 'arch/s390/mm/pgalloc.c')
-rw-r--r-- | arch/s390/mm/pgalloc.c | 360 |
1 files changed, 360 insertions, 0 deletions
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c new file mode 100644 index 000000000000..f6c3de26cda8 --- /dev/null +++ b/arch/s390/mm/pgalloc.c @@ -0,0 +1,360 @@ +/* + * Page table allocation functions + * + * Copyright IBM Corp. 2016 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ + +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <asm/mmu_context.h> +#include <asm/pgalloc.h> +#include <asm/gmap.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> + +#ifdef CONFIG_PGSTE + +static int page_table_allocate_pgste_min = 0; +static int page_table_allocate_pgste_max = 1; +int page_table_allocate_pgste = 0; +EXPORT_SYMBOL(page_table_allocate_pgste); + +static struct ctl_table page_table_sysctl[] = { + { + .procname = "allocate_pgste", + .data = &page_table_allocate_pgste, + .maxlen = sizeof(int), + .mode = S_IRUGO | S_IWUSR, + .proc_handler = proc_dointvec, + .extra1 = &page_table_allocate_pgste_min, + .extra2 = &page_table_allocate_pgste_max, + }, + { } +}; + +static struct ctl_table page_table_sysctl_dir[] = { + { + .procname = "vm", + .maxlen = 0, + .mode = 0555, + .child = page_table_sysctl, + }, + { } +}; + +static int __init page_table_register_sysctl(void) +{ + return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM; +} +__initcall(page_table_register_sysctl); + +#endif /* CONFIG_PGSTE */ + +unsigned long *crst_table_alloc(struct mm_struct *mm) +{ + struct page *page = alloc_pages(GFP_KERNEL, 2); + + if (!page) + return NULL; + return (unsigned long *) page_to_phys(page); +} + +void crst_table_free(struct mm_struct *mm, unsigned long *table) +{ + free_pages((unsigned long) table, 2); +} + +static void __crst_table_upgrade(void *arg) +{ + struct mm_struct *mm = arg; + + if (current->active_mm == mm) { + clear_user_asce(); + set_user_asce(mm); + } + __tlb_flush_local(); +} + +int crst_table_upgrade(struct mm_struct *mm, unsigned long limit) +{ + unsigned long *table, *pgd; + unsigned long entry; + int flush; + + BUG_ON(limit > TASK_MAX_SIZE); + flush = 0; +repeat: + table = crst_table_alloc(mm); + if (!table) + return -ENOMEM; + spin_lock_bh(&mm->page_table_lock); + if (mm->context.asce_limit < limit) { + pgd = (unsigned long *) mm->pgd; + if (mm->context.asce_limit <= (1UL << 31)) { + entry = _REGION3_ENTRY_EMPTY; + mm->context.asce_limit = 1UL << 42; + mm->context.asce_bits = _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | + _ASCE_TYPE_REGION3; + } else { + entry = _REGION2_ENTRY_EMPTY; + mm->context.asce_limit = 1UL << 53; + mm->context.asce_bits = _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | + _ASCE_TYPE_REGION2; + } + crst_table_init(table, entry); + pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd); + mm->pgd = (pgd_t *) table; + mm->task_size = mm->context.asce_limit; + table = NULL; + flush = 1; + } + spin_unlock_bh(&mm->page_table_lock); + if (table) + crst_table_free(mm, table); + if (mm->context.asce_limit < limit) + goto repeat; + if (flush) + on_each_cpu(__crst_table_upgrade, mm, 0); + return 0; +} + +void crst_table_downgrade(struct mm_struct *mm, unsigned long limit) +{ + pgd_t *pgd; + + if (current->active_mm == mm) { + clear_user_asce(); + __tlb_flush_mm(mm); + } + while (mm->context.asce_limit > limit) { + pgd = mm->pgd; + switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) { + case _REGION_ENTRY_TYPE_R2: + mm->context.asce_limit = 1UL << 42; + mm->context.asce_bits = _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | + _ASCE_TYPE_REGION3; + break; + case _REGION_ENTRY_TYPE_R3: + mm->context.asce_limit = 1UL << 31; + mm->context.asce_bits = _ASCE_TABLE_LENGTH | + _ASCE_USER_BITS | + _ASCE_TYPE_SEGMENT; + break; + default: + BUG(); + } + mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN); + mm->task_size = mm->context.asce_limit; + crst_table_free(mm, (unsigned long *) pgd); + } + if (current->active_mm == mm) + set_user_asce(mm); +} + +static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) +{ + unsigned int old, new; + + do { + old = atomic_read(v); + new = old ^ bits; + } while (atomic_cmpxchg(v, old, new) != old); + return new; +} + +/* + * page table entry allocation/free routines. + */ +unsigned long *page_table_alloc(struct mm_struct *mm) +{ + unsigned long *table; + struct page *page; + unsigned int mask, bit; + + /* Try to get a fragment of a 4K page as a 2K page table */ + if (!mm_alloc_pgste(mm)) { + table = NULL; + spin_lock_bh(&mm->context.list_lock); + if (!list_empty(&mm->context.pgtable_list)) { + page = list_first_entry(&mm->context.pgtable_list, + struct page, lru); + mask = atomic_read(&page->_mapcount); + mask = (mask | (mask >> 4)) & 3; + if (mask != 3) { + table = (unsigned long *) page_to_phys(page); + bit = mask & 1; /* =1 -> second 2K */ + if (bit) + table += PTRS_PER_PTE; + atomic_xor_bits(&page->_mapcount, 1U << bit); + list_del(&page->lru); + } + } + spin_unlock_bh(&mm->context.list_lock); + if (table) + return table; + } + /* Allocate a fresh page */ + page = alloc_page(GFP_KERNEL|__GFP_REPEAT); + if (!page) + return NULL; + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } + /* Initialize page table */ + table = (unsigned long *) page_to_phys(page); + if (mm_alloc_pgste(mm)) { + /* Return 4K page table with PGSTEs */ + atomic_set(&page->_mapcount, 3); + clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); + clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2); + } else { + /* Return the first 2K fragment of the page */ + atomic_set(&page->_mapcount, 1); + clear_table(table, _PAGE_INVALID, PAGE_SIZE); + spin_lock_bh(&mm->context.list_lock); + list_add(&page->lru, &mm->context.pgtable_list); + spin_unlock_bh(&mm->context.list_lock); + } + return table; +} + +void page_table_free(struct mm_struct *mm, unsigned long *table) +{ + struct page *page; + unsigned int bit, mask; + + page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + if (!mm_alloc_pgste(mm)) { + /* Free 2K page table fragment of a 4K page */ + bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); + spin_lock_bh(&mm->context.list_lock); + mask = atomic_xor_bits(&page->_mapcount, 1U << bit); + if (mask & 3) + list_add(&page->lru, &mm->context.pgtable_list); + else + list_del(&page->lru); + spin_unlock_bh(&mm->context.list_lock); + if (mask != 0) + return; + } + + pgtable_page_dtor(page); + atomic_set(&page->_mapcount, -1); + __free_page(page); +} + +void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, + unsigned long vmaddr) +{ + struct mm_struct *mm; + struct page *page; + unsigned int bit, mask; + + mm = tlb->mm; + page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + if (mm_alloc_pgste(mm)) { + gmap_unlink(mm, table, vmaddr); + table = (unsigned long *) (__pa(table) | 3); + tlb_remove_table(tlb, table); + return; + } + bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); + spin_lock_bh(&mm->context.list_lock); + mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit); + if (mask & 3) + list_add_tail(&page->lru, &mm->context.pgtable_list); + else + list_del(&page->lru); + spin_unlock_bh(&mm->context.list_lock); + table = (unsigned long *) (__pa(table) | (1U << bit)); + tlb_remove_table(tlb, table); +} + +static void __tlb_remove_table(void *_table) +{ + unsigned int mask = (unsigned long) _table & 3; + void *table = (void *)((unsigned long) _table ^ mask); + struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + + switch (mask) { + case 0: /* pmd or pud */ + free_pages((unsigned long) table, 2); + break; + case 1: /* lower 2K of a 4K page table */ + case 2: /* higher 2K of a 4K page table */ + if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0) + break; + /* fallthrough */ + case 3: /* 4K page table with pgstes */ + pgtable_page_dtor(page); + atomic_set(&page->_mapcount, -1); + __free_page(page); + break; + } +} + +static void tlb_remove_table_smp_sync(void *arg) +{ + /* Simply deliver the interrupt */ +} + +static void tlb_remove_table_one(void *table) +{ + /* + * This isn't an RCU grace period and hence the page-tables cannot be + * assumed to be actually RCU-freed. + * + * It is however sufficient for software page-table walkers that rely + * on IRQ disabling. See the comment near struct mmu_table_batch. + */ + smp_call_function(tlb_remove_table_smp_sync, NULL, 1); + __tlb_remove_table(table); +} + +static void tlb_remove_table_rcu(struct rcu_head *head) +{ + struct mmu_table_batch *batch; + int i; + + batch = container_of(head, struct mmu_table_batch, rcu); + + for (i = 0; i < batch->nr; i++) + __tlb_remove_table(batch->tables[i]); + + free_page((unsigned long)batch); +} + +void tlb_table_flush(struct mmu_gather *tlb) +{ + struct mmu_table_batch **batch = &tlb->batch; + + if (*batch) { + call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); + *batch = NULL; + } +} + +void tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + struct mmu_table_batch **batch = &tlb->batch; + + tlb->mm->context.flush_mm = 1; + if (*batch == NULL) { + *batch = (struct mmu_table_batch *) + __get_free_page(GFP_NOWAIT | __GFP_NOWARN); + if (*batch == NULL) { + __tlb_flush_mm_lazy(tlb->mm); + tlb_remove_table_one(table); + return; + } + (*batch)->nr = 0; + } + (*batch)->tables[(*batch)->nr++] = table; + if ((*batch)->nr == MAX_TABLE_BATCH) + tlb_flush_mmu(tlb); +} |