summaryrefslogtreecommitdiff
path: root/arch/s390/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/s390/mm')
-rw-r--r--arch/s390/mm/Makefile4
-rw-r--r--arch/s390/mm/cmm.c10
-rw-r--r--arch/s390/mm/dump_pagetables.c54
-rw-r--r--arch/s390/mm/extable.c77
-rw-r--r--arch/s390/mm/extmem.c27
-rw-r--r--arch/s390/mm/fault.c37
-rw-r--r--arch/s390/mm/gmap.c857
-rw-r--r--arch/s390/mm/gmap_helpers.c221
-rw-r--r--arch/s390/mm/hugetlbpage.c11
-rw-r--r--arch/s390/mm/init.c49
-rw-r--r--arch/s390/mm/maccess.c1
-rw-r--r--arch/s390/mm/mmap.c51
-rw-r--r--arch/s390/mm/pageattr.c15
-rw-r--r--arch/s390/mm/pfault.c5
-rw-r--r--arch/s390/mm/pgalloc.c74
-rw-r--r--arch/s390/mm/pgtable.c91
-rw-r--r--arch/s390/mm/vmem.c19
17 files changed, 637 insertions, 966 deletions
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index f6c2db7a8669..bd0401cc7ca5 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -9,6 +9,8 @@ obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o
obj-$(CONFIG_CMM) += cmm.o
obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o
+obj-$(CONFIG_PTDUMP) += dump_pagetables.o
obj-$(CONFIG_PGSTE) += gmap.o
obj-$(CONFIG_PFAULT) += pfault.o
+
+obj-$(subst m,y,$(CONFIG_KVM)) += gmap_helpers.o
diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index d01724a715d0..e2a6eb92420f 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -201,10 +201,10 @@ static void cmm_set_timer(void)
{
if (cmm_timed_pages_target <= 0 || cmm_timeout_seconds <= 0) {
if (timer_pending(&cmm_timer))
- del_timer(&cmm_timer);
+ timer_delete(&cmm_timer);
return;
}
- mod_timer(&cmm_timer, jiffies + msecs_to_jiffies(cmm_timeout_seconds * MSEC_PER_SEC));
+ mod_timer(&cmm_timer, jiffies + secs_to_jiffies(cmm_timeout_seconds));
}
static void cmm_timer_fn(struct timer_list *unused)
@@ -332,7 +332,7 @@ static int cmm_timeout_handler(const struct ctl_table *ctl, int write,
return 0;
}
-static struct ctl_table cmm_table[] = {
+static const struct ctl_table cmm_table[] = {
{
.procname = "cmm_pages",
.mode = 0644,
@@ -424,7 +424,7 @@ out_smsg:
#endif
unregister_sysctl_table(cmm_sysctl_header);
out_sysctl:
- del_timer_sync(&cmm_timer);
+ timer_delete_sync(&cmm_timer);
return rc;
}
module_init(cmm_init);
@@ -437,7 +437,7 @@ static void __exit cmm_exit(void)
#endif
unregister_oom_notifier(&cmm_oom_nb);
kthread_stop(cmm_thread_ptr);
- del_timer_sync(&cmm_timer);
+ timer_delete_sync(&cmm_timer);
cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list);
cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list);
}
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
index fa54f3bc0c8d..ac604b176660 100644
--- a/arch/s390/mm/dump_pagetables.c
+++ b/arch/s390/mm/dump_pagetables.c
@@ -1,4 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/cpufeature.h>
#include <linux/set_memory.h>
#include <linux/ptdump.h>
#include <linux/seq_file.h>
@@ -82,7 +84,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr)
* in which case we have two lpswe instructions in lowcore that need
* to be executable.
*/
- if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)))
+ if (addr == PAGE_SIZE && (nospec_uses_trampoline() || !cpu_has_bear()))
return;
WARN_ONCE(IS_ENABLED(CONFIG_DEBUG_WX),
"s390/mm: Found insecure W+X mapping at address %pS\n",
@@ -145,11 +147,48 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
}
}
+static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte)
+{
+ note_page(pt_st, addr, 4, pte_val(pte));
+}
+
+static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd)
+{
+ note_page(pt_st, addr, 3, pmd_val(pmd));
+}
+
+static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud)
+{
+ note_page(pt_st, addr, 2, pud_val(pud));
+}
+
+static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d)
+{
+ note_page(pt_st, addr, 1, p4d_val(p4d));
+}
+
+static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd)
+{
+ note_page(pt_st, addr, 0, pgd_val(pgd));
+}
+
+static void note_page_flush(struct ptdump_state *pt_st)
+{
+ pte_t pte_zero = {0};
+
+ note_page(pt_st, 0, -1, pte_val(pte_zero));
+}
+
bool ptdump_check_wx(void)
{
struct pg_state st = {
.ptdump = {
- .note_page = note_page,
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
.range = (struct ptdump_range[]) {
{.start = 0, .end = max_addr},
{.start = 0, .end = 0},
@@ -167,7 +206,7 @@ bool ptdump_check_wx(void)
},
};
- if (!MACHINE_HAS_NX)
+ if (!cpu_has_nx())
return true;
ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
if (st.wx_pages) {
@@ -176,7 +215,7 @@ bool ptdump_check_wx(void)
return false;
} else {
pr_info("Checked W+X mappings: passed, no %sW+X pages found\n",
- (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) ?
+ (nospec_uses_trampoline() || !cpu_has_bear()) ?
"unexpected " : "");
return true;
@@ -188,7 +227,12 @@ static int ptdump_show(struct seq_file *m, void *v)
{
struct pg_state st = {
.ptdump = {
- .note_page = note_page,
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
.range = (struct ptdump_range[]) {
{.start = 0, .end = max_addr},
{.start = 0, .end = 0},
diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c
index 0a0738a473af..7498e858c401 100644
--- a/arch/s390/mm/extable.c
+++ b/arch/s390/mm/extable.c
@@ -7,6 +7,7 @@
#include <linux/panic.h>
#include <asm/asm-extable.h>
#include <asm/extable.h>
+#include <asm/fpu.h>
const struct exception_table_entry *s390_search_extables(unsigned long addr)
{
@@ -26,7 +27,7 @@ static bool ex_handler_fixup(const struct exception_table_entry *ex, struct pt_r
return true;
}
-static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct pt_regs *regs)
+static bool ex_handler_ua_fault(const struct exception_table_entry *ex, struct pt_regs *regs)
{
unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data);
@@ -35,18 +36,6 @@ static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct p
return true;
}
-static bool ex_handler_ua_load_mem(const struct exception_table_entry *ex, struct pt_regs *regs)
-{
- unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data);
- unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data);
- size_t len = FIELD_GET(EX_DATA_LEN, ex->data);
-
- regs->gprs[reg_err] = -EFAULT;
- memset((void *)regs->gprs[reg_addr], 0, len);
- regs->psw.addr = extable_fixup(ex);
- return true;
-}
-
static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex,
bool pair, struct pt_regs *regs)
{
@@ -77,6 +66,56 @@ static bool ex_handler_zeropad(const struct exception_table_entry *ex, struct pt
return true;
}
+static bool ex_handler_fpc(const struct exception_table_entry *ex, struct pt_regs *regs)
+{
+ fpu_sfpc(0);
+ regs->psw.addr = extable_fixup(ex);
+ return true;
+}
+
+struct insn_ssf {
+ u64 opc1 : 8;
+ u64 r3 : 4;
+ u64 opc2 : 4;
+ u64 b1 : 4;
+ u64 d1 : 12;
+ u64 b2 : 4;
+ u64 d2 : 12;
+} __packed;
+
+static bool ex_handler_ua_mvcos(const struct exception_table_entry *ex,
+ bool from, struct pt_regs *regs)
+{
+ unsigned long uaddr, remainder;
+ struct insn_ssf *insn;
+
+ /*
+ * If the faulting user space access crossed a page boundary retry by
+ * limiting the access to the first page (adjust length accordingly).
+ * Then the mvcos instruction will either complete with condition code
+ * zero, or generate another fault where the user space access did not
+ * cross a page boundary.
+ * If the faulting user space access did not cross a page boundary set
+ * length to zero and retry. In this case no user space access will
+ * happen, and the mvcos instruction will complete with condition code
+ * zero.
+ * In both cases the instruction will complete with condition code
+ * zero (copying finished), and the register which contains the
+ * length, indicates the number of bytes copied.
+ */
+ regs->psw.addr = extable_fixup(ex);
+ insn = (struct insn_ssf *)regs->psw.addr;
+ if (from)
+ uaddr = regs->gprs[insn->b2] + insn->d2;
+ else
+ uaddr = regs->gprs[insn->b1] + insn->d1;
+ remainder = PAGE_SIZE - (uaddr & (PAGE_SIZE - 1));
+ if (regs->gprs[insn->r3] <= remainder)
+ remainder = 0;
+ regs->gprs[insn->r3] = remainder;
+ return true;
+}
+
bool fixup_exception(struct pt_regs *regs)
{
const struct exception_table_entry *ex;
@@ -89,16 +128,20 @@ bool fixup_exception(struct pt_regs *regs)
return ex_handler_fixup(ex, regs);
case EX_TYPE_BPF:
return ex_handler_bpf(ex, regs);
- case EX_TYPE_UA_STORE:
- return ex_handler_ua_store(ex, regs);
- case EX_TYPE_UA_LOAD_MEM:
- return ex_handler_ua_load_mem(ex, regs);
+ case EX_TYPE_UA_FAULT:
+ return ex_handler_ua_fault(ex, regs);
case EX_TYPE_UA_LOAD_REG:
return ex_handler_ua_load_reg(ex, false, regs);
case EX_TYPE_UA_LOAD_REGPAIR:
return ex_handler_ua_load_reg(ex, true, regs);
case EX_TYPE_ZEROPAD:
return ex_handler_zeropad(ex, regs);
+ case EX_TYPE_FPC:
+ return ex_handler_fpc(ex, regs);
+ case EX_TYPE_UA_MVCOS_TO:
+ return ex_handler_ua_mvcos(ex, false, regs);
+ case EX_TYPE_UA_MVCOS_FROM:
+ return ex_handler_ua_mvcos(ex, true, regs);
}
panic("invalid exception table entry");
}
diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
index 4692136c0af1..f7da53e212f5 100644
--- a/arch/s390/mm/extmem.c
+++ b/arch/s390/mm/extmem.c
@@ -21,6 +21,7 @@
#include <linux/ioport.h>
#include <linux/refcount.h>
#include <linux/pgtable.h>
+#include <asm/machine.h>
#include <asm/diag.h>
#include <asm/page.h>
#include <asm/ebcdic.h>
@@ -255,7 +256,7 @@ segment_type (char* name)
int rc;
struct dcss_segment seg;
- if (!MACHINE_IS_VM)
+ if (!machine_is_vm())
return -ENOSYS;
dcss_mkname(name, seg.dcss_name);
@@ -418,7 +419,7 @@ segment_load (char *name, int do_nonshared, unsigned long *addr,
struct dcss_segment *seg;
int rc;
- if (!MACHINE_IS_VM)
+ if (!machine_is_vm())
return -ENOSYS;
mutex_lock(&dcss_lock);
@@ -529,6 +530,14 @@ segment_modify_shared (char *name, int do_nonshared)
return rc;
}
+static void __dcss_diag_purge_on_cpu_0(void *data)
+{
+ struct dcss_segment *seg = (struct dcss_segment *)data;
+ unsigned long dummy;
+
+ dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy);
+}
+
/*
* Decrease the use count of a DCSS segment and remove
* it from the address space if nobody is using it
@@ -537,10 +546,9 @@ segment_modify_shared (char *name, int do_nonshared)
void
segment_unload(char *name)
{
- unsigned long dummy;
struct dcss_segment *seg;
- if (!MACHINE_IS_VM)
+ if (!machine_is_vm())
return;
mutex_lock(&dcss_lock);
@@ -555,7 +563,14 @@ segment_unload(char *name)
kfree(seg->res);
vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
list_del(&seg->list);
- dcss_diag(&purgeseg_scode, seg->dcss_name, &dummy, &dummy);
+ /*
+ * Workaround for z/VM issue, where calling the DCSS unload diag on
+ * a non-IPL CPU would cause bogus sclp maximum memory detection on
+ * next IPL.
+ * IPL CPU 0 cannot be set offline, so the dcss_diag() call can
+ * directly be scheduled to that CPU.
+ */
+ smp_call_function_single(0, __dcss_diag_purge_on_cpu_0, seg, 1);
kfree(seg);
out_unlock:
mutex_unlock(&dcss_lock);
@@ -572,7 +587,7 @@ segment_save(char *name)
char cmd2[80];
int i, response;
- if (!MACHINE_IS_VM)
+ if (!machine_is_vm())
return;
mutex_lock(&dcss_lock);
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 9b681f74dccc..e1ad05bfd28a 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -11,11 +11,11 @@
#include <linux/kernel_stat.h>
#include <linux/mmu_context.h>
+#include <linux/cpufeature.h>
#include <linux/perf_event.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
-#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
@@ -40,22 +40,11 @@
#include <asm/ptrace.h>
#include <asm/fault.h>
#include <asm/diag.h>
-#include <asm/gmap.h>
#include <asm/irq.h>
#include <asm/facility.h>
#include <asm/uv.h>
#include "../kernel/entry.h"
-static DEFINE_STATIC_KEY_FALSE(have_store_indication);
-
-static int __init fault_init(void)
-{
- if (test_facility(75))
- static_branch_enable(&have_store_indication);
- return 0;
-}
-early_initcall(fault_init);
-
/*
* Find out which address space caused the exception.
*/
@@ -81,7 +70,7 @@ static __always_inline bool fault_is_write(struct pt_regs *regs)
{
union teid teid = { .val = regs->int_parm_long };
- if (static_branch_likely(&have_store_indication))
+ if (test_facility(75))
return teid.fsi == TEID_FSI_STORE;
return false;
}
@@ -175,6 +164,23 @@ static void dump_fault_info(struct pt_regs *regs)
int show_unhandled_signals = 1;
+static const struct ctl_table s390_fault_sysctl_table[] = {
+ {
+ .procname = "userprocess_debug",
+ .data = &show_unhandled_signals,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+};
+
+static int __init init_s390_fault_sysctls(void)
+{
+ register_sysctl_init("kernel", s390_fault_sysctl_table);
+ return 0;
+}
+arch_initcall(init_s390_fault_sysctls);
+
void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault)
{
static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
@@ -369,6 +375,7 @@ void do_protection_exception(struct pt_regs *regs)
if (unlikely(!teid.b61)) {
if (user_mode(regs)) {
/* Low-address protection in user mode: cannot happen */
+ dump_fault_info(regs);
die(regs, "Low-address protection");
}
/*
@@ -377,7 +384,7 @@ void do_protection_exception(struct pt_regs *regs)
*/
return handle_fault_error_nolock(regs, 0);
}
- if (unlikely(MACHINE_HAS_NX && teid.b56)) {
+ if (unlikely(cpu_has_nx() && teid.b56)) {
regs->int_parm_long = (teid.addr * PAGE_SIZE) | (regs->psw.addr & PAGE_MASK);
return handle_fault_error_nolock(regs, SEGV_ACCERR);
}
@@ -434,6 +441,8 @@ void do_secure_storage_access(struct pt_regs *regs)
if (rc)
BUG();
} else {
+ if (faulthandler_disabled())
+ return handle_fault_error_nolock(regs, 0);
mm = current->mm;
mmap_read_lock(mm);
vma = find_vma(mm, addr);
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 16b8a36c56de..012a4366a2ad 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -8,6 +8,7 @@
* Janosch Frank <frankja@linux.vnet.ibm.com>
*/
+#include <linux/cpufeature.h>
#include <linux/kernel.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
@@ -20,9 +21,20 @@
#include <linux/pgtable.h>
#include <asm/page-states.h>
#include <asm/pgalloc.h>
+#include <asm/machine.h>
+#include <asm/gmap_helpers.h>
#include <asm/gmap.h>
#include <asm/page.h>
-#include <asm/tlb.h>
+
+/*
+ * The address is saved in a radix tree directly; NULL would be ambiguous,
+ * since 0 is a valid address, and NULL is returned when nothing was found.
+ * The lower bits are ignored by all users of the macro, so it can be used
+ * to distinguish a valid address 0 from a NULL.
+ */
+#define VALID_GADDR_FLAG 1
+#define IS_GADDR_VALID(gaddr) ((gaddr) & VALID_GADDR_FLAG)
+#define MAKE_VALID_GADDR(gaddr) (((gaddr) & HPAGE_MASK) | VALID_GADDR_FLAG)
#define GMAP_SHADOW_FAKE_TABLE 1ULL
@@ -43,7 +55,7 @@ static struct page *gmap_alloc_crst(void)
*
* Returns a guest address space structure.
*/
-static struct gmap *gmap_alloc(unsigned long limit)
+struct gmap *gmap_alloc(unsigned long limit)
{
struct gmap *gmap;
struct page *page;
@@ -70,9 +82,7 @@ static struct gmap *gmap_alloc(unsigned long limit)
gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT);
if (!gmap)
goto out;
- INIT_LIST_HEAD(&gmap->crst_list);
INIT_LIST_HEAD(&gmap->children);
- INIT_LIST_HEAD(&gmap->pt_list);
INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT);
INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT);
INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT);
@@ -82,8 +92,6 @@ static struct gmap *gmap_alloc(unsigned long limit)
page = gmap_alloc_crst();
if (!page)
goto out_free;
- page->index = 0;
- list_add(&page->lru, &gmap->crst_list);
table = page_to_virt(page);
crst_table_init(table, etype);
gmap->table = table;
@@ -97,6 +105,7 @@ out_free:
out:
return NULL;
}
+EXPORT_SYMBOL_GPL(gmap_alloc);
/**
* gmap_create - create a guest address space
@@ -128,7 +137,7 @@ EXPORT_SYMBOL_GPL(gmap_create);
static void gmap_flush_tlb(struct gmap *gmap)
{
- if (MACHINE_HAS_IDTE)
+ if (cpu_has_idte())
__tlb_flush_idte(gmap->asce);
else
__tlb_flush_global();
@@ -185,32 +194,46 @@ static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
} while (nr > 0);
}
+static void gmap_free_crst(unsigned long *table, bool free_ptes)
+{
+ bool is_segment = (table[0] & _SEGMENT_ENTRY_TYPE_MASK) == 0;
+ int i;
+
+ if (is_segment) {
+ if (!free_ptes)
+ goto out;
+ for (i = 0; i < _CRST_ENTRIES; i++)
+ if (!(table[i] & _SEGMENT_ENTRY_INVALID))
+ page_table_free_pgste(page_ptdesc(phys_to_page(table[i])));
+ } else {
+ for (i = 0; i < _CRST_ENTRIES; i++)
+ if (!(table[i] & _REGION_ENTRY_INVALID))
+ gmap_free_crst(__va(table[i] & PAGE_MASK), free_ptes);
+ }
+
+out:
+ free_pages((unsigned long)table, CRST_ALLOC_ORDER);
+}
+
/**
* gmap_free - free a guest address space
* @gmap: pointer to the guest address space structure
*
* No locks required. There are no references to this gmap anymore.
*/
-static void gmap_free(struct gmap *gmap)
+void gmap_free(struct gmap *gmap)
{
- struct page *page, *next;
-
/* Flush tlb of all gmaps (if not already done for shadows) */
if (!(gmap_is_shadow(gmap) && gmap->removed))
gmap_flush_tlb(gmap);
/* Free all segment & region tables. */
- list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
- __free_pages(page, CRST_ALLOC_ORDER);
+ gmap_free_crst(gmap->table, gmap_is_shadow(gmap));
+
gmap_radix_tree_free(&gmap->guest_to_host);
gmap_radix_tree_free(&gmap->host_to_guest);
/* Free additional data for a shadow gmap */
if (gmap_is_shadow(gmap)) {
- struct ptdesc *ptdesc, *n;
-
- /* Free all page tables. */
- list_for_each_entry_safe(ptdesc, n, &gmap->pt_list, pt_list)
- page_table_free_pgste(ptdesc);
gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
/* Release reference to the parent */
gmap_put(gmap->parent);
@@ -218,6 +241,7 @@ static void gmap_free(struct gmap *gmap)
kfree(gmap);
}
+EXPORT_SYMBOL_GPL(gmap_free);
/**
* gmap_get - increase reference counter for guest address space
@@ -298,10 +322,8 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
crst_table_init(new, init);
spin_lock(&gmap->guest_table_lock);
if (*table & _REGION_ENTRY_INVALID) {
- list_add(&page->lru, &gmap->crst_list);
*table = __pa(new) | _REGION_ENTRY_LENGTH |
(*table & _REGION_ENTRY_TYPE_MASK);
- page->index = gaddr;
page = NULL;
}
spin_unlock(&gmap->guest_table_lock);
@@ -310,21 +332,23 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
return 0;
}
-/**
- * __gmap_segment_gaddr - find virtual address from segment pointer
- * @entry: pointer to a segment table entry in the guest address space
- *
- * Returns the virtual address in the guest address space for the segment
- */
-static unsigned long __gmap_segment_gaddr(unsigned long *entry)
+static unsigned long host_to_guest_lookup(struct gmap *gmap, unsigned long vmaddr)
{
- struct page *page;
- unsigned long offset;
+ return (unsigned long)radix_tree_lookup(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
+}
+
+static unsigned long host_to_guest_delete(struct gmap *gmap, unsigned long vmaddr)
+{
+ return (unsigned long)radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
+}
- offset = (unsigned long) entry / sizeof(unsigned long);
- offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
- page = pmd_pgtable_page((pmd_t *) entry);
- return page->index + offset;
+static pmd_t *host_to_guest_pmd_delete(struct gmap *gmap, unsigned long vmaddr,
+ unsigned long *gaddr)
+{
+ *gaddr = host_to_guest_delete(gmap, vmaddr);
+ if (IS_GADDR_VALID(*gaddr))
+ return (pmd_t *)gmap_table_walk(gmap, *gaddr, 1);
+ return NULL;
}
/**
@@ -336,16 +360,19 @@ static unsigned long __gmap_segment_gaddr(unsigned long *entry)
*/
static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
{
- unsigned long *entry;
+ unsigned long gaddr;
int flush = 0;
+ pmd_t *pmdp;
BUG_ON(gmap_is_shadow(gmap));
spin_lock(&gmap->guest_table_lock);
- entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
- if (entry) {
- flush = (*entry != _SEGMENT_ENTRY_EMPTY);
- *entry = _SEGMENT_ENTRY_EMPTY;
+
+ pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
+ if (pmdp) {
+ flush = (pmd_val(*pmdp) != _SEGMENT_ENTRY_EMPTY);
+ *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
}
+
spin_unlock(&gmap->guest_table_lock);
return flush;
}
@@ -464,26 +491,6 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
EXPORT_SYMBOL_GPL(__gmap_translate);
/**
- * gmap_translate - translate a guest address to a user space address
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: guest address
- *
- * Returns user space address which corresponds to the guest address or
- * -EFAULT if no such mapping exists.
- * This function does not establish potentially missing page table entries.
- */
-unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
-{
- unsigned long rc;
-
- mmap_read_lock(gmap->mm);
- rc = __gmap_translate(gmap, gaddr);
- mmap_read_unlock(gmap->mm);
- return rc;
-}
-EXPORT_SYMBOL_GPL(gmap_translate);
-
-/**
* gmap_unlink - disconnect a page table from the gmap shadow tables
* @mm: pointer to the parent mm_struct
* @table: pointer to the host page table
@@ -582,7 +589,8 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
spin_lock(&gmap->guest_table_lock);
if (*table == _SEGMENT_ENTRY_EMPTY) {
rc = radix_tree_insert(&gmap->host_to_guest,
- vmaddr >> PMD_SHIFT, table);
+ vmaddr >> PMD_SHIFT,
+ (void *)MAKE_VALID_GADDR(gaddr));
if (!rc) {
if (pmd_leaf(*pmd)) {
*table = (pmd_val(*pmd) &
@@ -605,193 +613,27 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
radix_tree_preload_end();
return rc;
}
-
-/**
- * fixup_user_fault_nowait - manually resolve a user page fault without waiting
- * @mm: mm_struct of target mm
- * @address: user address
- * @fault_flags:flags to pass down to handle_mm_fault()
- * @unlocked: did we unlock the mmap_lock while retrying
- *
- * This function behaves similarly to fixup_user_fault(), but it guarantees
- * that the fault will be resolved without waiting. The function might drop
- * and re-acquire the mm lock, in which case @unlocked will be set to true.
- *
- * The guarantee is that the fault is handled without waiting, but the
- * function itself might sleep, due to the lock.
- *
- * Context: Needs to be called with mm->mmap_lock held in read mode, and will
- * return with the lock held in read mode; @unlocked will indicate whether
- * the lock has been dropped and re-acquired. This is the same behaviour as
- * fixup_user_fault().
- *
- * Return: 0 on success, -EAGAIN if the fault cannot be resolved without
- * waiting, -EFAULT if the fault cannot be resolved, -ENOMEM if out of
- * memory.
- */
-static int fixup_user_fault_nowait(struct mm_struct *mm, unsigned long address,
- unsigned int fault_flags, bool *unlocked)
-{
- struct vm_area_struct *vma;
- unsigned int test_flags;
- vm_fault_t fault;
- int rc;
-
- fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
- test_flags = fault_flags & FAULT_FLAG_WRITE ? VM_WRITE : VM_READ;
-
- vma = find_vma(mm, address);
- if (unlikely(!vma || address < vma->vm_start))
- return -EFAULT;
- if (unlikely(!(vma->vm_flags & test_flags)))
- return -EFAULT;
-
- fault = handle_mm_fault(vma, address, fault_flags, NULL);
- /* the mm lock has been dropped, take it again */
- if (fault & VM_FAULT_COMPLETED) {
- *unlocked = true;
- mmap_read_lock(mm);
- return 0;
- }
- /* the mm lock has not been dropped */
- if (fault & VM_FAULT_ERROR) {
- rc = vm_fault_to_errno(fault, 0);
- BUG_ON(!rc);
- return rc;
- }
- /* the mm lock has not been dropped because of FAULT_FLAG_RETRY_NOWAIT */
- if (fault & VM_FAULT_RETRY)
- return -EAGAIN;
- /* nothing needed to be done and the mm lock has not been dropped */
- return 0;
-}
-
-/**
- * __gmap_fault - resolve a fault on a guest address
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: guest address
- * @fault_flags: flags to pass down to handle_mm_fault()
- *
- * Context: Needs to be called with mm->mmap_lock held in read mode. Might
- * drop and re-acquire the lock. Will always return with the lock held.
- */
-static int __gmap_fault(struct gmap *gmap, unsigned long gaddr, unsigned int fault_flags)
-{
- unsigned long vmaddr;
- bool unlocked;
- int rc = 0;
-
-retry:
- unlocked = false;
-
- vmaddr = __gmap_translate(gmap, gaddr);
- if (IS_ERR_VALUE(vmaddr))
- return vmaddr;
-
- if (fault_flags & FAULT_FLAG_RETRY_NOWAIT)
- rc = fixup_user_fault_nowait(gmap->mm, vmaddr, fault_flags, &unlocked);
- else
- rc = fixup_user_fault(gmap->mm, vmaddr, fault_flags, &unlocked);
- if (rc)
- return rc;
- /*
- * In the case that fixup_user_fault unlocked the mmap_lock during
- * fault-in, redo __gmap_translate() to avoid racing with a
- * map/unmap_segment.
- * In particular, __gmap_translate(), fixup_user_fault{,_nowait}(),
- * and __gmap_link() must all be called atomically in one go; if the
- * lock had been dropped in between, a retry is needed.
- */
- if (unlocked)
- goto retry;
-
- return __gmap_link(gmap, gaddr, vmaddr);
-}
-
-/**
- * gmap_fault - resolve a fault on a guest address
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: guest address
- * @fault_flags: flags to pass down to handle_mm_fault()
- *
- * Returns 0 on success, -ENOMEM for out of memory conditions, -EFAULT if the
- * vm address is already mapped to a different guest segment, and -EAGAIN if
- * FAULT_FLAG_RETRY_NOWAIT was specified and the fault could not be processed
- * immediately.
- */
-int gmap_fault(struct gmap *gmap, unsigned long gaddr, unsigned int fault_flags)
-{
- int rc;
-
- mmap_read_lock(gmap->mm);
- rc = __gmap_fault(gmap, gaddr, fault_flags);
- mmap_read_unlock(gmap->mm);
- return rc;
-}
-EXPORT_SYMBOL_GPL(gmap_fault);
+EXPORT_SYMBOL(__gmap_link);
/*
* this function is assumed to be called with mmap_lock held
*/
void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
{
- struct vm_area_struct *vma;
unsigned long vmaddr;
- spinlock_t *ptl;
- pte_t *ptep;
+
+ mmap_assert_locked(gmap->mm);
/* Find the vm address for the guest address */
vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
gaddr >> PMD_SHIFT);
if (vmaddr) {
vmaddr |= gaddr & ~PMD_MASK;
-
- vma = vma_lookup(gmap->mm, vmaddr);
- if (!vma || is_vm_hugetlb_page(vma))
- return;
-
- /* Get pointer to the page table entry */
- ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
- if (likely(ptep)) {
- ptep_zap_unused(gmap->mm, vmaddr, ptep, 0);
- pte_unmap_unlock(ptep, ptl);
- }
+ gmap_helper_zap_one_page(gmap->mm, vmaddr);
}
}
EXPORT_SYMBOL_GPL(__gmap_zap);
-void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
-{
- unsigned long gaddr, vmaddr, size;
- struct vm_area_struct *vma;
-
- mmap_read_lock(gmap->mm);
- for (gaddr = from; gaddr < to;
- gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
- /* Find the vm address for the guest address */
- vmaddr = (unsigned long)
- radix_tree_lookup(&gmap->guest_to_host,
- gaddr >> PMD_SHIFT);
- if (!vmaddr)
- continue;
- vmaddr |= gaddr & ~PMD_MASK;
- /* Find vma in the parent mm */
- vma = find_vma(gmap->mm, vmaddr);
- if (!vma)
- continue;
- /*
- * We do not discard pages that are backed by
- * hugetlbfs, so we don't have to refault them.
- */
- if (is_vm_hugetlb_page(vma))
- continue;
- size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
- zap_page_range_single(vma, vmaddr, size, NULL);
- }
- mmap_read_unlock(gmap->mm);
-}
-EXPORT_SYMBOL_GPL(gmap_discard);
-
static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);
@@ -853,8 +695,7 @@ static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
*
* Note: Can also be called for shadow gmaps.
*/
-static inline unsigned long *gmap_table_walk(struct gmap *gmap,
- unsigned long gaddr, int level)
+unsigned long *gmap_table_walk(struct gmap *gmap, unsigned long gaddr, int level)
{
const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
unsigned long *table = gmap->table;
@@ -905,6 +746,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap,
}
return table;
}
+EXPORT_SYMBOL(gmap_table_walk);
/**
* gmap_pte_op_walk - walk the gmap page table, get the page table lock
@@ -1101,86 +943,40 @@ static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
* @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
* @bits: pgste notification bits to set
*
- * Returns 0 if successfully protected, -ENOMEM if out of memory and
- * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
+ * Returns:
+ * PAGE_SIZE if a small page was successfully protected;
+ * HPAGE_SIZE if a large page was successfully protected;
+ * -ENOMEM if out of memory;
+ * -EFAULT if gaddr is invalid (or mapping for shadows is missing);
+ * -EAGAIN if the guest mapping is missing and should be fixed by the caller.
*
- * Called with sg->mm->mmap_lock in read.
+ * Context: Called with sg->mm->mmap_lock in read.
*/
-static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
- unsigned long len, int prot, unsigned long bits)
+int gmap_protect_one(struct gmap *gmap, unsigned long gaddr, int prot, unsigned long bits)
{
- unsigned long vmaddr, dist;
pmd_t *pmdp;
- int rc;
+ int rc = 0;
BUG_ON(gmap_is_shadow(gmap));
- while (len) {
- rc = -EAGAIN;
- pmdp = gmap_pmd_op_walk(gmap, gaddr);
- if (pmdp) {
- if (!pmd_leaf(*pmdp)) {
- rc = gmap_protect_pte(gmap, gaddr, pmdp, prot,
- bits);
- if (!rc) {
- len -= PAGE_SIZE;
- gaddr += PAGE_SIZE;
- }
- } else {
- rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot,
- bits);
- if (!rc) {
- dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK);
- len = len < dist ? 0 : len - dist;
- gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE;
- }
- }
- gmap_pmd_op_end(gmap, pmdp);
- }
- if (rc) {
- if (rc == -EINVAL)
- return rc;
- /* -EAGAIN, fixup of userspace mm and gmap */
- vmaddr = __gmap_translate(gmap, gaddr);
- if (IS_ERR_VALUE(vmaddr))
- return vmaddr;
- rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
- if (rc)
- return rc;
- }
- }
- return 0;
-}
+ pmdp = gmap_pmd_op_walk(gmap, gaddr);
+ if (!pmdp)
+ return -EAGAIN;
-/**
- * gmap_mprotect_notify - change access rights for a range of ptes and
- * call the notifier if any pte changes again
- * @gmap: pointer to guest mapping meta data structure
- * @gaddr: virtual address in the guest address space
- * @len: size of area
- * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
- *
- * Returns 0 if for each page in the given range a gmap mapping exists,
- * the new access rights could be set and the notifier could be armed.
- * If the gmap mapping is missing for one or more pages -EFAULT is
- * returned. If no memory could be allocated -ENOMEM is returned.
- * This function establishes missing page table entries.
- */
-int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
- unsigned long len, int prot)
-{
- int rc;
+ if (!pmd_leaf(*pmdp)) {
+ rc = gmap_protect_pte(gmap, gaddr, pmdp, prot, bits);
+ if (!rc)
+ rc = PAGE_SIZE;
+ } else {
+ rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot, bits);
+ if (!rc)
+ rc = HPAGE_SIZE;
+ }
+ gmap_pmd_op_end(gmap, pmdp);
- if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
- return -EINVAL;
- if (!MACHINE_HAS_ESOP && prot == PROT_READ)
- return -EINVAL;
- mmap_read_lock(gmap->mm);
- rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT);
- mmap_read_unlock(gmap->mm);
return rc;
}
-EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
+EXPORT_SYMBOL_GPL(gmap_protect_one);
/**
* gmap_read_table - get an unsigned long value from a guest page table using
@@ -1414,7 +1210,6 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
__gmap_unshadow_pgt(sg, raddr, __va(pgt));
/* Free page table */
ptdesc = page_ptdesc(phys_to_page(pgt));
- list_del(&ptdesc->pt_list);
page_table_free_pgste(ptdesc);
}
@@ -1442,7 +1237,6 @@ static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
__gmap_unshadow_pgt(sg, raddr, __va(pgt));
/* Free page table */
ptdesc = page_ptdesc(phys_to_page(pgt));
- list_del(&ptdesc->pt_list);
page_table_free_pgste(ptdesc);
}
}
@@ -1472,7 +1266,6 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
__gmap_unshadow_sgt(sg, raddr, __va(sgt));
/* Free segment table */
page = phys_to_page(sgt);
- list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1500,7 +1293,6 @@ static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
__gmap_unshadow_sgt(sg, raddr, __va(sgt));
/* Free segment table */
page = phys_to_page(sgt);
- list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
}
@@ -1530,7 +1322,6 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
__gmap_unshadow_r3t(sg, raddr, __va(r3t));
/* Free region 3 table */
page = phys_to_page(r3t);
- list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1558,7 +1349,6 @@ static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
__gmap_unshadow_r3t(sg, raddr, __va(r3t));
/* Free region 3 table */
page = phys_to_page(r3t);
- list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
}
@@ -1588,7 +1378,6 @@ static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
__gmap_unshadow_r2t(sg, raddr, __va(r2t));
/* Free region 2 table */
page = phys_to_page(r2t);
- list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
@@ -1620,7 +1409,6 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
r1t[i] = _REGION1_ENTRY_EMPTY;
/* Free region 2 table */
page = phys_to_page(r2t);
- list_del(&page->lru);
__free_pages(page, CRST_ALLOC_ORDER);
}
}
@@ -1631,7 +1419,7 @@ static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
*
* Called with sg->guest_table_lock
*/
-static void gmap_unshadow(struct gmap *sg)
+void gmap_unshadow(struct gmap *sg)
{
unsigned long *table;
@@ -1657,143 +1445,7 @@ static void gmap_unshadow(struct gmap *sg)
break;
}
}
-
-/**
- * gmap_find_shadow - find a specific asce in the list of shadow tables
- * @parent: pointer to the parent gmap
- * @asce: ASCE for which the shadow table is created
- * @edat_level: edat level to be used for the shadow translation
- *
- * Returns the pointer to a gmap if a shadow table with the given asce is
- * already available, ERR_PTR(-EAGAIN) if another one is just being created,
- * otherwise NULL
- */
-static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
- int edat_level)
-{
- struct gmap *sg;
-
- list_for_each_entry(sg, &parent->children, list) {
- if (sg->orig_asce != asce || sg->edat_level != edat_level ||
- sg->removed)
- continue;
- if (!sg->initialized)
- return ERR_PTR(-EAGAIN);
- refcount_inc(&sg->ref_count);
- return sg;
- }
- return NULL;
-}
-
-/**
- * gmap_shadow_valid - check if a shadow guest address space matches the
- * given properties and is still valid
- * @sg: pointer to the shadow guest address space structure
- * @asce: ASCE for which the shadow table is requested
- * @edat_level: edat level to be used for the shadow translation
- *
- * Returns 1 if the gmap shadow is still valid and matches the given
- * properties, the caller can continue using it. Returns 0 otherwise, the
- * caller has to request a new shadow gmap in this case.
- *
- */
-int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
-{
- if (sg->removed)
- return 0;
- return sg->orig_asce == asce && sg->edat_level == edat_level;
-}
-EXPORT_SYMBOL_GPL(gmap_shadow_valid);
-
-/**
- * gmap_shadow - create/find a shadow guest address space
- * @parent: pointer to the parent gmap
- * @asce: ASCE for which the shadow table is created
- * @edat_level: edat level to be used for the shadow translation
- *
- * The pages of the top level page table referred by the asce parameter
- * will be set to read-only and marked in the PGSTEs of the kvm process.
- * The shadow table will be removed automatically on any change to the
- * PTE mapping for the source table.
- *
- * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
- * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
- * parent gmap table could not be protected.
- */
-struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
- int edat_level)
-{
- struct gmap *sg, *new;
- unsigned long limit;
- int rc;
-
- BUG_ON(parent->mm->context.allow_gmap_hpage_1m);
- BUG_ON(gmap_is_shadow(parent));
- spin_lock(&parent->shadow_lock);
- sg = gmap_find_shadow(parent, asce, edat_level);
- spin_unlock(&parent->shadow_lock);
- if (sg)
- return sg;
- /* Create a new shadow gmap */
- limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
- if (asce & _ASCE_REAL_SPACE)
- limit = -1UL;
- new = gmap_alloc(limit);
- if (!new)
- return ERR_PTR(-ENOMEM);
- new->mm = parent->mm;
- new->parent = gmap_get(parent);
- new->private = parent->private;
- new->orig_asce = asce;
- new->edat_level = edat_level;
- new->initialized = false;
- spin_lock(&parent->shadow_lock);
- /* Recheck if another CPU created the same shadow */
- sg = gmap_find_shadow(parent, asce, edat_level);
- if (sg) {
- spin_unlock(&parent->shadow_lock);
- gmap_free(new);
- return sg;
- }
- if (asce & _ASCE_REAL_SPACE) {
- /* only allow one real-space gmap shadow */
- list_for_each_entry(sg, &parent->children, list) {
- if (sg->orig_asce & _ASCE_REAL_SPACE) {
- spin_lock(&sg->guest_table_lock);
- gmap_unshadow(sg);
- spin_unlock(&sg->guest_table_lock);
- list_del(&sg->list);
- gmap_put(sg);
- break;
- }
- }
- }
- refcount_set(&new->ref_count, 2);
- list_add(&new->list, &parent->children);
- if (asce & _ASCE_REAL_SPACE) {
- /* nothing to protect, return right away */
- new->initialized = true;
- spin_unlock(&parent->shadow_lock);
- return new;
- }
- spin_unlock(&parent->shadow_lock);
- /* protect after insertion, so it will get properly invalidated */
- mmap_read_lock(parent->mm);
- rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
- ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE,
- PROT_READ, GMAP_NOTIFY_SHADOW);
- mmap_read_unlock(parent->mm);
- spin_lock(&parent->shadow_lock);
- new->initialized = true;
- if (rc) {
- list_del(&new->list);
- gmap_free(new);
- new = ERR_PTR(rc);
- }
- spin_unlock(&parent->shadow_lock);
- return new;
-}
-EXPORT_SYMBOL_GPL(gmap_shadow);
+EXPORT_SYMBOL(gmap_unshadow);
/**
* gmap_shadow_r2t - create an empty shadow region 2 table
@@ -1827,9 +1479,6 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
page = gmap_alloc_crst();
if (!page)
return -ENOMEM;
- page->index = r2t & _REGION_ENTRY_ORIGIN;
- if (fake)
- page->index |= GMAP_SHADOW_FAKE_TABLE;
s_r2t = page_to_phys(page);
/* Install shadow region second table */
spin_lock(&sg->guest_table_lock);
@@ -1851,7 +1500,6 @@ int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
if (sg->edat_level >= 1)
*table |= (r2t & _REGION_ENTRY_PROTECT);
- list_add(&page->lru, &sg->crst_list);
if (fake) {
/* nothing to protect for fake tables */
*table &= ~_REGION_ENTRY_INVALID;
@@ -1911,9 +1559,6 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
page = gmap_alloc_crst();
if (!page)
return -ENOMEM;
- page->index = r3t & _REGION_ENTRY_ORIGIN;
- if (fake)
- page->index |= GMAP_SHADOW_FAKE_TABLE;
s_r3t = page_to_phys(page);
/* Install shadow region second table */
spin_lock(&sg->guest_table_lock);
@@ -1935,7 +1580,6 @@ int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
if (sg->edat_level >= 1)
*table |= (r3t & _REGION_ENTRY_PROTECT);
- list_add(&page->lru, &sg->crst_list);
if (fake) {
/* nothing to protect for fake tables */
*table &= ~_REGION_ENTRY_INVALID;
@@ -1995,9 +1639,6 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
page = gmap_alloc_crst();
if (!page)
return -ENOMEM;
- page->index = sgt & _REGION_ENTRY_ORIGIN;
- if (fake)
- page->index |= GMAP_SHADOW_FAKE_TABLE;
s_sgt = page_to_phys(page);
/* Install shadow region second table */
spin_lock(&sg->guest_table_lock);
@@ -2019,7 +1660,6 @@ int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
if (sg->edat_level >= 1)
*table |= sgt & _REGION_ENTRY_PROTECT;
- list_add(&page->lru, &sg->crst_list);
if (fake) {
/* nothing to protect for fake tables */
*table &= ~_REGION_ENTRY_INVALID;
@@ -2052,45 +1692,22 @@ out_free:
}
EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
-/**
- * gmap_shadow_pgt_lookup - find a shadow page table
- * @sg: pointer to the shadow guest address space structure
- * @saddr: the address in the shadow aguest address space
- * @pgt: parent gmap address of the page table to get shadowed
- * @dat_protection: if the pgtable is marked as protected by dat
- * @fake: pgt references contiguous guest memory block, not a pgtable
- *
- * Returns 0 if the shadow page table was found and -EAGAIN if the page
- * table was not found.
- *
- * Called with sg->mm->mmap_lock in read.
- */
-int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
- unsigned long *pgt, int *dat_protection,
- int *fake)
+static void gmap_pgste_set_pgt_addr(struct ptdesc *ptdesc, unsigned long pgt_addr)
{
- unsigned long *table;
- struct page *page;
- int rc;
+ unsigned long *pgstes = page_to_virt(ptdesc_page(ptdesc));
- BUG_ON(!gmap_is_shadow(sg));
- spin_lock(&sg->guest_table_lock);
- table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
- if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
- /* Shadow page tables are full pages (pte+pgste) */
- page = pfn_to_page(*table >> PAGE_SHIFT);
- *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
- *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
- *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
- rc = 0;
- } else {
- rc = -EAGAIN;
- }
- spin_unlock(&sg->guest_table_lock);
- return rc;
+ pgstes += _PAGE_ENTRIES;
+
+ pgstes[0] &= ~PGSTE_ST2_MASK;
+ pgstes[1] &= ~PGSTE_ST2_MASK;
+ pgstes[2] &= ~PGSTE_ST2_MASK;
+ pgstes[3] &= ~PGSTE_ST2_MASK;
+ pgstes[0] |= (pgt_addr >> 16) & PGSTE_ST2_MASK;
+ pgstes[1] |= pgt_addr & PGSTE_ST2_MASK;
+ pgstes[2] |= (pgt_addr << 16) & PGSTE_ST2_MASK;
+ pgstes[3] |= (pgt_addr << 32) & PGSTE_ST2_MASK;
}
-EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
/**
* gmap_shadow_pgt - instantiate a shadow page table
@@ -2119,9 +1736,10 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
ptdesc = page_table_alloc_pgste(sg->mm);
if (!ptdesc)
return -ENOMEM;
- ptdesc->pt_index = pgt & _SEGMENT_ENTRY_ORIGIN;
+ origin = pgt & _SEGMENT_ENTRY_ORIGIN;
if (fake)
- ptdesc->pt_index |= GMAP_SHADOW_FAKE_TABLE;
+ origin |= GMAP_SHADOW_FAKE_TABLE;
+ gmap_pgste_set_pgt_addr(ptdesc, origin);
s_pgt = page_to_phys(ptdesc_page(ptdesc));
/* Install shadow page table */
spin_lock(&sg->guest_table_lock);
@@ -2140,7 +1758,6 @@ int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
/* mark as invalid as long as the parent table is not protected */
*table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
(pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
- list_add(&ptdesc->pt_list, &sg->pt_list);
if (fake) {
/* nothing to protect for fake tables */
*table &= ~_SEGMENT_ENTRY_INVALID;
@@ -2318,7 +1935,6 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
pte_t *pte, unsigned long bits)
{
unsigned long offset, gaddr = 0;
- unsigned long *table;
struct gmap *gmap, *sg, *next;
offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
@@ -2326,12 +1942,9 @@ void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
rcu_read_lock();
list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
spin_lock(&gmap->guest_table_lock);
- table = radix_tree_lookup(&gmap->host_to_guest,
- vmaddr >> PMD_SHIFT);
- if (table)
- gaddr = __gmap_segment_gaddr(table) + offset;
+ gaddr = host_to_guest_lookup(gmap, vmaddr) + offset;
spin_unlock(&gmap->guest_table_lock);
- if (!table)
+ if (!IS_GADDR_VALID(gaddr))
continue;
if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
@@ -2371,10 +1984,10 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
gaddr &= HPAGE_MASK;
pmdp_notify_gmap(gmap, pmdp, gaddr);
new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN));
- if (MACHINE_HAS_TLB_GUEST)
+ if (machine_has_tlb_guest())
__pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
IDTE_GLOBAL);
- else if (MACHINE_HAS_IDTE)
+ else if (cpu_has_idte())
__pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
else
__pmdp_csp(pmdp);
@@ -2391,10 +2004,8 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
rcu_read_lock();
list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
spin_lock(&gmap->guest_table_lock);
- pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest,
- vmaddr >> PMD_SHIFT);
+ pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
if (pmdp) {
- gaddr = __gmap_segment_gaddr((unsigned long *)pmdp);
pmdp_notify_gmap(gmap, pmdp, gaddr);
WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
_SEGMENT_ENTRY_GMAP_UC |
@@ -2438,28 +2049,25 @@ EXPORT_SYMBOL_GPL(gmap_pmdp_csp);
*/
void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
{
- unsigned long *entry, gaddr;
+ unsigned long gaddr;
struct gmap *gmap;
pmd_t *pmdp;
rcu_read_lock();
list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
spin_lock(&gmap->guest_table_lock);
- entry = radix_tree_delete(&gmap->host_to_guest,
- vmaddr >> PMD_SHIFT);
- if (entry) {
- pmdp = (pmd_t *)entry;
- gaddr = __gmap_segment_gaddr(entry);
+ pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
+ if (pmdp) {
pmdp_notify_gmap(gmap, pmdp, gaddr);
- WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
- _SEGMENT_ENTRY_GMAP_UC |
- _SEGMENT_ENTRY));
- if (MACHINE_HAS_TLB_GUEST)
+ WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
+ _SEGMENT_ENTRY_GMAP_UC |
+ _SEGMENT_ENTRY));
+ if (machine_has_tlb_guest())
__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
gmap->asce, IDTE_LOCAL);
- else if (MACHINE_HAS_IDTE)
+ else if (cpu_has_idte())
__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL);
- *entry = _SEGMENT_ENTRY_EMPTY;
+ *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
}
spin_unlock(&gmap->guest_table_lock);
}
@@ -2474,30 +2082,27 @@ EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local);
*/
void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
{
- unsigned long *entry, gaddr;
+ unsigned long gaddr;
struct gmap *gmap;
pmd_t *pmdp;
rcu_read_lock();
list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
spin_lock(&gmap->guest_table_lock);
- entry = radix_tree_delete(&gmap->host_to_guest,
- vmaddr >> PMD_SHIFT);
- if (entry) {
- pmdp = (pmd_t *)entry;
- gaddr = __gmap_segment_gaddr(entry);
+ pmdp = host_to_guest_pmd_delete(gmap, vmaddr, &gaddr);
+ if (pmdp) {
pmdp_notify_gmap(gmap, pmdp, gaddr);
- WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
- _SEGMENT_ENTRY_GMAP_UC |
- _SEGMENT_ENTRY));
- if (MACHINE_HAS_TLB_GUEST)
+ WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
+ _SEGMENT_ENTRY_GMAP_UC |
+ _SEGMENT_ENTRY));
+ if (machine_has_tlb_guest())
__pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
gmap->asce, IDTE_GLOBAL);
- else if (MACHINE_HAS_IDTE)
+ else if (cpu_has_idte())
__pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
else
__pmdp_csp(pmdp);
- *entry = _SEGMENT_ENTRY_EMPTY;
+ *pmdp = __pmd(_SEGMENT_ENTRY_EMPTY);
}
spin_unlock(&gmap->guest_table_lock);
}
@@ -2612,9 +2217,6 @@ int s390_enable_sie(void)
/* Do we have pgstes? if yes, we are done */
if (mm_has_pgste(mm))
return 0;
- /* Fail if the page tables are 2K */
- if (!mm_alloc_pgste(mm))
- return -EINVAL;
mmap_write_lock(mm);
mm->context.has_pgste = 1;
/* split thp mappings and disable thp for future mappings */
@@ -2624,138 +2226,6 @@ int s390_enable_sie(void)
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
-static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
- unsigned long end, struct mm_walk *walk)
-{
- unsigned long *found_addr = walk->private;
-
- /* Return 1 of the page is a zeropage. */
- if (is_zero_pfn(pte_pfn(*pte))) {
- /*
- * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
- * right thing and likely don't care: FAULT_FLAG_UNSHARE
- * currently only works in COW mappings, which is also where
- * mm_forbids_zeropage() is checked.
- */
- if (!is_cow_mapping(walk->vma->vm_flags))
- return -EFAULT;
-
- *found_addr = addr;
- return 1;
- }
- return 0;
-}
-
-static const struct mm_walk_ops find_zeropage_ops = {
- .pte_entry = find_zeropage_pte_entry,
- .walk_lock = PGWALK_WRLOCK,
-};
-
-/*
- * Unshare all shared zeropages, replacing them by anonymous pages. Note that
- * we cannot simply zap all shared zeropages, because this could later
- * trigger unexpected userfaultfd missing events.
- *
- * This must be called after mm->context.allow_cow_sharing was
- * set to 0, to avoid future mappings of shared zeropages.
- *
- * mm contracts with s390, that even if mm were to remove a page table,
- * and racing with walk_page_range_vma() calling pte_offset_map_lock()
- * would fail, it will never insert a page table containing empty zero
- * pages once mm_forbids_zeropage(mm) i.e.
- * mm->context.allow_cow_sharing is set to 0.
- */
-static int __s390_unshare_zeropages(struct mm_struct *mm)
-{
- struct vm_area_struct *vma;
- VMA_ITERATOR(vmi, mm, 0);
- unsigned long addr;
- vm_fault_t fault;
- int rc;
-
- for_each_vma(vmi, vma) {
- /*
- * We could only look at COW mappings, but it's more future
- * proof to catch unexpected zeropages in other mappings and
- * fail.
- */
- if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
- continue;
- addr = vma->vm_start;
-
-retry:
- rc = walk_page_range_vma(vma, addr, vma->vm_end,
- &find_zeropage_ops, &addr);
- if (rc < 0)
- return rc;
- else if (!rc)
- continue;
-
- /* addr was updated by find_zeropage_pte_entry() */
- fault = handle_mm_fault(vma, addr,
- FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
- NULL);
- if (fault & VM_FAULT_OOM)
- return -ENOMEM;
- /*
- * See break_ksm(): even after handle_mm_fault() returned 0, we
- * must start the lookup from the current address, because
- * handle_mm_fault() may back out if there's any difficulty.
- *
- * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
- * maybe they could trigger in the future on concurrent
- * truncation. In that case, the shared zeropage would be gone
- * and we can simply retry and make progress.
- */
- cond_resched();
- goto retry;
- }
-
- return 0;
-}
-
-static int __s390_disable_cow_sharing(struct mm_struct *mm)
-{
- int rc;
-
- if (!mm->context.allow_cow_sharing)
- return 0;
-
- mm->context.allow_cow_sharing = 0;
-
- /* Replace all shared zeropages by anonymous pages. */
- rc = __s390_unshare_zeropages(mm);
- /*
- * Make sure to disable KSM (if enabled for the whole process or
- * individual VMAs). Note that nothing currently hinders user space
- * from re-enabling it.
- */
- if (!rc)
- rc = ksm_disable(mm);
- if (rc)
- mm->context.allow_cow_sharing = 1;
- return rc;
-}
-
-/*
- * Disable most COW-sharing of memory pages for the whole process:
- * (1) Disable KSM and unmerge/unshare any KSM pages.
- * (2) Disallow shared zeropages and unshare any zerpages that are mapped.
- *
- * Not that we currently don't bother with COW-shared pages that are shared
- * with parent/child processes due to fork().
- */
-int s390_disable_cow_sharing(void)
-{
- int rc;
-
- mmap_write_lock(current->mm);
- rc = __s390_disable_cow_sharing(current->mm);
- mmap_write_unlock(current->mm);
- return rc;
-}
-EXPORT_SYMBOL_GPL(s390_disable_cow_sharing);
-
/*
* Enable storage key handling from now on and initialize the storage
* keys with the default key.
@@ -2823,7 +2293,7 @@ int s390_enable_skey(void)
goto out_up;
mm->context.uses_skeys = 1;
- rc = __s390_disable_cow_sharing(mm);
+ rc = gmap_helper_disable_cow_sharing();
if (rc) {
mm->context.uses_skeys = 0;
goto out_up;
@@ -2943,49 +2413,6 @@ int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
EXPORT_SYMBOL_GPL(__s390_uv_destroy_range);
/**
- * s390_unlist_old_asce - Remove the topmost level of page tables from the
- * list of page tables of the gmap.
- * @gmap: the gmap whose table is to be removed
- *
- * On s390x, KVM keeps a list of all pages containing the page tables of the
- * gmap (the CRST list). This list is used at tear down time to free all
- * pages that are now not needed anymore.
- *
- * This function removes the topmost page of the tree (the one pointed to by
- * the ASCE) from the CRST list.
- *
- * This means that it will not be freed when the VM is torn down, and needs
- * to be handled separately by the caller, unless a leak is actually
- * intended. Notice that this function will only remove the page from the
- * list, the page will still be used as a top level page table (and ASCE).
- */
-void s390_unlist_old_asce(struct gmap *gmap)
-{
- struct page *old;
-
- old = virt_to_page(gmap->table);
- spin_lock(&gmap->guest_table_lock);
- list_del(&old->lru);
- /*
- * Sometimes the topmost page might need to be "removed" multiple
- * times, for example if the VM is rebooted into secure mode several
- * times concurrently, or if s390_replace_asce fails after calling
- * s390_remove_old_asce and is attempted again later. In that case
- * the old asce has been removed from the list, and therefore it
- * will not be freed when the VM terminates, but the ASCE is still
- * in use and still pointed to.
- * A subsequent call to replace_asce will follow the pointer and try
- * to remove the same page from the list again.
- * Therefore it's necessary that the page of the ASCE has valid
- * pointers, so list_del can work (and do nothing) without
- * dereferencing stale or invalid pointers.
- */
- INIT_LIST_HEAD(&old->lru);
- spin_unlock(&gmap->guest_table_lock);
-}
-EXPORT_SYMBOL_GPL(s390_unlist_old_asce);
-
-/**
* s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
* @gmap: the gmap whose ASCE needs to be replaced
*
@@ -3004,8 +2431,6 @@ int s390_replace_asce(struct gmap *gmap)
struct page *page;
void *table;
- s390_unlist_old_asce(gmap);
-
/* Replacing segment type ASCEs would cause serious issues */
if ((gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT)
return -EINVAL;
@@ -3013,19 +2438,9 @@ int s390_replace_asce(struct gmap *gmap)
page = gmap_alloc_crst();
if (!page)
return -ENOMEM;
- page->index = 0;
table = page_to_virt(page);
memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
- /*
- * The caller has to deal with the old ASCE, but here we make sure
- * the new one is properly added to the CRST list, so that
- * it will be freed when the VM is torn down.
- */
- spin_lock(&gmap->guest_table_lock);
- list_add(&page->lru, &gmap->crst_list);
- spin_unlock(&gmap->guest_table_lock);
-
/* Set new table origin while preserving existing ASCE control bits */
asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
WRITE_ONCE(gmap->asce, asce);
diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c
new file mode 100644
index 000000000000..a45d417ad951
--- /dev/null
+++ b/arch/s390/mm/gmap_helpers.c
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Helper functions for KVM guest address space mapping code
+ *
+ * Copyright IBM Corp. 2007, 2025
+ */
+#include <linux/mm_types.h>
+#include <linux/mmap_lock.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pagewalk.h>
+#include <linux/ksm.h>
+#include <asm/gmap_helpers.h>
+
+/**
+ * ptep_zap_swap_entry() - discard a swap entry.
+ * @mm: the mm
+ * @entry: the swap entry that needs to be zapped
+ *
+ * Discards the given swap entry. If the swap entry was an actual swap
+ * entry (and not a migration entry, for example), the actual swapped
+ * page is also discarded from swap.
+ */
+static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
+{
+ if (!non_swap_entry(entry))
+ dec_mm_counter(mm, MM_SWAPENTS);
+ else if (is_migration_entry(entry))
+ dec_mm_counter(mm, mm_counter(pfn_swap_entry_folio(entry)));
+ free_swap_and_cache(entry);
+}
+
+/**
+ * gmap_helper_zap_one_page() - discard a page if it was swapped.
+ * @mm: the mm
+ * @vmaddr: the userspace virtual address that needs to be discarded
+ *
+ * If the given address maps to a swap entry, discard it.
+ *
+ * Context: needs to be called while holding the mmap lock.
+ */
+void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr)
+{
+ struct vm_area_struct *vma;
+ spinlock_t *ptl;
+ pte_t *ptep;
+
+ mmap_assert_locked(mm);
+
+ /* Find the vm address for the guest address */
+ vma = vma_lookup(mm, vmaddr);
+ if (!vma || is_vm_hugetlb_page(vma))
+ return;
+
+ /* Get pointer to the page table entry */
+ ptep = get_locked_pte(mm, vmaddr, &ptl);
+ if (unlikely(!ptep))
+ return;
+ if (pte_swap(*ptep))
+ ptep_zap_swap_entry(mm, pte_to_swp_entry(*ptep));
+ pte_unmap_unlock(ptep, ptl);
+}
+EXPORT_SYMBOL_GPL(gmap_helper_zap_one_page);
+
+/**
+ * gmap_helper_discard() - discard user pages in the given range
+ * @mm: the mm
+ * @vmaddr: starting userspace address
+ * @end: end address (first address outside the range)
+ *
+ * All userpace pages in the range [@vamddr, @end) are discarded and unmapped.
+ *
+ * Context: needs to be called while holding the mmap lock.
+ */
+void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
+{
+ struct vm_area_struct *vma;
+
+ mmap_assert_locked(mm);
+
+ while (vmaddr < end) {
+ vma = find_vma_intersection(mm, vmaddr, end);
+ if (!vma)
+ return;
+ if (!is_vm_hugetlb_page(vma))
+ zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr, NULL);
+ vmaddr = vma->vm_end;
+ }
+}
+EXPORT_SYMBOL_GPL(gmap_helper_discard);
+
+static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ unsigned long *found_addr = walk->private;
+
+ /* Return 1 of the page is a zeropage. */
+ if (is_zero_pfn(pte_pfn(*pte))) {
+ /*
+ * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
+ * right thing and likely don't care: FAULT_FLAG_UNSHARE
+ * currently only works in COW mappings, which is also where
+ * mm_forbids_zeropage() is checked.
+ */
+ if (!is_cow_mapping(walk->vma->vm_flags))
+ return -EFAULT;
+
+ *found_addr = addr;
+ return 1;
+ }
+ return 0;
+}
+
+static const struct mm_walk_ops find_zeropage_ops = {
+ .pte_entry = find_zeropage_pte_entry,
+ .walk_lock = PGWALK_WRLOCK,
+};
+
+/** __gmap_helper_unshare_zeropages() - unshare all shared zeropages
+ * @mm: the mm whose zero pages are to be unshared
+ *
+ * Unshare all shared zeropages, replacing them by anonymous pages. Note that
+ * we cannot simply zap all shared zeropages, because this could later
+ * trigger unexpected userfaultfd missing events.
+ *
+ * This must be called after mm->context.allow_cow_sharing was
+ * set to 0, to avoid future mappings of shared zeropages.
+ *
+ * mm contracts with s390, that even if mm were to remove a page table,
+ * and racing with walk_page_range_vma() calling pte_offset_map_lock()
+ * would fail, it will never insert a page table containing empty zero
+ * pages once mm_forbids_zeropage(mm) i.e.
+ * mm->context.allow_cow_sharing is set to 0.
+ */
+static int __gmap_helper_unshare_zeropages(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
+ unsigned long addr;
+ vm_fault_t fault;
+ int rc;
+
+ for_each_vma(vmi, vma) {
+ /*
+ * We could only look at COW mappings, but it's more future
+ * proof to catch unexpected zeropages in other mappings and
+ * fail.
+ */
+ if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
+ continue;
+ addr = vma->vm_start;
+
+retry:
+ rc = walk_page_range_vma(vma, addr, vma->vm_end,
+ &find_zeropage_ops, &addr);
+ if (rc < 0)
+ return rc;
+ else if (!rc)
+ continue;
+
+ /* addr was updated by find_zeropage_pte_entry() */
+ fault = handle_mm_fault(vma, addr,
+ FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
+ NULL);
+ if (fault & VM_FAULT_OOM)
+ return -ENOMEM;
+ /*
+ * See break_ksm(): even after handle_mm_fault() returned 0, we
+ * must start the lookup from the current address, because
+ * handle_mm_fault() may back out if there's any difficulty.
+ *
+ * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
+ * maybe they could trigger in the future on concurrent
+ * truncation. In that case, the shared zeropage would be gone
+ * and we can simply retry and make progress.
+ */
+ cond_resched();
+ goto retry;
+ }
+
+ return 0;
+}
+
+/**
+ * gmap_helper_disable_cow_sharing() - disable all COW sharing
+ *
+ * Disable most COW-sharing of memory pages for the whole process:
+ * (1) Disable KSM and unmerge/unshare any KSM pages.
+ * (2) Disallow shared zeropages and unshare any zerpages that are mapped.
+ *
+ * Not that we currently don't bother with COW-shared pages that are shared
+ * with parent/child processes due to fork().
+ */
+int gmap_helper_disable_cow_sharing(void)
+{
+ struct mm_struct *mm = current->mm;
+ int rc;
+
+ mmap_assert_write_locked(mm);
+
+ if (!mm->context.allow_cow_sharing)
+ return 0;
+
+ mm->context.allow_cow_sharing = 0;
+
+ /* Replace all shared zeropages by anonymous pages. */
+ rc = __gmap_helper_unshare_zeropages(mm);
+ /*
+ * Make sure to disable KSM (if enabled for the whole process or
+ * individual VMAs). Note that nothing currently hinders user space
+ * from re-enabling it.
+ */
+ if (!rc)
+ rc = ksm_disable(mm);
+ if (rc)
+ mm->context.allow_cow_sharing = 1;
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing);
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index d9ce199953de..e88c02c9e642 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -9,12 +9,13 @@
#define KMSG_COMPONENT "hugetlb"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-#include <asm/pgalloc.h>
+#include <linux/cpufeature.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/sched/mm.h>
#include <linux/security.h>
+#include <asm/pgalloc.h>
/*
* If the bit selected by single-bit bitmask "a" is set within "x", move
@@ -188,8 +189,8 @@ pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
return __rste_to_pte(pte_val(*ptep));
}
-pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
+pte_t __huge_ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
{
pte_t pte = huge_ptep_get(mm, addr, ptep);
pmd_t *pmdp = (pmd_t *) ptep;
@@ -248,9 +249,9 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
bool __init arch_hugetlb_valid_size(unsigned long size)
{
- if (MACHINE_HAS_EDAT1 && size == PMD_SIZE)
+ if (cpu_has_edat1() && size == PMD_SIZE)
return true;
- else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE)
+ else if (cpu_has_edat2() && size == PUD_SIZE)
return true;
else
return false;
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 7a96623a9d2e..074bf4fb4ce2 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -8,6 +8,7 @@
* Copyright (C) 1995 Linus Torvalds
*/
+#include <linux/cpufeature.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
@@ -39,7 +40,6 @@
#include <asm/kfence.h>
#include <asm/dma.h>
#include <asm/abs_lowcore.h>
-#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/sclp.h>
@@ -56,6 +56,15 @@ pgd_t invalid_pg_dir[PTRS_PER_PGD] __section(".bss..invalid_pg_dir");
struct ctlreg __bootdata_preserved(s390_invalid_asce);
+unsigned long __bootdata_preserved(page_noexec_mask);
+EXPORT_SYMBOL(page_noexec_mask);
+
+unsigned long __bootdata_preserved(segment_noexec_mask);
+EXPORT_SYMBOL(segment_noexec_mask);
+
+unsigned long __bootdata_preserved(region_noexec_mask);
+EXPORT_SYMBOL(region_noexec_mask);
+
unsigned long empty_zero_page, zero_page_mask;
EXPORT_SYMBOL(empty_zero_page);
EXPORT_SYMBOL(zero_page_mask);
@@ -64,8 +73,6 @@ static void __init setup_zero_pages(void)
{
unsigned long total_pages = memblock_estimated_nr_free_pages();
unsigned int order;
- struct page *page;
- int i;
/* Latest machines require a mapping granularity of 512KB */
order = 7;
@@ -74,16 +81,7 @@ static void __init setup_zero_pages(void)
while (order > 2 && (total_pages >> 10) < (1UL << order))
order--;
- empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
- if (!empty_zero_page)
- panic("Out of memory in setup_zero_pages");
-
- page = virt_to_page((void *) empty_zero_page);
- split_page(page, order);
- for (i = 1 << order; i > 0; i--) {
- mark_page_reserved(page);
- page++;
- }
+ empty_zero_page = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE << order, PAGE_SIZE);
zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK;
}
@@ -108,7 +106,7 @@ void mark_rodata_ro(void)
{
unsigned long size = __end_ro_after_init - __start_ro_after_init;
- if (MACHINE_HAS_NX)
+ if (cpu_has_nx())
system_ctl_set_bit(0, CR0_INSTRUCTION_EXEC_PROTECTION_BIT);
__set_memory_ro(__start_ro_after_init, __end_ro_after_init);
pr_info("Write protected read-only-after-init data: %luk\n", size >> 10);
@@ -156,19 +154,13 @@ static void pv_init(void)
swiotlb_update_mem_attributes();
}
-void __init mem_init(void)
+void __init arch_mm_preinit(void)
{
cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask);
cpumask_set_cpu(0, mm_cpumask(&init_mm));
- set_max_mapnr(max_low_pfn);
- high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
-
pv_init();
- kfence_split_mapping();
- /* this will put all low memory onto the freelists */
- memblock_free_all();
setup_zero_pages(); /* Setup zeroed pages. */
}
@@ -230,16 +222,13 @@ struct s390_cma_mem_data {
static int s390_cma_check_range(struct cma *cma, void *data)
{
struct s390_cma_mem_data *mem_data;
- unsigned long start, end;
mem_data = data;
- start = cma_get_base(cma);
- end = start + cma_get_size(cma);
- if (end < mem_data->start)
- return 0;
- if (start >= mem_data->end)
- return 0;
- return -EBUSY;
+
+ if (cma_intersects(cma, mem_data->start, mem_data->end))
+ return -EBUSY;
+
+ return 0;
}
static int s390_cma_mem_notifier(struct notifier_block *nb,
@@ -276,7 +265,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
unsigned long size_pages = PFN_DOWN(size);
int rc;
- if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot))
+ if (WARN_ON_ONCE(pgprot_val(params->pgprot) != pgprot_val(PAGE_KERNEL)))
return -EINVAL;
VM_BUG_ON(!mhp_range_allowed(start, size, true));
diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c
index 28a18c42ba99..44426e0f2944 100644
--- a/arch/s390/mm/maccess.c
+++ b/arch/s390/mm/maccess.c
@@ -17,6 +17,7 @@
#include <asm/asm-extable.h>
#include <asm/abs_lowcore.h>
#include <asm/stacktrace.h>
+#include <asm/sections.h>
#include <asm/maccess.h>
#include <asm/ctlreg.h>
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index 33f3504be90b..40a526d28184 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -51,7 +51,6 @@ static inline unsigned long mmap_base(unsigned long rnd,
{
unsigned long gap = rlim_stack->rlim_cur;
unsigned long pad = stack_maxrandom_size() + stack_guard_gap;
- unsigned long gap_min, gap_max;
/* Values close to RLIM_INFINITY can overflow. */
if (gap + pad > gap)
@@ -61,13 +60,7 @@ static inline unsigned long mmap_base(unsigned long rnd,
* Top of mmap area (just below the process stack).
* Leave at least a ~128 MB hole.
*/
- gap_min = SZ_128M;
- gap_max = (STACK_TOP / 6) * 5;
-
- if (gap < gap_min)
- gap = gap_min;
- else if (gap > gap_max)
- gap = gap_max;
+ gap = clamp(gap, SZ_128M, (STACK_TOP / 6) * 5);
return PAGE_ALIGN(STACK_TOP - gap - rnd);
}
@@ -196,22 +189,28 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
}
}
-static const pgprot_t protection_map[16] = {
- [VM_NONE] = PAGE_NONE,
- [VM_READ] = PAGE_RO,
- [VM_WRITE] = PAGE_RO,
- [VM_WRITE | VM_READ] = PAGE_RO,
- [VM_EXEC] = PAGE_RX,
- [VM_EXEC | VM_READ] = PAGE_RX,
- [VM_EXEC | VM_WRITE] = PAGE_RX,
- [VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX,
- [VM_SHARED] = PAGE_NONE,
- [VM_SHARED | VM_READ] = PAGE_RO,
- [VM_SHARED | VM_WRITE] = PAGE_RW,
- [VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW,
- [VM_SHARED | VM_EXEC] = PAGE_RX,
- [VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX,
- [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX,
- [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX
-};
+static pgprot_t protection_map[16] __ro_after_init;
+
+void __init setup_protection_map(void)
+{
+ pgprot_t *pm = protection_map;
+
+ pm[VM_NONE] = PAGE_NONE;
+ pm[VM_READ] = PAGE_RO;
+ pm[VM_WRITE] = PAGE_RO;
+ pm[VM_WRITE | VM_READ] = PAGE_RO;
+ pm[VM_EXEC] = PAGE_RX;
+ pm[VM_EXEC | VM_READ] = PAGE_RX;
+ pm[VM_EXEC | VM_WRITE] = PAGE_RX;
+ pm[VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX;
+ pm[VM_SHARED] = PAGE_NONE;
+ pm[VM_SHARED | VM_READ] = PAGE_RO;
+ pm[VM_SHARED | VM_WRITE] = PAGE_RW;
+ pm[VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW;
+ pm[VM_SHARED | VM_EXEC] = PAGE_RX;
+ pm[VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX;
+ pm[VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX;
+ pm[VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX;
+}
+
DECLARE_VM_GET_PAGE_PROT
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index 8f56a21a077f..348e759840e7 100644
--- a/arch/s390/mm/pageattr.c
+++ b/arch/s390/mm/pageattr.c
@@ -3,6 +3,7 @@
* Copyright IBM Corp. 2011
* Author(s): Jan Glauber <jang@linux.vnet.ibm.com>
*/
+#include <linux/cpufeature.h>
#include <linux/hugetlb.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
@@ -27,7 +28,7 @@ void __storage_key_init_range(unsigned long start, unsigned long end)
unsigned long boundary, size;
while (start < end) {
- if (MACHINE_HAS_EDAT1) {
+ if (cpu_has_edat1()) {
/* set storage keys for a 1MB frame */
size = 1UL << 20;
boundary = (start + size) & ~(size - 1);
@@ -63,7 +64,7 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr,
unsigned long *table, mask;
mask = 0;
- if (MACHINE_HAS_EDAT2) {
+ if (cpu_has_edat2()) {
switch (dtt) {
case CRDTE_DTT_REGION3:
mask = ~(PTRS_PER_PUD * sizeof(pud_t) - 1);
@@ -77,7 +78,7 @@ static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr,
}
table = (unsigned long *)((unsigned long)old & mask);
crdte(*old, new, table, dtt, addr, get_lowcore()->kernel_asce.val);
- } else if (MACHINE_HAS_IDTE) {
+ } else if (cpu_has_idte()) {
cspg(old, *old, new);
} else {
csp((unsigned int *)old + 1, *old, new);
@@ -109,8 +110,6 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end,
} else if (flags & SET_MEMORY_DEF) {
new = __pte(pte_val(new) & PAGE_MASK);
new = set_pte_bit(new, PAGE_KERNEL);
- if (!MACHINE_HAS_NX)
- new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC));
}
pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE);
ptep++;
@@ -167,8 +166,6 @@ static void modify_pmd_page(pmd_t *pmdp, unsigned long addr,
} else if (flags & SET_MEMORY_DEF) {
new = __pmd(pmd_val(new) & PMD_MASK);
new = set_pmd_bit(new, SEGMENT_KERNEL);
- if (!MACHINE_HAS_NX)
- new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC));
}
pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT);
}
@@ -256,8 +253,6 @@ static void modify_pud_page(pud_t *pudp, unsigned long addr,
} else if (flags & SET_MEMORY_DEF) {
new = __pud(pud_val(new) & PUD_MASK);
new = set_pud_bit(new, REGION3_KERNEL);
- if (!MACHINE_HAS_NX)
- new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC));
}
pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3);
}
@@ -379,7 +374,7 @@ int __set_memory(unsigned long addr, unsigned long numpages, unsigned long flags
unsigned long end;
int rc;
- if (!MACHINE_HAS_NX)
+ if (!cpu_has_nx())
flags &= ~(SET_MEMORY_NX | SET_MEMORY_X);
if (!flags)
return 0;
diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c
index 1aac13bb8f53..e6175d75e4b0 100644
--- a/arch/s390/mm/pfault.c
+++ b/arch/s390/mm/pfault.c
@@ -9,6 +9,7 @@
#include <linux/init.h>
#include <linux/irq.h>
#include <asm/asm-extable.h>
+#include <asm/asm-offsets.h>
#include <asm/pfault.h>
#include <asm/diag.h>
@@ -56,7 +57,7 @@ int __pfault_init(void)
if (pfault_disable)
return rc;
diag_stat_inc(DIAG_STAT_X258);
- asm volatile(
+ asm_inline volatile(
" diag %[refbk],%[rc],0x258\n"
"0: nopr %%r7\n"
EX_TABLE(0b, 0b)
@@ -78,7 +79,7 @@ void __pfault_fini(void)
if (pfault_disable)
return;
diag_stat_inc(DIAG_STAT_X258);
- asm volatile(
+ asm_inline volatile(
" diag %[refbk],0,0x258\n"
"0: nopr %%r7\n"
EX_TABLE(0b, 0b)
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 58696a0c4e4a..b449fd2605b0 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -12,35 +12,8 @@
#include <asm/mmu_context.h>
#include <asm/page-states.h>
#include <asm/pgalloc.h>
-#include <asm/gmap.h>
-#include <asm/tlb.h>
#include <asm/tlbflush.h>
-#ifdef CONFIG_PGSTE
-
-int page_table_allocate_pgste = 0;
-EXPORT_SYMBOL(page_table_allocate_pgste);
-
-static struct ctl_table page_table_sysctl[] = {
- {
- .procname = "allocate_pgste",
- .data = &page_table_allocate_pgste,
- .maxlen = sizeof(int),
- .mode = S_IRUGO | S_IWUSR,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-};
-
-static int __init page_table_register_sysctl(void)
-{
- return register_sysctl("vm", page_table_sysctl) ? 0 : -ENOMEM;
-}
-__initcall(page_table_register_sysctl);
-
-#endif /* CONFIG_PGSTE */
-
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER);
@@ -63,11 +36,15 @@ void crst_table_free(struct mm_struct *mm, unsigned long *table)
static void __crst_table_upgrade(void *arg)
{
struct mm_struct *mm = arg;
+ struct ctlreg asce;
/* change all active ASCEs to avoid the creation of new TLBs */
if (current->active_mm == mm) {
- get_lowcore()->user_asce.val = mm->context.asce;
- local_ctl_load(7, &get_lowcore()->user_asce);
+ asce.val = mm->context.asce;
+ get_lowcore()->user_asce = asce;
+ local_ctl_load(7, &asce);
+ if (!test_thread_flag(TIF_ASCE_PRIMARY))
+ local_ctl_load(1, &asce);
}
__tlb_flush_local();
}
@@ -77,6 +54,8 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
unsigned long asce_limit = mm->context.asce_limit;
+ mmap_assert_write_locked(mm);
+
/* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
VM_BUG_ON(asce_limit < _REGION2_SIZE);
@@ -88,23 +67,18 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
if (unlikely(!p4d))
goto err_p4d;
crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
+ pagetable_p4d_ctor(virt_to_ptdesc(p4d));
}
if (end > _REGION1_SIZE) {
pgd = crst_table_alloc(mm);
if (unlikely(!pgd))
goto err_pgd;
crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
+ pagetable_pgd_ctor(virt_to_ptdesc(pgd));
}
spin_lock_bh(&mm->page_table_lock);
- /*
- * This routine gets called with mmap_lock lock held and there is
- * no reason to optimize for the case of otherwise. However, if
- * that would ever change, the below check will let us know.
- */
- VM_BUG_ON(asce_limit != mm->context.asce_limit);
-
if (p4d) {
__pgd = (unsigned long *) mm->pgd;
p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
@@ -130,6 +104,7 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
return 0;
err_pgd:
+ pagetable_dtor(virt_to_ptdesc(p4d));
crst_table_free(mm, p4d);
err_p4d:
return -ENOMEM;
@@ -167,43 +142,22 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
ptdesc = pagetable_alloc(GFP_KERNEL, 0);
if (!ptdesc)
return NULL;
- if (!pagetable_pte_ctor(ptdesc)) {
+ if (!pagetable_pte_ctor(mm, ptdesc)) {
pagetable_free(ptdesc);
return NULL;
}
table = ptdesc_to_virt(ptdesc);
__arch_set_page_dat(table, 1);
- /* pt_list is used by gmap only */
- INIT_LIST_HEAD(&ptdesc->pt_list);
memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
return table;
}
-static void pagetable_pte_dtor_free(struct ptdesc *ptdesc)
-{
- pagetable_pte_dtor(ptdesc);
- pagetable_free(ptdesc);
-}
-
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
struct ptdesc *ptdesc = virt_to_ptdesc(table);
- pagetable_pte_dtor_free(ptdesc);
-}
-
-void __tlb_remove_table(void *table)
-{
- struct ptdesc *ptdesc = virt_to_ptdesc(table);
- struct page *page = ptdesc_page(ptdesc);
-
- if (compound_order(page) == CRST_ALLOC_ORDER) {
- /* pmd, pud, or p4d */
- pagetable_free(ptdesc);
- return;
- }
- pagetable_pte_dtor_free(ptdesc);
+ pagetable_dtor_free(ptdesc);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -211,7 +165,7 @@ static void pte_free_now(struct rcu_head *head)
{
struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
- pagetable_pte_dtor_free(ptdesc);
+ pagetable_dtor_free(ptdesc);
}
void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index cea5dba80468..7df70cd8f739 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -4,6 +4,7 @@
* Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
*/
+#include <linux/cpufeature.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
@@ -19,10 +20,10 @@
#include <linux/ksm.h>
#include <linux/mman.h>
-#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/page-states.h>
+#include <asm/machine.h>
pgprot_t pgprot_writecombine(pgprot_t prot)
{
@@ -34,22 +35,12 @@ pgprot_t pgprot_writecombine(pgprot_t prot)
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);
-pgprot_t pgprot_writethrough(pgprot_t prot)
-{
- /*
- * mio_wb_bit_mask may be set on a different CPU, but it is only set
- * once at init and only read afterwards.
- */
- return __pgprot(pgprot_val(prot) & ~mio_wb_bit_mask);
-}
-EXPORT_SYMBOL_GPL(pgprot_writethrough);
-
static inline void ptep_ipte_local(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, int nodat)
{
unsigned long opt, asce;
- if (MACHINE_HAS_TLB_GUEST) {
+ if (machine_has_tlb_guest()) {
opt = 0;
asce = READ_ONCE(mm->context.gmap_asce);
if (asce == 0UL || nodat)
@@ -69,7 +60,7 @@ static inline void ptep_ipte_global(struct mm_struct *mm, unsigned long addr,
{
unsigned long opt, asce;
- if (MACHINE_HAS_TLB_GUEST) {
+ if (machine_has_tlb_guest()) {
opt = 0;
asce = READ_ONCE(mm->context.gmap_asce);
if (asce == 0UL || nodat)
@@ -94,7 +85,7 @@ static inline pte_t ptep_flush_direct(struct mm_struct *mm,
if (unlikely(pte_val(old) & _PAGE_INVALID))
return old;
atomic_inc(&mm->context.flush_count);
- if (MACHINE_HAS_TLB_LC &&
+ if (cpu_has_tlb_lc() &&
cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
ptep_ipte_local(mm, addr, ptep, nodat);
else
@@ -173,10 +164,10 @@ static inline pgste_t pgste_update_all(pte_t pte, pgste_t pgste,
skey = (unsigned long) page_get_storage_key(address);
bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
/* Transfer page changed & referenced bit to guest bits in pgste */
- pgste_val(pgste) |= bits << 48; /* GR bit & GC bit */
+ pgste = set_pgste_bit(pgste, bits << 48); /* GR bit & GC bit */
/* Copy page access key and fetch protection bit to pgste */
- pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
- pgste_val(pgste) |= (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
+ pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT);
+ pgste = set_pgste_bit(pgste, (skey & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56);
#endif
return pgste;
@@ -210,7 +201,7 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
if ((pte_val(entry) & _PAGE_PRESENT) &&
(pte_val(entry) & _PAGE_WRITE) &&
!(pte_val(entry) & _PAGE_INVALID)) {
- if (!MACHINE_HAS_ESOP) {
+ if (!machine_has_esop()) {
/*
* Without enhanced suppression-on-protection force
* the dirty bit on for all writable ptes.
@@ -220,7 +211,7 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
}
if (!(pte_val(entry) & _PAGE_PROTECT))
/* This pte allows write access, set user-dirty */
- pgste_val(pgste) |= PGSTE_UC_BIT;
+ pgste = set_pgste_bit(pgste, PGSTE_UC_BIT);
}
#endif
set_pte(ptep, entry);
@@ -236,7 +227,7 @@ static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
if (bits) {
- pgste_val(pgste) ^= bits;
+ pgste = __pgste(pgste_val(pgste) ^ bits);
ptep_notify(mm, addr, ptep, bits);
}
#endif
@@ -360,8 +351,6 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
pgste_t pgste;
struct mm_struct *mm = vma->vm_mm;
- if (!MACHINE_HAS_NX)
- pte = clear_pte_bit(pte, __pgprot(_PAGE_NOEXEC));
if (mm_has_pgste(mm)) {
pgste = pgste_get(ptep);
pgste_set_key(ptep, pgste, pte, mm);
@@ -376,7 +365,7 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
static inline void pmdp_idte_local(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
{
- if (MACHINE_HAS_TLB_GUEST)
+ if (machine_has_tlb_guest())
__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
mm->context.asce, IDTE_LOCAL);
else
@@ -388,12 +377,12 @@ static inline void pmdp_idte_local(struct mm_struct *mm,
static inline void pmdp_idte_global(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
{
- if (MACHINE_HAS_TLB_GUEST) {
+ if (machine_has_tlb_guest()) {
__pmdp_idte(addr, pmdp, IDTE_NODAT | IDTE_GUEST_ASCE,
mm->context.asce, IDTE_GLOBAL);
if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
gmap_pmdp_idte_global(mm, addr);
- } else if (MACHINE_HAS_IDTE) {
+ } else if (cpu_has_idte()) {
__pmdp_idte(addr, pmdp, 0, 0, IDTE_GLOBAL);
if (mm_has_pgste(mm) && mm->context.allow_gmap_hpage_1m)
gmap_pmdp_idte_global(mm, addr);
@@ -413,7 +402,7 @@ static inline pmd_t pmdp_flush_direct(struct mm_struct *mm,
if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
return old;
atomic_inc(&mm->context.flush_count);
- if (MACHINE_HAS_TLB_LC &&
+ if (cpu_has_tlb_lc() &&
cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
pmdp_idte_local(mm, addr, pmdp);
else
@@ -507,7 +496,7 @@ EXPORT_SYMBOL(pmdp_xchg_lazy);
static inline void pudp_idte_local(struct mm_struct *mm,
unsigned long addr, pud_t *pudp)
{
- if (MACHINE_HAS_TLB_GUEST)
+ if (machine_has_tlb_guest())
__pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE,
mm->context.asce, IDTE_LOCAL);
else
@@ -517,10 +506,10 @@ static inline void pudp_idte_local(struct mm_struct *mm,
static inline void pudp_idte_global(struct mm_struct *mm,
unsigned long addr, pud_t *pudp)
{
- if (MACHINE_HAS_TLB_GUEST)
+ if (machine_has_tlb_guest())
__pudp_idte(addr, pudp, IDTE_NODAT | IDTE_GUEST_ASCE,
mm->context.asce, IDTE_GLOBAL);
- else if (MACHINE_HAS_IDTE)
+ else if (cpu_has_idte())
__pudp_idte(addr, pudp, 0, 0, IDTE_GLOBAL);
else
/*
@@ -539,7 +528,7 @@ static inline pud_t pudp_flush_direct(struct mm_struct *mm,
if (pud_val(old) & _REGION_ENTRY_INVALID)
return old;
atomic_inc(&mm->context.flush_count);
- if (MACHINE_HAS_TLB_LC &&
+ if (cpu_has_tlb_lc() &&
cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
pudp_idte_local(mm, addr, pudp);
else
@@ -611,7 +600,7 @@ void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
/* the mm_has_pgste() check is done in set_pte_at() */
preempt_disable();
pgste = pgste_get_lock(ptep);
- pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
+ pgste = clear_pgste_bit(pgste, _PGSTE_GPS_ZERO);
pgste_set_key(ptep, pgste, entry, mm);
pgste = pgste_set_pte(ptep, pgste, entry);
pgste_set_unlock(ptep, pgste);
@@ -624,7 +613,7 @@ void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
preempt_disable();
pgste = pgste_get_lock(ptep);
- pgste_val(pgste) |= PGSTE_IN_BIT;
+ pgste = set_pgste_bit(pgste, PGSTE_IN_BIT);
pgste_set_unlock(ptep, pgste);
preempt_enable();
}
@@ -669,7 +658,7 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID));
entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT));
}
- pgste_val(pgste) |= bit;
+ pgste = set_pgste_bit(pgste, bit);
pgste = pgste_set_pte(ptep, pgste, entry);
pgste_set_unlock(ptep, pgste);
return 0;
@@ -689,7 +678,7 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
if (!(pte_val(spte) & _PAGE_INVALID) &&
!((pte_val(spte) & _PAGE_PROTECT) &&
!(pte_val(pte) & _PAGE_PROTECT))) {
- pgste_val(spgste) |= PGSTE_VSIE_BIT;
+ spgste = set_pgste_bit(spgste, PGSTE_VSIE_BIT);
tpgste = pgste_get_lock(tptep);
tpte = __pte((pte_val(spte) & PAGE_MASK) |
(pte_val(pte) & _PAGE_PROTECT));
@@ -747,7 +736,7 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
pte_clear(mm, addr, ptep);
}
if (reset)
- pgste_val(pgste) &= ~(_PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
+ pgste = clear_pgste_bit(pgste, _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT);
pgste_set_unlock(ptep, pgste);
preempt_enable();
}
@@ -760,8 +749,8 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
/* Clear storage key ACC and F, but set R/C */
preempt_disable();
pgste = pgste_get_lock(ptep);
- pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT);
- pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT;
+ pgste = clear_pgste_bit(pgste, PGSTE_ACC_BITS | PGSTE_FP_BIT);
+ pgste = set_pgste_bit(pgste, PGSTE_GR_BIT | PGSTE_GC_BIT);
ptev = pte_val(*ptep);
if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0);
@@ -782,13 +771,13 @@ bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr,
pgste = pgste_get_lock(ptep);
dirty = !!(pgste_val(pgste) & PGSTE_UC_BIT);
- pgste_val(pgste) &= ~PGSTE_UC_BIT;
+ pgste = clear_pgste_bit(pgste, PGSTE_UC_BIT);
pte = *ptep;
if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
pgste = pgste_pte_notify(mm, addr, ptep, pgste);
nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
ptep_ipte_global(mm, addr, ptep, nodat);
- if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
+ if (machine_has_esop() || !(pte_val(pte) & _PAGE_WRITE))
pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT));
else
pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID));
@@ -844,11 +833,11 @@ again:
if (!ptep)
goto again;
new = old = pgste_get_lock(ptep);
- pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
- PGSTE_ACC_BITS | PGSTE_FP_BIT);
+ new = clear_pgste_bit(new, PGSTE_GR_BIT | PGSTE_GC_BIT |
+ PGSTE_ACC_BITS | PGSTE_FP_BIT);
keyul = (unsigned long) key;
- pgste_val(new) |= (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
- pgste_val(new) |= (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
+ new = set_pgste_bit(new, (keyul & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48);
+ new = set_pgste_bit(new, (keyul & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56);
if (!(pte_val(*ptep) & _PAGE_INVALID)) {
unsigned long bits, skey;
@@ -859,12 +848,12 @@ again:
/* Set storage key ACC and FP */
page_set_storage_key(paddr, skey, !nq);
/* Merge host changed & referenced into pgste */
- pgste_val(new) |= bits << 52;
+ new = set_pgste_bit(new, bits << 52);
}
/* changing the guest storage key is considered a change of the page */
if ((pgste_val(new) ^ pgste_val(old)) &
(PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
- pgste_val(new) |= PGSTE_UC_BIT;
+ new = set_pgste_bit(new, PGSTE_UC_BIT);
pgste_set_unlock(ptep, new);
pte_unmap_unlock(ptep, ptl);
@@ -952,19 +941,19 @@ again:
goto again;
new = old = pgste_get_lock(ptep);
/* Reset guest reference bit only */
- pgste_val(new) &= ~PGSTE_GR_BIT;
+ new = clear_pgste_bit(new, PGSTE_GR_BIT);
if (!(pte_val(*ptep) & _PAGE_INVALID)) {
paddr = pte_val(*ptep) & PAGE_MASK;
cc = page_reset_referenced(paddr);
/* Merge real referenced bit into host-set */
- pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT;
+ new = set_pgste_bit(new, ((unsigned long)cc << 53) & PGSTE_HR_BIT);
}
/* Reflect guest's logical view, not physical */
cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
/* Changing the guest storage key is considered a change of the page */
if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
- pgste_val(new) |= PGSTE_UC_BIT;
+ new = set_pgste_bit(new, PGSTE_UC_BIT);
pgste_set_unlock(ptep, new);
pte_unmap_unlock(ptep, ptl);
@@ -1128,7 +1117,7 @@ int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
if (res)
pgstev |= _PGSTE_GPS_ZERO;
- pgste_val(pgste) = pgstev;
+ pgste = __pgste(pgstev);
pgste_set_unlock(ptep, pgste);
pte_unmap_unlock(ptep, ptl);
return res;
@@ -1161,8 +1150,8 @@ int set_pgste_bits(struct mm_struct *mm, unsigned long hva,
return -EFAULT;
new = pgste_get_lock(ptep);
- pgste_val(new) &= ~bits;
- pgste_val(new) |= value & bits;
+ new = clear_pgste_bit(new, bits);
+ new = set_pgste_bit(new, value & bits);
pgste_set_unlock(ptep, new);
pte_unmap_unlock(ptep, ptl);
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 665b8228afeb..448dd6ed1069 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -4,6 +4,7 @@
*/
#include <linux/memory_hotplug.h>
+#include <linux/cpufeature.h>
#include <linux/memblock.h>
#include <linux/pfn.h>
#include <linux/mm.h>
@@ -171,9 +172,6 @@ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
pte_t *pte;
prot = pgprot_val(PAGE_KERNEL);
- if (!MACHINE_HAS_NX)
- prot &= ~_PAGE_NOEXEC;
-
pte = pte_offset_kernel(pmd, addr);
for (; addr < end; addr += PAGE_SIZE, pte++) {
if (!add) {
@@ -230,9 +228,6 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
pte_t *pte;
prot = pgprot_val(SEGMENT_KERNEL);
- if (!MACHINE_HAS_NX)
- prot &= ~_SEGMENT_ENTRY_NOEXEC;
-
pmd = pmd_offset(pud, addr);
for (; addr < end; addr = next, pmd++) {
next = pmd_addr_end(addr, end);
@@ -255,12 +250,12 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
} else if (pmd_none(*pmd)) {
if (IS_ALIGNED(addr, PMD_SIZE) &&
IS_ALIGNED(next, PMD_SIZE) &&
- MACHINE_HAS_EDAT1 && direct &&
+ cpu_has_edat1() && direct &&
!debug_pagealloc_enabled()) {
set_pmd(pmd, __pmd(__pa(addr) | prot));
pages++;
continue;
- } else if (!direct && MACHINE_HAS_EDAT1) {
+ } else if (!direct && cpu_has_edat1()) {
void *new_page;
/*
@@ -324,8 +319,6 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
pmd_t *pmd;
prot = pgprot_val(REGION3_KERNEL);
- if (!MACHINE_HAS_NX)
- prot &= ~_REGION_ENTRY_NOEXEC;
pud = pud_offset(p4d, addr);
for (; addr < end; addr = next, pud++) {
next = pud_addr_end(addr, end);
@@ -343,7 +336,7 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
} else if (pud_none(*pud)) {
if (IS_ALIGNED(addr, PUD_SIZE) &&
IS_ALIGNED(next, PUD_SIZE) &&
- MACHINE_HAS_EDAT2 && direct &&
+ cpu_has_edat2() && direct &&
!debug_pagealloc_enabled()) {
set_pud(pud, __pud(__pa(addr) | prot));
pages++;
@@ -667,10 +660,10 @@ void __init vmem_map_init(void)
* prefix page is used to return to the previous context with
* an LPSWE instruction and therefore must be executable.
*/
- if (!static_key_enabled(&cpu_has_bear))
+ if (!cpu_has_bear())
set_memory_x(0, 1);
if (debug_pagealloc_enabled())
- __set_memory_4k(__va(0), __va(0) + ident_map_size);
+ __set_memory_4k(__va(0), absolute_pointer(__va(0)) + ident_map_size);
pr_info("Write protected kernel read-only data: %luk\n",
(unsigned long)(__end_rodata - _stext) >> 10);
}