summaryrefslogtreecommitdiff
path: root/arch/s390/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/s390/mm')
-rw-r--r--arch/s390/mm/Makefile1
-rw-r--r--arch/s390/mm/cmm.c2
-rw-r--r--arch/s390/mm/dump_pagetables.c2
-rw-r--r--arch/s390/mm/extmem.c9
-rw-r--r--arch/s390/mm/fault.c235
-rw-r--r--arch/s390/mm/gmap.c6
-rw-r--r--arch/s390/mm/maccess.c7
-rw-r--r--arch/s390/mm/pfault.c248
-rw-r--r--arch/s390/mm/pgalloc.c176
-rw-r--r--arch/s390/mm/vmem.c6
10 files changed, 392 insertions, 300 deletions
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index d90db06a8af5..352ff520fd94 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -10,3 +10,4 @@ obj-$(CONFIG_CMM) += cmm.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o
obj-$(CONFIG_PGSTE) += gmap.o
+obj-$(CONFIG_PFAULT) += pfault.o
diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index 5300c6867d5e..f47515313226 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -90,7 +90,7 @@ static long cmm_alloc_pages(long nr, long *counter,
} else
free_page((unsigned long) npa);
}
- diag10_range(virt_to_pfn(addr), 1);
+ diag10_range(virt_to_pfn((void *)addr), 1);
pa->pages[pa->index++] = addr;
(*counter)++;
spin_unlock(&cmm_lock);
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
index ba5f80268878..afa5db750d92 100644
--- a/arch/s390/mm/dump_pagetables.c
+++ b/arch/s390/mm/dump_pagetables.c
@@ -297,7 +297,7 @@ static int pt_dump_init(void)
address_markers[ABS_LOWCORE_NR].start_address = __abs_lowcore;
address_markers[ABS_LOWCORE_END_NR].start_address = __abs_lowcore + ABS_LOWCORE_MAP_SIZE;
address_markers[MEMCPY_REAL_NR].start_address = __memcpy_real_area;
- address_markers[MEMCPY_REAL_END_NR].start_address = __memcpy_real_area + PAGE_SIZE;
+ address_markers[MEMCPY_REAL_END_NR].start_address = __memcpy_real_area + MEMCPY_REAL_SIZE;
address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap;
address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size;
address_markers[VMALLOC_NR].start_address = VMALLOC_START;
diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
index 1bc42ce26599..e41869f5cc95 100644
--- a/arch/s390/mm/extmem.c
+++ b/arch/s390/mm/extmem.c
@@ -640,10 +640,13 @@ void segment_warning(int rc, char *seg_name)
pr_err("There is not enough memory to load or query "
"DCSS %s\n", seg_name);
break;
- case -ERANGE:
- pr_err("DCSS %s exceeds the kernel mapping range (%lu) "
- "and cannot be loaded\n", seg_name, VMEM_MAX_PHYS);
+ case -ERANGE: {
+ struct range mhp_range = arch_get_mappable_range();
+
+ pr_err("DCSS %s exceeds the kernel mapping range (%llu) "
+ "and cannot be loaded\n", seg_name, mhp_range.end + 1);
break;
+ }
default:
break;
}
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index dbe8394234e2..099c4824dd8a 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -43,8 +43,6 @@
#include "../kernel/entry.h"
#define __FAIL_ADDR_MASK -4096L
-#define __SUBCODE_MASK 0x0600
-#define __PF_RES_FIELD 0x8000000000000000ULL
/*
* Allocate private vm_fault_reason from top. Please make sure it won't
@@ -407,7 +405,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
access = VM_WRITE;
if (access == VM_WRITE)
flags |= FAULT_FLAG_WRITE;
-#ifdef CONFIG_PER_VMA_LOCK
if (!(flags & FAULT_FLAG_USER))
goto lock_mmap;
vma = lock_vma_under_rcu(mm, address);
@@ -418,9 +415,12 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
goto lock_mmap;
}
fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
- vma_end_read(vma);
+ if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
+ vma_end_read(vma);
if (!(fault & VM_FAULT_RETRY)) {
count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ if (likely(!(fault & VM_FAULT_ERROR)))
+ fault = 0;
goto out;
}
count_vm_vma_lock_event(VMA_LOCK_RETRY);
@@ -430,7 +430,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
goto out;
}
lock_mmap:
-#endif /* CONFIG_PER_VMA_LOCK */
mmap_read_lock(mm);
gmap = NULL;
@@ -581,232 +580,6 @@ void do_dat_exception(struct pt_regs *regs)
}
NOKPROBE_SYMBOL(do_dat_exception);
-#ifdef CONFIG_PFAULT
-/*
- * 'pfault' pseudo page faults routines.
- */
-static int pfault_disable;
-
-static int __init nopfault(char *str)
-{
- pfault_disable = 1;
- return 1;
-}
-
-__setup("nopfault", nopfault);
-
-struct pfault_refbk {
- u16 refdiagc;
- u16 reffcode;
- u16 refdwlen;
- u16 refversn;
- u64 refgaddr;
- u64 refselmk;
- u64 refcmpmk;
- u64 reserved;
-} __attribute__ ((packed, aligned(8)));
-
-static struct pfault_refbk pfault_init_refbk = {
- .refdiagc = 0x258,
- .reffcode = 0,
- .refdwlen = 5,
- .refversn = 2,
- .refgaddr = __LC_LPP,
- .refselmk = 1ULL << 48,
- .refcmpmk = 1ULL << 48,
- .reserved = __PF_RES_FIELD
-};
-
-int pfault_init(void)
-{
- int rc;
-
- if (pfault_disable)
- return -1;
- diag_stat_inc(DIAG_STAT_X258);
- asm volatile(
- " diag %1,%0,0x258\n"
- "0: j 2f\n"
- "1: la %0,8\n"
- "2:\n"
- EX_TABLE(0b,1b)
- : "=d" (rc)
- : "a" (&pfault_init_refbk), "m" (pfault_init_refbk) : "cc");
- return rc;
-}
-
-static struct pfault_refbk pfault_fini_refbk = {
- .refdiagc = 0x258,
- .reffcode = 1,
- .refdwlen = 5,
- .refversn = 2,
-};
-
-void pfault_fini(void)
-{
-
- if (pfault_disable)
- return;
- diag_stat_inc(DIAG_STAT_X258);
- asm volatile(
- " diag %0,0,0x258\n"
- "0: nopr %%r7\n"
- EX_TABLE(0b,0b)
- : : "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) : "cc");
-}
-
-static DEFINE_SPINLOCK(pfault_lock);
-static LIST_HEAD(pfault_list);
-
-#define PF_COMPLETE 0x0080
-
-/*
- * The mechanism of our pfault code: if Linux is running as guest, runs a user
- * space process and the user space process accesses a page that the host has
- * paged out we get a pfault interrupt.
- *
- * This allows us, within the guest, to schedule a different process. Without
- * this mechanism the host would have to suspend the whole virtual cpu until
- * the page has been paged in.
- *
- * So when we get such an interrupt then we set the state of the current task
- * to uninterruptible and also set the need_resched flag. Both happens within
- * interrupt context(!). If we later on want to return to user space we
- * recognize the need_resched flag and then call schedule(). It's not very
- * obvious how this works...
- *
- * Of course we have a lot of additional fun with the completion interrupt (->
- * host signals that a page of a process has been paged in and the process can
- * continue to run). This interrupt can arrive on any cpu and, since we have
- * virtual cpus, actually appear before the interrupt that signals that a page
- * is missing.
- */
-static void pfault_interrupt(struct ext_code ext_code,
- unsigned int param32, unsigned long param64)
-{
- struct task_struct *tsk;
- __u16 subcode;
- pid_t pid;
-
- /*
- * Get the external interruption subcode & pfault initial/completion
- * signal bit. VM stores this in the 'cpu address' field associated
- * with the external interrupt.
- */
- subcode = ext_code.subcode;
- if ((subcode & 0xff00) != __SUBCODE_MASK)
- return;
- inc_irq_stat(IRQEXT_PFL);
- /* Get the token (= pid of the affected task). */
- pid = param64 & LPP_PID_MASK;
- rcu_read_lock();
- tsk = find_task_by_pid_ns(pid, &init_pid_ns);
- if (tsk)
- get_task_struct(tsk);
- rcu_read_unlock();
- if (!tsk)
- return;
- spin_lock(&pfault_lock);
- if (subcode & PF_COMPLETE) {
- /* signal bit is set -> a page has been swapped in by VM */
- if (tsk->thread.pfault_wait == 1) {
- /* Initial interrupt was faster than the completion
- * interrupt. pfault_wait is valid. Set pfault_wait
- * back to zero and wake up the process. This can
- * safely be done because the task is still sleeping
- * and can't produce new pfaults. */
- tsk->thread.pfault_wait = 0;
- list_del(&tsk->thread.list);
- wake_up_process(tsk);
- put_task_struct(tsk);
- } else {
- /* Completion interrupt was faster than initial
- * interrupt. Set pfault_wait to -1 so the initial
- * interrupt doesn't put the task to sleep.
- * If the task is not running, ignore the completion
- * interrupt since it must be a leftover of a PFAULT
- * CANCEL operation which didn't remove all pending
- * completion interrupts. */
- if (task_is_running(tsk))
- tsk->thread.pfault_wait = -1;
- }
- } else {
- /* signal bit not set -> a real page is missing. */
- if (WARN_ON_ONCE(tsk != current))
- goto out;
- if (tsk->thread.pfault_wait == 1) {
- /* Already on the list with a reference: put to sleep */
- goto block;
- } else if (tsk->thread.pfault_wait == -1) {
- /* Completion interrupt was faster than the initial
- * interrupt (pfault_wait == -1). Set pfault_wait
- * back to zero and exit. */
- tsk->thread.pfault_wait = 0;
- } else {
- /* Initial interrupt arrived before completion
- * interrupt. Let the task sleep.
- * An extra task reference is needed since a different
- * cpu may set the task state to TASK_RUNNING again
- * before the scheduler is reached. */
- get_task_struct(tsk);
- tsk->thread.pfault_wait = 1;
- list_add(&tsk->thread.list, &pfault_list);
-block:
- /* Since this must be a userspace fault, there
- * is no kernel task state to trample. Rely on the
- * return to userspace schedule() to block. */
- __set_current_state(TASK_UNINTERRUPTIBLE);
- set_tsk_need_resched(tsk);
- set_preempt_need_resched();
- }
- }
-out:
- spin_unlock(&pfault_lock);
- put_task_struct(tsk);
-}
-
-static int pfault_cpu_dead(unsigned int cpu)
-{
- struct thread_struct *thread, *next;
- struct task_struct *tsk;
-
- spin_lock_irq(&pfault_lock);
- list_for_each_entry_safe(thread, next, &pfault_list, list) {
- thread->pfault_wait = 0;
- list_del(&thread->list);
- tsk = container_of(thread, struct task_struct, thread);
- wake_up_process(tsk);
- put_task_struct(tsk);
- }
- spin_unlock_irq(&pfault_lock);
- return 0;
-}
-
-static int __init pfault_irq_init(void)
-{
- int rc;
-
- rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
- if (rc)
- goto out_extint;
- rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
- if (rc)
- goto out_pfault;
- irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL);
- cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead",
- NULL, pfault_cpu_dead);
- return 0;
-
-out_pfault:
- unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
-out_extint:
- pfault_disable = 1;
- return rc;
-}
-early_initcall(pfault_irq_init);
-
-#endif /* CONFIG_PFAULT */
-
#if IS_ENABLED(CONFIG_PGSTE)
void do_secure_storage_access(struct pt_regs *regs)
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 989ebd0912b4..906a7bfc2a78 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2514,6 +2514,7 @@ static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
static const struct mm_walk_ops thp_split_walk_ops = {
.pmd_entry = thp_split_walk_pmd_entry,
+ .walk_lock = PGWALK_WRLOCK_VERIFY,
};
static inline void thp_split_mm(struct mm_struct *mm)
@@ -2565,6 +2566,7 @@ static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
static const struct mm_walk_ops zap_zero_walk_ops = {
.pmd_entry = __zap_zero_pages,
+ .walk_lock = PGWALK_WRLOCK,
};
/*
@@ -2655,6 +2657,7 @@ static const struct mm_walk_ops enable_skey_walk_ops = {
.hugetlb_entry = __s390_enable_skey_hugetlb,
.pte_entry = __s390_enable_skey_pte,
.pmd_entry = __s390_enable_skey_pmd,
+ .walk_lock = PGWALK_WRLOCK,
};
int s390_enable_skey(void)
@@ -2692,6 +2695,7 @@ static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
static const struct mm_walk_ops reset_cmma_walk_ops = {
.pte_entry = __s390_reset_cmma,
+ .walk_lock = PGWALK_WRLOCK,
};
void s390_reset_cmma(struct mm_struct *mm)
@@ -2728,6 +2732,7 @@ static int s390_gather_pages(pte_t *ptep, unsigned long addr,
static const struct mm_walk_ops gather_pages_ops = {
.pte_entry = s390_gather_pages,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
@@ -2853,6 +2858,7 @@ int s390_replace_asce(struct gmap *gmap)
page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
if (!page)
return -ENOMEM;
+ page->index = 0;
table = page_to_virt(page);
memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c
index cbe1df1e9c18..c805b3e2592b 100644
--- a/arch/s390/mm/maccess.c
+++ b/arch/s390/mm/maccess.c
@@ -86,11 +86,12 @@ size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count)
void *chunk;
pte_t pte;
+ BUILD_BUG_ON(MEMCPY_REAL_SIZE != PAGE_SIZE);
while (count) {
- phys = src & PAGE_MASK;
- offset = src & ~PAGE_MASK;
+ phys = src & MEMCPY_REAL_MASK;
+ offset = src & ~MEMCPY_REAL_MASK;
chunk = (void *)(__memcpy_real_area + offset);
- len = min(count, PAGE_SIZE - offset);
+ len = min(count, MEMCPY_REAL_SIZE - offset);
pte = mk_pte_phys(phys, PAGE_KERNEL_RO);
mutex_lock(&memcpy_real_mutex);
diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c
new file mode 100644
index 000000000000..1aac13bb8f53
--- /dev/null
+++ b/arch/s390/mm/pfault.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corp. 1999, 2023
+ */
+
+#include <linux/cpuhotplug.h>
+#include <linux/sched/task.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <asm/asm-extable.h>
+#include <asm/pfault.h>
+#include <asm/diag.h>
+
+#define __SUBCODE_MASK 0x0600
+#define __PF_RES_FIELD 0x8000000000000000UL
+
+/*
+ * 'pfault' pseudo page faults routines.
+ */
+static int pfault_disable;
+
+static int __init nopfault(char *str)
+{
+ pfault_disable = 1;
+ return 1;
+}
+early_param("nopfault", nopfault);
+
+struct pfault_refbk {
+ u16 refdiagc;
+ u16 reffcode;
+ u16 refdwlen;
+ u16 refversn;
+ u64 refgaddr;
+ u64 refselmk;
+ u64 refcmpmk;
+ u64 reserved;
+};
+
+static struct pfault_refbk pfault_init_refbk = {
+ .refdiagc = 0x258,
+ .reffcode = 0,
+ .refdwlen = 5,
+ .refversn = 2,
+ .refgaddr = __LC_LPP,
+ .refselmk = 1UL << 48,
+ .refcmpmk = 1UL << 48,
+ .reserved = __PF_RES_FIELD
+};
+
+int __pfault_init(void)
+{
+ int rc = -EOPNOTSUPP;
+
+ if (pfault_disable)
+ return rc;
+ diag_stat_inc(DIAG_STAT_X258);
+ asm volatile(
+ " diag %[refbk],%[rc],0x258\n"
+ "0: nopr %%r7\n"
+ EX_TABLE(0b, 0b)
+ : [rc] "+d" (rc)
+ : [refbk] "a" (&pfault_init_refbk), "m" (pfault_init_refbk)
+ : "cc");
+ return rc;
+}
+
+static struct pfault_refbk pfault_fini_refbk = {
+ .refdiagc = 0x258,
+ .reffcode = 1,
+ .refdwlen = 5,
+ .refversn = 2,
+};
+
+void __pfault_fini(void)
+{
+ if (pfault_disable)
+ return;
+ diag_stat_inc(DIAG_STAT_X258);
+ asm volatile(
+ " diag %[refbk],0,0x258\n"
+ "0: nopr %%r7\n"
+ EX_TABLE(0b, 0b)
+ :
+ : [refbk] "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk)
+ : "cc");
+}
+
+static DEFINE_SPINLOCK(pfault_lock);
+static LIST_HEAD(pfault_list);
+
+#define PF_COMPLETE 0x0080
+
+/*
+ * The mechanism of our pfault code: if Linux is running as guest, runs a user
+ * space process and the user space process accesses a page that the host has
+ * paged out we get a pfault interrupt.
+ *
+ * This allows us, within the guest, to schedule a different process. Without
+ * this mechanism the host would have to suspend the whole virtual cpu until
+ * the page has been paged in.
+ *
+ * So when we get such an interrupt then we set the state of the current task
+ * to uninterruptible and also set the need_resched flag. Both happens within
+ * interrupt context(!). If we later on want to return to user space we
+ * recognize the need_resched flag and then call schedule(). It's not very
+ * obvious how this works...
+ *
+ * Of course we have a lot of additional fun with the completion interrupt (->
+ * host signals that a page of a process has been paged in and the process can
+ * continue to run). This interrupt can arrive on any cpu and, since we have
+ * virtual cpus, actually appear before the interrupt that signals that a page
+ * is missing.
+ */
+static void pfault_interrupt(struct ext_code ext_code,
+ unsigned int param32, unsigned long param64)
+{
+ struct task_struct *tsk;
+ __u16 subcode;
+ pid_t pid;
+
+ /*
+ * Get the external interruption subcode & pfault initial/completion
+ * signal bit. VM stores this in the 'cpu address' field associated
+ * with the external interrupt.
+ */
+ subcode = ext_code.subcode;
+ if ((subcode & 0xff00) != __SUBCODE_MASK)
+ return;
+ inc_irq_stat(IRQEXT_PFL);
+ /* Get the token (= pid of the affected task). */
+ pid = param64 & LPP_PID_MASK;
+ rcu_read_lock();
+ tsk = find_task_by_pid_ns(pid, &init_pid_ns);
+ if (tsk)
+ get_task_struct(tsk);
+ rcu_read_unlock();
+ if (!tsk)
+ return;
+ spin_lock(&pfault_lock);
+ if (subcode & PF_COMPLETE) {
+ /* signal bit is set -> a page has been swapped in by VM */
+ if (tsk->thread.pfault_wait == 1) {
+ /*
+ * Initial interrupt was faster than the completion
+ * interrupt. pfault_wait is valid. Set pfault_wait
+ * back to zero and wake up the process. This can
+ * safely be done because the task is still sleeping
+ * and can't produce new pfaults.
+ */
+ tsk->thread.pfault_wait = 0;
+ list_del(&tsk->thread.list);
+ wake_up_process(tsk);
+ put_task_struct(tsk);
+ } else {
+ /*
+ * Completion interrupt was faster than initial
+ * interrupt. Set pfault_wait to -1 so the initial
+ * interrupt doesn't put the task to sleep.
+ * If the task is not running, ignore the completion
+ * interrupt since it must be a leftover of a PFAULT
+ * CANCEL operation which didn't remove all pending
+ * completion interrupts.
+ */
+ if (task_is_running(tsk))
+ tsk->thread.pfault_wait = -1;
+ }
+ } else {
+ /* signal bit not set -> a real page is missing. */
+ if (WARN_ON_ONCE(tsk != current))
+ goto out;
+ if (tsk->thread.pfault_wait == 1) {
+ /* Already on the list with a reference: put to sleep */
+ goto block;
+ } else if (tsk->thread.pfault_wait == -1) {
+ /*
+ * Completion interrupt was faster than the initial
+ * interrupt (pfault_wait == -1). Set pfault_wait
+ * back to zero and exit.
+ */
+ tsk->thread.pfault_wait = 0;
+ } else {
+ /*
+ * Initial interrupt arrived before completion
+ * interrupt. Let the task sleep.
+ * An extra task reference is needed since a different
+ * cpu may set the task state to TASK_RUNNING again
+ * before the scheduler is reached.
+ */
+ get_task_struct(tsk);
+ tsk->thread.pfault_wait = 1;
+ list_add(&tsk->thread.list, &pfault_list);
+block:
+ /*
+ * Since this must be a userspace fault, there
+ * is no kernel task state to trample. Rely on the
+ * return to userspace schedule() to block.
+ */
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ set_tsk_need_resched(tsk);
+ set_preempt_need_resched();
+ }
+ }
+out:
+ spin_unlock(&pfault_lock);
+ put_task_struct(tsk);
+}
+
+static int pfault_cpu_dead(unsigned int cpu)
+{
+ struct thread_struct *thread, *next;
+ struct task_struct *tsk;
+
+ spin_lock_irq(&pfault_lock);
+ list_for_each_entry_safe(thread, next, &pfault_list, list) {
+ thread->pfault_wait = 0;
+ list_del(&thread->list);
+ tsk = container_of(thread, struct task_struct, thread);
+ wake_up_process(tsk);
+ put_task_struct(tsk);
+ }
+ spin_unlock_irq(&pfault_lock);
+ return 0;
+}
+
+static int __init pfault_irq_init(void)
+{
+ int rc;
+
+ rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
+ if (rc)
+ goto out_extint;
+ rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
+ if (rc)
+ goto out_pfault;
+ irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL);
+ cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead",
+ NULL, pfault_cpu_dead);
+ return 0;
+
+out_pfault:
+ unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
+out_extint:
+ pfault_disable = 1;
+ return rc;
+}
+early_initcall(pfault_irq_init);
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 66ab68db9842..07fc660a24aa 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -43,17 +43,17 @@ __initcall(page_table_register_sysctl);
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
- struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER);
- if (!page)
+ if (!ptdesc)
return NULL;
- arch_set_page_dat(page, CRST_ALLOC_ORDER);
- return (unsigned long *) page_to_virt(page);
+ arch_set_page_dat(ptdesc_page(ptdesc), CRST_ALLOC_ORDER);
+ return (unsigned long *) ptdesc_to_virt(ptdesc);
}
void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
- free_pages((unsigned long)table, CRST_ALLOC_ORDER);
+ pagetable_free(virt_to_ptdesc(table));
}
static void __crst_table_upgrade(void *arg)
@@ -140,21 +140,21 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
struct page *page_table_alloc_pgste(struct mm_struct *mm)
{
- struct page *page;
+ struct ptdesc *ptdesc;
u64 *table;
- page = alloc_page(GFP_KERNEL);
- if (page) {
- table = (u64 *)page_to_virt(page);
+ ptdesc = pagetable_alloc(GFP_KERNEL, 0);
+ if (ptdesc) {
+ table = (u64 *)ptdesc_to_virt(ptdesc);
memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
}
- return page;
+ return ptdesc_page(ptdesc);
}
void page_table_free_pgste(struct page *page)
{
- __free_page(page);
+ pagetable_free(page_ptdesc(page));
}
#endif /* CONFIG_PGSTE */
@@ -229,11 +229,20 @@ void page_table_free_pgste(struct page *page)
* logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
* while the PP bits are never used, nor such a page is added to or removed
* from mm_context_t::pgtable_list.
+ *
+ * pte_free_defer() overrides those rules: it takes the page off pgtable_list,
+ * and prevents both 2K fragments from being reused. pte_free_defer() has to
+ * guarantee that its pgtable cannot be reused before the RCU grace period
+ * has elapsed (which page_table_free_rcu() does not actually guarantee).
+ * But for simplicity, because page->rcu_head overlays page->lru, and because
+ * the RCU callback might not be called before the mm_context_t has been freed,
+ * pte_free_defer() in this implementation prevents both fragments from being
+ * reused, and delays making the call to RCU until both fragments are freed.
*/
unsigned long *page_table_alloc(struct mm_struct *mm)
{
unsigned long *table;
- struct page *page;
+ struct ptdesc *ptdesc;
unsigned int mask, bit;
/* Try to get a fragment of a 4K page as a 2K page table */
@@ -241,9 +250,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
table = NULL;
spin_lock_bh(&mm->context.lock);
if (!list_empty(&mm->context.pgtable_list)) {
- page = list_first_entry(&mm->context.pgtable_list,
- struct page, lru);
- mask = atomic_read(&page->_refcount) >> 24;
+ ptdesc = list_first_entry(&mm->context.pgtable_list,
+ struct ptdesc, pt_list);
+ mask = atomic_read(&ptdesc->_refcount) >> 24;
/*
* The pending removal bits must also be checked.
* Failure to do so might lead to an impossible
@@ -255,13 +264,13 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
*/
mask = (mask | (mask >> 4)) & 0x03U;
if (mask != 0x03U) {
- table = (unsigned long *) page_to_virt(page);
+ table = (unsigned long *) ptdesc_to_virt(ptdesc);
bit = mask & 1; /* =1 -> second 2K */
if (bit)
table += PTRS_PER_PTE;
- atomic_xor_bits(&page->_refcount,
+ atomic_xor_bits(&ptdesc->_refcount,
0x01U << (bit + 24));
- list_del(&page->lru);
+ list_del_init(&ptdesc->pt_list);
}
}
spin_unlock_bh(&mm->context.lock);
@@ -269,27 +278,28 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
return table;
}
/* Allocate a fresh page */
- page = alloc_page(GFP_KERNEL);
- if (!page)
+ ptdesc = pagetable_alloc(GFP_KERNEL, 0);
+ if (!ptdesc)
return NULL;
- if (!pgtable_pte_page_ctor(page)) {
- __free_page(page);
+ if (!pagetable_pte_ctor(ptdesc)) {
+ pagetable_free(ptdesc);
return NULL;
}
- arch_set_page_dat(page, 0);
+ arch_set_page_dat(ptdesc_page(ptdesc), 0);
/* Initialize page table */
- table = (unsigned long *) page_to_virt(page);
+ table = (unsigned long *) ptdesc_to_virt(ptdesc);
if (mm_alloc_pgste(mm)) {
/* Return 4K page table with PGSTEs */
- atomic_xor_bits(&page->_refcount, 0x03U << 24);
+ INIT_LIST_HEAD(&ptdesc->pt_list);
+ atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
} else {
/* Return the first 2K fragment of the page */
- atomic_xor_bits(&page->_refcount, 0x01U << 24);
+ atomic_xor_bits(&ptdesc->_refcount, 0x01U << 24);
memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
spin_lock_bh(&mm->context.lock);
- list_add(&page->lru, &mm->context.pgtable_list);
+ list_add(&ptdesc->pt_list, &mm->context.pgtable_list);
spin_unlock_bh(&mm->context.lock);
}
return table;
@@ -300,7 +310,9 @@ static void page_table_release_check(struct page *page, void *table,
{
char msg[128];
- if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask)
+ if (!IS_ENABLED(CONFIG_DEBUG_VM))
+ return;
+ if (!mask && list_empty(&page->lru))
return;
snprintf(msg, sizeof(msg),
"Invalid pgtable %p release half 0x%02x mask 0x%02x",
@@ -308,12 +320,20 @@ static void page_table_release_check(struct page *page, void *table,
dump_page(page, msg);
}
+static void pte_free_now(struct rcu_head *head)
+{
+ struct ptdesc *ptdesc;
+
+ ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
+ pagetable_pte_dtor(ptdesc);
+ pagetable_free(ptdesc);
+}
+
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
unsigned int mask, bit, half;
- struct page *page;
+ struct ptdesc *ptdesc = virt_to_ptdesc(table);
- page = virt_to_page(table);
if (!mm_alloc_pgste(mm)) {
/* Free 2K page table fragment of a 4K page */
bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
@@ -323,42 +343,50 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
* will happen outside of the critical section from this
* function or from __tlb_remove_table()
*/
- mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
+ mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24));
mask >>= 24;
- if (mask & 0x03U)
- list_add(&page->lru, &mm->context.pgtable_list);
- else
- list_del(&page->lru);
+ if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) {
+ /*
+ * Other half is allocated, and neither half has had
+ * its free deferred: add page to head of list, to make
+ * this freed half available for immediate reuse.
+ */
+ list_add(&ptdesc->pt_list, &mm->context.pgtable_list);
+ } else {
+ /* If page is on list, now remove it. */
+ list_del_init(&ptdesc->pt_list);
+ }
spin_unlock_bh(&mm->context.lock);
- mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
+ mask = atomic_xor_bits(&ptdesc->_refcount, 0x10U << (bit + 24));
mask >>= 24;
if (mask != 0x00U)
return;
half = 0x01U << bit;
} else {
half = 0x03U;
- mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
+ mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
mask >>= 24;
}
- page_table_release_check(page, table, half, mask);
- pgtable_pte_page_dtor(page);
- __free_page(page);
+ page_table_release_check(ptdesc_page(ptdesc), table, half, mask);
+ if (folio_test_clear_active(ptdesc_folio(ptdesc)))
+ call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
+ else
+ pte_free_now(&ptdesc->pt_rcu_head);
}
void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
unsigned long vmaddr)
{
struct mm_struct *mm;
- struct page *page;
unsigned int bit, mask;
+ struct ptdesc *ptdesc = virt_to_ptdesc(table);
mm = tlb->mm;
- page = virt_to_page(table);
if (mm_alloc_pgste(mm)) {
gmap_unlink(mm, table, vmaddr);
table = (unsigned long *) ((unsigned long)table | 0x03U);
- tlb_remove_table(tlb, table);
+ tlb_remove_ptdesc(tlb, table);
return;
}
bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
@@ -368,12 +396,20 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
* outside of the critical section from __tlb_remove_table() or from
* page_table_free()
*/
- mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
+ mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24));
mask >>= 24;
- if (mask & 0x03U)
- list_add_tail(&page->lru, &mm->context.pgtable_list);
- else
- list_del(&page->lru);
+ if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) {
+ /*
+ * Other half is allocated, and neither half has had
+ * its free deferred: add page to end of list, to make
+ * this freed half available for reuse once its pending
+ * bit has been cleared by __tlb_remove_table().
+ */
+ list_add_tail(&ptdesc->pt_list, &mm->context.pgtable_list);
+ } else {
+ /* If page is on list, now remove it. */
+ list_del_init(&ptdesc->pt_list);
+ }
spin_unlock_bh(&mm->context.lock);
table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
tlb_remove_table(tlb, table);
@@ -383,30 +419,48 @@ void __tlb_remove_table(void *_table)
{
unsigned int mask = (unsigned long) _table & 0x03U, half = mask;
void *table = (void *)((unsigned long) _table ^ mask);
- struct page *page = virt_to_page(table);
+ struct ptdesc *ptdesc = virt_to_ptdesc(table);
switch (half) {
case 0x00U: /* pmd, pud, or p4d */
- free_pages((unsigned long)table, CRST_ALLOC_ORDER);
+ pagetable_free(ptdesc);
return;
case 0x01U: /* lower 2K of a 4K page table */
case 0x02U: /* higher 2K of a 4K page table */
- mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
+ mask = atomic_xor_bits(&ptdesc->_refcount, mask << (4 + 24));
mask >>= 24;
if (mask != 0x00U)
return;
break;
case 0x03U: /* 4K page table with pgstes */
- mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
+ mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
mask >>= 24;
break;
}
- page_table_release_check(page, table, half, mask);
- pgtable_pte_page_dtor(page);
- __free_page(page);
+ page_table_release_check(ptdesc_page(ptdesc), table, half, mask);
+ if (folio_test_clear_active(ptdesc_folio(ptdesc)))
+ call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
+ else
+ pte_free_now(&ptdesc->pt_rcu_head);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
+{
+ struct page *page;
+
+ page = virt_to_page(pgtable);
+ SetPageActive(page);
+ page_table_free(mm, (unsigned long *)pgtable);
+ /*
+ * page_table_free() does not do the pgste gmap_unlink() which
+ * page_table_free_rcu() does: warn us if pgste ever reaches here.
+ */
+ WARN_ON_ONCE(mm_has_pgste(mm));
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
/*
* Base infrastructure required to generate basic asces, region, segment,
* and page tables that do not make use of enhanced features like EDAT1.
@@ -432,16 +486,20 @@ static void base_pgt_free(unsigned long *table)
static unsigned long *base_crst_alloc(unsigned long val)
{
unsigned long *table;
+ struct ptdesc *ptdesc;
- table = (unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
- if (table)
- crst_table_init(table, val);
+ ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, CRST_ALLOC_ORDER);
+ if (!ptdesc)
+ return NULL;
+ table = ptdesc_address(ptdesc);
+
+ crst_table_init(table, val);
return table;
}
static void base_crst_free(unsigned long *table)
{
- free_pages((unsigned long)table, CRST_ALLOC_ORDER);
+ pagetable_free(virt_to_ptdesc(table));
}
#define BASE_ADDR_END_FUNC(NAME, SIZE) \
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index b26649233d12..e44243b9c0a4 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -36,7 +36,7 @@ static void vmem_free_pages(unsigned long addr, int order)
{
/* We don't expect boot memory to be removed ever. */
if (!slab_is_available() ||
- WARN_ON_ONCE(PageReserved(virt_to_page(addr))))
+ WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr))))
return;
free_pages(addr, order);
}
@@ -531,7 +531,7 @@ struct range arch_get_mappable_range(void)
struct range mhp_range;
mhp_range.start = 0;
- mhp_range.end = VMEM_MAX_PHYS - 1;
+ mhp_range.end = max_mappable - 1;
return mhp_range;
}
@@ -763,6 +763,8 @@ void __init vmem_map_init(void)
if (static_key_enabled(&cpu_has_bear))
set_memory_nx(0, 1);
set_memory_nx(PAGE_SIZE, 1);
+ if (debug_pagealloc_enabled())
+ set_memory_4k(0, ident_map_size >> PAGE_SHIFT);
pr_info("Write protected kernel read-only data: %luk\n",
(unsigned long)(__end_rodata - _stext) >> 10);