summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/s390/Kbuild1
-rw-r--r--arch/s390/Kconfig37
-rw-r--r--arch/s390/include/asm/mmzone.h16
-rw-r--r--arch/s390/include/asm/numa.h31
-rw-r--r--arch/s390/include/asm/pci.h16
-rw-r--r--arch/s390/include/asm/topology.h39
-rw-r--r--arch/s390/include/asm/unistd.h12
-rw-r--r--arch/s390/include/uapi/asm/unistd.h10
-rw-r--r--arch/s390/kernel/setup.c2
-rw-r--r--arch/s390/kernel/syscalls.S10
-rw-r--r--arch/s390/kernel/topology.c21
-rw-r--r--arch/s390/mm/init.c2
-rw-r--r--arch/s390/numa/Makefile1
-rw-r--r--arch/s390/numa/numa.c180
-rw-r--r--arch/s390/numa/numa_mode.h23
15 files changed, 375 insertions, 26 deletions
diff --git a/arch/s390/Kbuild b/arch/s390/Kbuild
index 2938934c6518..e256592eb66e 100644
--- a/arch/s390/Kbuild
+++ b/arch/s390/Kbuild
@@ -6,3 +6,4 @@ obj-$(CONFIG_S390_HYPFS_FS) += hypfs/
obj-$(CONFIG_APPLDATA_BASE) += appldata/
obj-y += net/
obj-$(CONFIG_PCI) += pci/
+obj-$(CONFIG_NUMA) += numa/
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 91e8954f1237..25510adb07d3 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -153,6 +153,10 @@ config S390
select TTY
select VIRT_CPU_ACCOUNTING
select VIRT_TO_BUS
+ select ARCH_SUPPORTS_NUMA_BALANCING
+ select ARCH_WANTS_PROT_NUMA_PROT_NONE
+ select HAVE_ARCH_EARLY_PFN_TO_NID
+
config SCHED_OMIT_FRAME_POINTER
def_bool y
@@ -386,6 +390,39 @@ config HOTPLUG_CPU
config SCHED_SMT
def_bool n
+# Some NUMA nodes have memory ranges that span
+# other nodes. Even though a pfn is valid and
+# between a node's start and end pfns, it may not
+# reside on that node. See memmap_init_zone()
+# for details. <- They meant memory holes!
+config NODES_SPAN_OTHER_NODES
+ def_bool NUMA
+
+config NUMA
+ bool "NUMA support"
+ depends on SMP && 64BIT && SCHED_TOPOLOGY
+ default n
+ help
+ Enable NUMA support
+
+ This option adds NUMA support to the kernel.
+
+ An operation mode can be selected by appending
+ numa=<method> to the kernel command line.
+
+ The default behaviour is identical to appending numa=plain to
+ the command line. This will create just one node with all
+ available memory and all CPUs in it.
+
+config NODES_SHIFT
+ int "Maximum NUMA nodes (as a power of 2)"
+ range 1 10
+ depends on NUMA
+ default "4"
+ help
+ Specify the maximum number of NUMA nodes available on the target
+ system. Increases memory reserved to accommodate various tables.
+
config SCHED_MC
def_bool n
diff --git a/arch/s390/include/asm/mmzone.h b/arch/s390/include/asm/mmzone.h
new file mode 100644
index 000000000000..a9e834e60b84
--- /dev/null
+++ b/arch/s390/include/asm/mmzone.h
@@ -0,0 +1,16 @@
+/*
+ * NUMA support for s390
+ *
+ * Copyright IBM Corp. 2015
+ */
+
+#ifndef _ASM_S390_MMZONE_H
+#define _ASM_S390_MMZONE_H
+
+#ifdef CONFIG_NUMA
+
+extern struct pglist_data *node_data[];
+#define NODE_DATA(nid) (node_data[nid])
+
+#endif /* CONFIG_NUMA */
+#endif /* _ASM_S390_MMZONE_H */
diff --git a/arch/s390/include/asm/numa.h b/arch/s390/include/asm/numa.h
new file mode 100644
index 000000000000..ea4edbfba9f6
--- /dev/null
+++ b/arch/s390/include/asm/numa.h
@@ -0,0 +1,31 @@
+/*
+ * NUMA support for s390
+ *
+ * Declare the NUMA core code structures and functions.
+ *
+ * Copyright IBM Corp. 2015
+ */
+
+#ifndef _ASM_S390_NUMA_H
+#define _ASM_S390_NUMA_H
+
+#ifdef CONFIG_NUMA
+
+#include <linux/numa.h>
+#include <linux/cpumask.h>
+
+void numa_setup(void);
+int numa_pfn_to_nid(unsigned long pfn);
+int __node_distance(int a, int b);
+void numa_update_cpu_topology(void);
+
+extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
+extern int numa_debug_enabled;
+
+#else
+
+static inline void numa_setup(void) { }
+static inline void numa_update_cpu_topology(void) { }
+
+#endif /* CONFIG_NUMA */
+#endif /* _ASM_S390_NUMA_H */
diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
index 9b6545e8f685..34d960353a08 100644
--- a/arch/s390/include/asm/pci.h
+++ b/arch/s390/include/asm/pci.h
@@ -192,4 +192,20 @@ void zpci_debug_init_device(struct zpci_dev *);
void zpci_debug_exit_device(struct zpci_dev *);
void zpci_debug_info(struct zpci_dev *, struct seq_file *);
+#ifdef CONFIG_NUMA
+
+/* Returns the node based on PCI bus */
+static inline int __pcibus_to_node(const struct pci_bus *bus)
+{
+ return NUMA_NO_NODE;
+}
+
+static inline const struct cpumask *
+cpumask_of_pcibus(const struct pci_bus *bus)
+{
+ return cpu_online_mask;
+}
+
+#endif /* CONFIG_NUMA */
+
#endif
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h
index 4990f6c66288..27ebde643933 100644
--- a/arch/s390/include/asm/topology.h
+++ b/arch/s390/include/asm/topology.h
@@ -2,6 +2,7 @@
#define _ASM_S390_TOPOLOGY_H
#include <linux/cpumask.h>
+#include <asm/numa.h>
struct sysinfo_15_1_x;
struct cpu;
@@ -13,6 +14,7 @@ struct cpu_topology_s390 {
unsigned short core_id;
unsigned short socket_id;
unsigned short book_id;
+ unsigned short node_id;
cpumask_t thread_mask;
cpumask_t core_mask;
cpumask_t book_mask;
@@ -52,6 +54,43 @@ static inline void topology_expect_change(void) { }
#define POLARIZATION_VM (2)
#define POLARIZATION_VH (3)
+#define SD_BOOK_INIT SD_CPU_INIT
+
+#ifdef CONFIG_NUMA
+
+#define cpu_to_node cpu_to_node
+static inline int cpu_to_node(int cpu)
+{
+ return per_cpu(cpu_topology, cpu).node_id;
+}
+
+/* Returns a pointer to the cpumask of CPUs on node 'node'. */
+#define cpumask_of_node cpumask_of_node
+static inline const struct cpumask *cpumask_of_node(int node)
+{
+ return node_to_cpumask_map[node];
+}
+
+/*
+ * Returns the number of the node containing node 'node'. This
+ * architecture is flat, so it is a pretty simple function!
+ */
+#define parent_node(node) (node)
+
+#define pcibus_to_node(bus) __pcibus_to_node(bus)
+
+#define node_distance(a, b) __node_distance(a, b)
+
+#else /* !CONFIG_NUMA */
+
+#define numa_node_id numa_node_id
+static inline int numa_node_id(void)
+{
+ return 0;
+}
+
+#endif /* CONFIG_NUMA */
+
#include <asm-generic/topology.h>
#endif /* _ASM_S390_TOPOLOGY_H */
diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h
index 91f56b1d8156..ec2bfc83a1e9 100644
--- a/arch/s390/include/asm/unistd.h
+++ b/arch/s390/include/asm/unistd.h
@@ -11,12 +11,12 @@
#define __IGNORE_time
-/* Ignore NUMA system calls. Not wired up on s390. */
-#define __IGNORE_mbind
-#define __IGNORE_get_mempolicy
-#define __IGNORE_set_mempolicy
-#define __IGNORE_migrate_pages
-#define __IGNORE_move_pages
+/* NUMA system calls */
+#define _ARCH_WANT_mbind
+#define __ARCH_WANT_get_mempolicy
+#define __ARCH_WANT_set_mempolicy
+#define __ARCH_WANT_migrate_pages
+#define __ARCH_WANT_move_pages
/* Ignore system calls that are also reachable via sys_socket */
#define __IGNORE_recvmmsg
diff --git a/arch/s390/include/uapi/asm/unistd.h b/arch/s390/include/uapi/asm/unistd.h
index 67878af257a0..59d2bb4e2d0c 100644
--- a/arch/s390/include/uapi/asm/unistd.h
+++ b/arch/s390/include/uapi/asm/unistd.h
@@ -204,9 +204,9 @@
#define __NR_statfs64 265
#define __NR_fstatfs64 266
#define __NR_remap_file_pages 267
-/* Number 268 is reserved for new sys_mbind */
-/* Number 269 is reserved for new sys_get_mempolicy */
-/* Number 270 is reserved for new sys_set_mempolicy */
+#define __NR_mbind 268
+#define __NR_get_mempolicy 269
+#define __NR_set_mempolicy 270
#define __NR_mq_open 271
#define __NR_mq_unlink 272
#define __NR_mq_timedsend 273
@@ -223,7 +223,7 @@
#define __NR_inotify_init 284
#define __NR_inotify_add_watch 285
#define __NR_inotify_rm_watch 286
-/* Number 287 is reserved for new sys_migrate_pages */
+#define __NR_migrate_pages 287
#define __NR_openat 288
#define __NR_mkdirat 289
#define __NR_mknodat 290
@@ -245,7 +245,7 @@
#define __NR_sync_file_range 307
#define __NR_tee 308
#define __NR_vmsplice 309
-/* Number 310 is reserved for new sys_move_pages */
+#define __NR_move_pages 310
#define __NR_getcpu 311
#define __NR_epoll_pwait 312
#define __NR_utimes 313
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 85a1d4770c9c..af6b0236ccf3 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -62,6 +62,7 @@
#include <asm/os_info.h>
#include <asm/sclp.h>
#include <asm/sysinfo.h>
+#include <asm/numa.h>
#include "entry.h"
/*
@@ -879,6 +880,7 @@ void __init setup_arch(char **cmdline_p)
setup_lowcore();
smp_fill_possible_mask();
cpu_init();
+ numa_setup();
/*
* Setup capabilities (ELF_HWCAP & ELF_PLATFORM).
diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S
index 1acad02681c4..f3f4a137aef6 100644
--- a/arch/s390/kernel/syscalls.S
+++ b/arch/s390/kernel/syscalls.S
@@ -276,9 +276,9 @@ SYSCALL(sys_ni_syscall,compat_sys_s390_fadvise64_64)
SYSCALL(sys_statfs64,compat_sys_statfs64)
SYSCALL(sys_fstatfs64,compat_sys_fstatfs64)
SYSCALL(sys_remap_file_pages,compat_sys_remap_file_pages)
-NI_SYSCALL /* 268 sys_mbind */
-NI_SYSCALL /* 269 sys_get_mempolicy */
-NI_SYSCALL /* 270 sys_set_mempolicy */
+SYSCALL(sys_mbind,compat_sys_mbind)
+SYSCALL(sys_get_mempolicy,compat_sys_get_mempolicy)
+SYSCALL(sys_set_mempolicy,compat_sys_set_mempolicy)
SYSCALL(sys_mq_open,compat_sys_mq_open)
SYSCALL(sys_mq_unlink,compat_sys_mq_unlink)
SYSCALL(sys_mq_timedsend,compat_sys_mq_timedsend)
@@ -295,7 +295,7 @@ SYSCALL(sys_ioprio_get,compat_sys_ioprio_get)
SYSCALL(sys_inotify_init,sys_inotify_init)
SYSCALL(sys_inotify_add_watch,compat_sys_inotify_add_watch) /* 285 */
SYSCALL(sys_inotify_rm_watch,compat_sys_inotify_rm_watch)
-NI_SYSCALL /* 287 sys_migrate_pages */
+SYSCALL(sys_migrate_pages,compat_sys_migrate_pages)
SYSCALL(sys_openat,compat_sys_openat)
SYSCALL(sys_mkdirat,compat_sys_mkdirat)
SYSCALL(sys_mknodat,compat_sys_mknodat) /* 290 */
@@ -318,7 +318,7 @@ SYSCALL(sys_splice,compat_sys_splice)
SYSCALL(sys_sync_file_range,compat_sys_s390_sync_file_range)
SYSCALL(sys_tee,compat_sys_tee)
SYSCALL(sys_vmsplice,compat_sys_vmsplice)
-NI_SYSCALL /* 310 sys_move_pages */
+SYSCALL(sys_move_pages,compat_sys_move_pages)
SYSCALL(sys_getcpu,compat_sys_getcpu)
SYSCALL(sys_epoll_pwait,compat_sys_epoll_pwait)
SYSCALL(sys_utimes,compat_sys_utimes)
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 5728c5bd44a8..0f5f8b09c903 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -18,7 +18,10 @@
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/mm.h>
+#include <linux/nodemask.h>
+#include <linux/node.h>
#include <asm/sysinfo.h>
+#include <asm/numa.h>
#define PTF_HORIZONTAL (0UL)
#define PTF_VERTICAL (1UL)
@@ -260,6 +263,7 @@ static void update_cpu_masks(void)
}
}
spin_unlock_irqrestore(&topology_lock, flags);
+ numa_update_cpu_topology();
}
void store_topology(struct sysinfo_15_1_x *info)
@@ -274,21 +278,21 @@ int arch_update_cpu_topology(void)
{
struct sysinfo_15_1_x *info = tl_info;
struct device *dev;
- int cpu;
+ int cpu, rc = 0;
- if (!MACHINE_HAS_TOPOLOGY) {
- update_cpu_masks();
- topology_update_polarization_simple();
- return 0;
+ if (MACHINE_HAS_TOPOLOGY) {
+ rc = 1;
+ store_topology(info);
+ tl_to_masks(info);
}
- store_topology(info);
- tl_to_masks(info);
update_cpu_masks();
+ if (!MACHINE_HAS_TOPOLOGY)
+ topology_update_polarization_simple();
for_each_online_cpu(cpu) {
dev = get_cpu_device(cpu);
kobject_uevent(&dev->kobj, KOBJ_CHANGE);
}
- return 1;
+ return rc;
}
static void topology_work_fn(struct work_struct *work)
@@ -450,7 +454,6 @@ static struct sched_domain_topology_level s390_topology[] = {
{ cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
{ cpu_book_mask, SD_INIT_NAME(BOOK) },
- { cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
};
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index dc4db08286e9..2963b563621c 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -139,7 +139,7 @@ void __init mem_init(void)
cpumask_set_cpu(0, mm_cpumask(&init_mm));
atomic_set(&init_mm.context.attach_count, 1);
- max_mapnr = max_low_pfn;
+ set_max_mapnr(max_low_pfn);
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
/* Setup guest page hinting */
diff --git a/arch/s390/numa/Makefile b/arch/s390/numa/Makefile
new file mode 100644
index 000000000000..7e94c8f491f7
--- /dev/null
+++ b/arch/s390/numa/Makefile
@@ -0,0 +1 @@
+obj-y += numa.o
diff --git a/arch/s390/numa/numa.c b/arch/s390/numa/numa.c
new file mode 100644
index 000000000000..0416a3671e33
--- /dev/null
+++ b/arch/s390/numa/numa.c
@@ -0,0 +1,180 @@
+/*
+ * NUMA support for s390
+ *
+ * Implement NUMA core code.
+ *
+ * Copyright IBM Corp. 2015
+ */
+
+#define KMSG_COMPONENT "numa"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/mmzone.h>
+#include <linux/cpumask.h>
+#include <linux/bootmem.h>
+#include <linux/memblock.h>
+#include <linux/slab.h>
+#include <linux/node.h>
+
+#include <asm/numa.h>
+#include "numa_mode.h"
+
+pg_data_t *node_data[MAX_NUMNODES];
+EXPORT_SYMBOL(node_data);
+
+cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
+EXPORT_SYMBOL(node_to_cpumask_map);
+
+const struct numa_mode numa_mode_plain = {
+ .name = "plain",
+};
+
+static const struct numa_mode *mode = &numa_mode_plain;
+
+int numa_pfn_to_nid(unsigned long pfn)
+{
+ return mode->__pfn_to_nid ? mode->__pfn_to_nid(pfn) : 0;
+}
+
+void numa_update_cpu_topology(void)
+{
+ if (mode->update_cpu_topology)
+ mode->update_cpu_topology();
+}
+
+int __node_distance(int a, int b)
+{
+ return mode->distance ? mode->distance(a, b) : 0;
+}
+
+int numa_debug_enabled;
+
+/*
+ * alloc_node_data() - Allocate node data
+ */
+static __init pg_data_t *alloc_node_data(void)
+{
+ pg_data_t *res;
+
+ res = (pg_data_t *) memblock_alloc(sizeof(pg_data_t), 1);
+ if (!res)
+ panic("Could not allocate memory for node data!\n");
+ memset(res, 0, sizeof(pg_data_t));
+ return res;
+}
+
+/*
+ * numa_setup_memory() - Assign bootmem to nodes
+ *
+ * The memory is first added to memblock without any respect to nodes.
+ * This is fixed before remaining memblock memory is handed over to the
+ * buddy allocator.
+ * An important side effect is that large bootmem allocations might easily
+ * cross node boundaries, which can be needed for large allocations with
+ * smaller memory stripes in each node (i.e. when using NUMA emulation).
+ *
+ * Memory defines nodes:
+ * Therefore this routine also sets the nodes online with memory.
+ */
+static void __init numa_setup_memory(void)
+{
+ unsigned long cur_base, align, end_of_dram;
+ int nid = 0;
+
+ end_of_dram = memblock_end_of_DRAM();
+ align = mode->align ? mode->align() : ULONG_MAX;
+
+ /*
+ * Step through all available memory and assign it to the nodes
+ * indicated by the mode implementation.
+ * All nodes which are seen here will be set online.
+ */
+ cur_base = 0;
+ do {
+ nid = numa_pfn_to_nid(PFN_DOWN(cur_base));
+ node_set_online(nid);
+ memblock_set_node(cur_base, align, &memblock.memory, nid);
+ cur_base += align;
+ } while (cur_base < end_of_dram);
+
+ /* Allocate and fill out node_data */
+ for (nid = 0; nid < MAX_NUMNODES; nid++)
+ NODE_DATA(nid) = alloc_node_data();
+
+ for_each_online_node(nid) {
+ unsigned long start_pfn, end_pfn;
+ unsigned long t_start, t_end;
+ int i;
+
+ start_pfn = ULONG_MAX;
+ end_pfn = 0;
+ for_each_mem_pfn_range(i, nid, &t_start, &t_end, NULL) {
+ if (t_start < start_pfn)
+ start_pfn = t_start;
+ if (t_end > end_pfn)
+ end_pfn = t_end;
+ }
+ NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
+ NODE_DATA(nid)->node_id = nid;
+ }
+}
+
+/*
+ * numa_setup() - Earliest initialization
+ *
+ * Assign the mode and call the mode's setup routine.
+ */
+void __init numa_setup(void)
+{
+ pr_info("NUMA mode: %s\n", mode->name);
+ if (mode->setup)
+ mode->setup();
+ numa_setup_memory();
+ memblock_dump_all();
+}
+
+
+/*
+ * numa_init_early() - Initialization initcall
+ *
+ * This runs when only one CPU is online and before the first
+ * topology update is called for by the scheduler.
+ */
+static int __init numa_init_early(void)
+{
+ /* Attach all possible CPUs to node 0 for now. */
+ cpumask_copy(node_to_cpumask_map[0], cpu_possible_mask);
+ return 0;
+}
+early_initcall(numa_init_early);
+
+/*
+ * numa_init_late() - Initialization initcall
+ *
+ * Register NUMA nodes.
+ */
+static int __init numa_init_late(void)
+{
+ int nid;
+
+ for_each_online_node(nid)
+ register_one_node(nid);
+ return 0;
+}
+device_initcall(numa_init_late);
+
+static int __init parse_debug(char *parm)
+{
+ numa_debug_enabled = 1;
+ return 0;
+}
+early_param("numa_debug", parse_debug);
+
+static int __init parse_numa(char *parm)
+{
+ if (strcmp(parm, numa_mode_plain.name) == 0)
+ mode = &numa_mode_plain;
+ return 0;
+}
+early_param("numa", parse_numa);
diff --git a/arch/s390/numa/numa_mode.h b/arch/s390/numa/numa_mode.h
new file mode 100644
index 000000000000..775659848011
--- /dev/null
+++ b/arch/s390/numa/numa_mode.h
@@ -0,0 +1,23 @@
+/*
+ * NUMA support for s390
+ *
+ * Define declarations used for communication between NUMA mode
+ * implementations and NUMA core functionality.
+ *
+ * Copyright IBM Corp. 2015
+ */
+#ifndef __S390_NUMA_MODE_H
+#define __S390_NUMA_MODE_H
+
+struct numa_mode {
+ char *name; /* Name of mode */
+ void (*setup)(void); /* Initizalize mode */
+ void (*update_cpu_topology)(void); /* Called by topology code */
+ int (*__pfn_to_nid)(unsigned long pfn); /* PFN to node ID */
+ unsigned long (*align)(void); /* Minimum node alignment */
+ int (*distance)(int a, int b); /* Distance between two nodes */
+};
+
+extern const struct numa_mode numa_mode_plain;
+
+#endif /* __S390_NUMA_MODE_H */