From 403a91b1659cb149dbddc5885f892734ae4542d8 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 29 Oct 2009 00:25:59 +0900 Subject: percpu: allow pcpu_alloc() to be called with IRQs off pcpu_alloc() and pcpu_extend_area_map() perform a series of spin_lock_irq()/spin_unlock_irq() calls, which make them unsafe with respect to being called from contexts which have IRQs off. This patch converts the code to perform save/restore of flags instead, making pcpu_alloc() (or __alloc_percpu() respectively) to be called from early kernel startup stage, where IRQs are off. This is needed for proper initialization of per-cpu rq_weight data from sched_init(). tj: added comment explaining why irqsave/restore is used in alloc path. Signed-off-by: Jiri Kosina Acked-by: Ingo Molnar Signed-off-by: Tejun Heo --- mm/percpu.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu.c b/mm/percpu.c index 6af78c1ee704..d90797160c2a 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -153,7 +153,10 @@ static int pcpu_reserved_chunk_limit; * * During allocation, pcpu_alloc_mutex is kept locked all the time and * pcpu_lock is grabbed and released as necessary. All actual memory - * allocations are done using GFP_KERNEL with pcpu_lock released. + * allocations are done using GFP_KERNEL with pcpu_lock released. In + * general, percpu memory can't be allocated with irq off but + * irqsave/restore are still used in alloc path so that it can be used + * from early init path - sched_init() specifically. * * Free path accesses and alters only the index data structures, so it * can be safely called from atomic context. When memory needs to be @@ -366,7 +369,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * RETURNS: * 0 if noop, 1 if successfully extended, -errno on failure. */ -static int pcpu_extend_area_map(struct pcpu_chunk *chunk) +static int pcpu_extend_area_map(struct pcpu_chunk *chunk, unsigned long *flags) { int new_alloc; int *new; @@ -376,7 +379,7 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk) if (chunk->map_alloc >= chunk->map_used + 2) return 0; - spin_unlock_irq(&pcpu_lock); + spin_unlock_irqrestore(&pcpu_lock, *flags); new_alloc = PCPU_DFL_MAP_ALLOC; while (new_alloc < chunk->map_used + 2) @@ -384,7 +387,7 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk) new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); if (!new) { - spin_lock_irq(&pcpu_lock); + spin_lock_irqsave(&pcpu_lock, *flags); return -ENOMEM; } @@ -393,7 +396,7 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk) * could have happened inbetween, so map_used couldn't have * grown. */ - spin_lock_irq(&pcpu_lock); + spin_lock_irqsave(&pcpu_lock, *flags); BUG_ON(new_alloc < chunk->map_used + 2); size = chunk->map_alloc * sizeof(chunk->map[0]); @@ -1047,6 +1050,7 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) struct pcpu_chunk *chunk; const char *err; int slot, off; + unsigned long flags; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { WARN(true, "illegal size (%zu) or align (%zu) for " @@ -1055,13 +1059,13 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) } mutex_lock(&pcpu_alloc_mutex); - spin_lock_irq(&pcpu_lock); + spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; if (size > chunk->contig_hint || - pcpu_extend_area_map(chunk) < 0) { + pcpu_extend_area_map(chunk, &flags) < 0) { err = "failed to extend area map of reserved chunk"; goto fail_unlock; } @@ -1079,7 +1083,7 @@ restart: if (size > chunk->contig_hint) continue; - switch (pcpu_extend_area_map(chunk)) { + switch (pcpu_extend_area_map(chunk, &flags)) { case 0: break; case 1: @@ -1096,7 +1100,7 @@ restart: } /* hmmm... no space left, create a new chunk */ - spin_unlock_irq(&pcpu_lock); + spin_unlock_irqrestore(&pcpu_lock, flags); chunk = alloc_pcpu_chunk(); if (!chunk) { @@ -1104,16 +1108,16 @@ restart: goto fail_unlock_mutex; } - spin_lock_irq(&pcpu_lock); + spin_lock_irqsave(&pcpu_lock, flags); pcpu_chunk_relocate(chunk, -1); goto restart; area_found: - spin_unlock_irq(&pcpu_lock); + spin_unlock_irqrestore(&pcpu_lock, flags); /* populate, map and clear the area */ if (pcpu_populate_chunk(chunk, off, size)) { - spin_lock_irq(&pcpu_lock); + spin_lock_irqsave(&pcpu_lock, flags); pcpu_free_area(chunk, off); err = "failed to populate"; goto fail_unlock; @@ -1125,7 +1129,7 @@ area_found: return __addr_to_pcpu_ptr(chunk->base_addr + off); fail_unlock: - spin_unlock_irq(&pcpu_lock); + spin_unlock_irqrestore(&pcpu_lock, flags); fail_unlock_mutex: mutex_unlock(&pcpu_alloc_mutex); if (warn_limit) { -- cgit v1.2.3 From 833af8427be4b217b5bc522f61afdbd3f1d282c2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Nov 2009 15:35:18 +0900 Subject: percpu: restructure pcpu_extend_area_map() to fix bugs and improve readability pcpu_extend_area_map() had the following two bugs. * It should return 1 if pcpu_lock was dropped and reacquired but it returned 0. This could lead to oops if free_percpu() races with area map extension. * pcpu_mem_free() was called under pcpu_lock. pcpu_mem_free() might end up calling vfree() which isn't IRQ safe. This could lead to deadlock through lock order inversion via IRQ. In addition, Linus pointed out that the temporary lock dropping and subtle three-way return value of pcpu_extend_area_map() was very ugly and suggested to split the function into two - pcpu_need_to_extend() and pcpu_extend_area_map(). This patch restructures pcpu_extend_area_map() as suggested and fixes the two bugs. Signed-off-by: Tejun Heo Acked-by: Linus Torvalds Cc: Ingo Molnar --- mm/percpu.c | 121 ++++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 81 insertions(+), 40 deletions(-) (limited to 'mm/percpu.c') diff --git a/mm/percpu.c b/mm/percpu.c index d90797160c2a..5adfc268b408 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -355,62 +355,86 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) } /** - * pcpu_extend_area_map - extend area map for allocation - * @chunk: target chunk + * pcpu_need_to_extend - determine whether chunk area map needs to be extended + * @chunk: chunk of interest * - * Extend area map of @chunk so that it can accomodate an allocation. - * A single allocation can split an area into three areas, so this - * function makes sure that @chunk->map has at least two extra slots. + * Determine whether area map of @chunk needs to be extended to + * accomodate a new allocation. * * CONTEXT: - * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired - * if area map is extended. + * pcpu_lock. * * RETURNS: - * 0 if noop, 1 if successfully extended, -errno on failure. + * New target map allocation length if extension is necessary, 0 + * otherwise. */ -static int pcpu_extend_area_map(struct pcpu_chunk *chunk, unsigned long *flags) +static int pcpu_need_to_extend(struct pcpu_chunk *chunk) { int new_alloc; - int *new; - size_t size; - /* has enough? */ if (chunk->map_alloc >= chunk->map_used + 2) return 0; - spin_unlock_irqrestore(&pcpu_lock, *flags); - new_alloc = PCPU_DFL_MAP_ALLOC; while (new_alloc < chunk->map_used + 2) new_alloc *= 2; - new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); - if (!new) { - spin_lock_irqsave(&pcpu_lock, *flags); + return new_alloc; +} + +/** + * pcpu_extend_area_map - extend area map of a chunk + * @chunk: chunk of interest + * @new_alloc: new target allocation length of the area map + * + * Extend area map of @chunk to have @new_alloc entries. + * + * CONTEXT: + * Does GFP_KERNEL allocation. Grabs and releases pcpu_lock. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) +{ + int *old = NULL, *new = NULL; + size_t old_size = 0, new_size = new_alloc * sizeof(new[0]); + unsigned long flags; + + new = pcpu_mem_alloc(new_size); + if (!new) return -ENOMEM; - } - /* - * Acquire pcpu_lock and switch to new area map. Only free - * could have happened inbetween, so map_used couldn't have - * grown. - */ - spin_lock_irqsave(&pcpu_lock, *flags); - BUG_ON(new_alloc < chunk->map_used + 2); + /* acquire pcpu_lock and switch to new area map */ + spin_lock_irqsave(&pcpu_lock, flags); + + if (new_alloc <= chunk->map_alloc) + goto out_unlock; - size = chunk->map_alloc * sizeof(chunk->map[0]); - memcpy(new, chunk->map, size); + old_size = chunk->map_alloc * sizeof(chunk->map[0]); + memcpy(new, chunk->map, old_size); /* * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is * one of the first chunks and still using static map. */ if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) - pcpu_mem_free(chunk->map, size); + old = chunk->map; chunk->map_alloc = new_alloc; chunk->map = new; + new = NULL; + +out_unlock: + spin_unlock_irqrestore(&pcpu_lock, flags); + + /* + * pcpu_mem_free() might end up calling vfree() which uses + * IRQ-unsafe lock and thus can't be called under pcpu_lock. + */ + pcpu_mem_free(old, old_size); + pcpu_mem_free(new, new_size); + return 0; } @@ -1049,7 +1073,7 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) static int warn_limit = 10; struct pcpu_chunk *chunk; const char *err; - int slot, off; + int slot, off, new_alloc; unsigned long flags; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { @@ -1064,14 +1088,25 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) /* serve reserved allocations from the reserved chunk if available */ if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; - if (size > chunk->contig_hint || - pcpu_extend_area_map(chunk, &flags) < 0) { - err = "failed to extend area map of reserved chunk"; + + if (size > chunk->contig_hint) { + err = "alloc from reserved chunk failed"; goto fail_unlock; } + + while ((new_alloc = pcpu_need_to_extend(chunk))) { + spin_unlock_irqrestore(&pcpu_lock, flags); + if (pcpu_extend_area_map(chunk, new_alloc) < 0) { + err = "failed to extend area map of reserved chunk"; + goto fail_unlock_mutex; + } + spin_lock_irqsave(&pcpu_lock, flags); + } + off = pcpu_alloc_area(chunk, size, align); if (off >= 0) goto area_found; + err = "alloc from reserved chunk failed"; goto fail_unlock; } @@ -1083,14 +1118,20 @@ restart: if (size > chunk->contig_hint) continue; - switch (pcpu_extend_area_map(chunk, &flags)) { - case 0: - break; - case 1: - goto restart; /* pcpu_lock dropped, restart */ - default: - err = "failed to extend area map"; - goto fail_unlock; + new_alloc = pcpu_need_to_extend(chunk); + if (new_alloc) { + spin_unlock_irqrestore(&pcpu_lock, flags); + if (pcpu_extend_area_map(chunk, + new_alloc) < 0) { + err = "failed to extend area map"; + goto fail_unlock_mutex; + } + spin_lock_irqsave(&pcpu_lock, flags); + /* + * pcpu_lock has been dropped, need to + * restart cpu_slot list walking. + */ + goto restart; } off = pcpu_alloc_area(chunk, size, align); -- cgit v1.2.3 From 3b034b0d084221596bf35c8d893e1d4d5477b9cc Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 24 Nov 2009 15:50:03 +0900 Subject: percpu: Fix kdump failure if booted with percpu_alloc=page o kdump functionality reserves a per cpu area at boot time and exports the physical address of that area to user space through sys interface. This area stores some dump related information like cpu register states etc at the time of crash. o We were assuming that per cpu area always come from linearly mapped meory region and using __pa() to determine physical address. With percpu_alloc=page, per cpu area can come from vmalloc region also and __pa() breaks. o This patch implments a new function to convert per cpu address to physical address. Before the patch, crash_notes addresses looked as follows. cpu0 60fffff49800 cpu1 60fffff60800 cpu2 60fffff77800 These are bogus phsyical addresses. After the patch, address are following. cpu0 13eb44000 cpu1 13eb43000 cpu2 13eb42000 cpu3 13eb41000 These look fine. I got 4G of memory and /proc/iomem tell me following. 100000000-13fffffff : System RAM tj: * added missing asm/io.h include reported by Stephen Rothwell * repositioned per_cpu_ptr_phys() in percpu.c and added comment. Signed-off-by: Vivek Goyal Signed-off-by: Tejun Heo Cc: Stephen Rothwell --- drivers/base/cpu.c | 2 +- include/linux/percpu.h | 1 + mm/percpu.c | 22 ++++++++++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) (limited to 'mm/percpu.c') diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index e62a4ccea54d..69ee5b7517ec 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -97,7 +97,7 @@ static ssize_t show_crash_notes(struct sys_device *dev, struct sysdev_attribute * boot up and this data does not change there after. Hence this * operation should be safe. No locking required. */ - addr = __pa(per_cpu_ptr(crash_notes, cpunum)); + addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpunum)); rc = sprintf(buf, "%Lx\n", addr); return rc; } diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 878836ca999c..6ac984fa34f8 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -154,6 +154,7 @@ struct percpu_data { extern void *__alloc_percpu(size_t size, size_t align); extern void free_percpu(void *__pdata); +extern phys_addr_t per_cpu_ptr_to_phys(void *addr); #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA extern void __init setup_per_cpu_areas(void); diff --git a/mm/percpu.c b/mm/percpu.c index 5adfc268b408..008fbd9e6fa4 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -74,6 +74,7 @@ #include #include #include +#include #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ @@ -1302,6 +1303,27 @@ void free_percpu(void *ptr) } EXPORT_SYMBOL_GPL(free_percpu); +/** + * per_cpu_ptr_to_phys - convert translated percpu address to physical address + * @addr: the address to be converted to physical address + * + * Given @addr which is dereferenceable address obtained via one of + * percpu access macros, this function translates it into its physical + * address. The caller is responsible for ensuring @addr stays valid + * until this function finishes. + * + * RETURNS: + * The physical address for @addr. + */ +phys_addr_t per_cpu_ptr_to_phys(void *addr) +{ + if ((unsigned long)addr < VMALLOC_START || + (unsigned long)addr >= VMALLOC_END) + return __pa(addr); + else + return page_to_phys(vmalloc_to_page(addr)); +} + static inline size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, ssize_t *dyn_sizep) -- cgit v1.2.3