diff options
39 files changed, 879 insertions, 444 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3201e93ebd07..ac1c4de3a484 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4549,7 +4549,7 @@ int kvm_mmu_module_init(void) if (!mmu_page_header_cache) goto nomem; - if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) + if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) goto nomem; register_shrinker(&mmu_shrinker); diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index ed5217867555..371d8800b48a 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -402,6 +402,12 @@ static void blk_mq_sysfs_init(struct request_queue *q) } } +/* see blk_register_queue() */ +void blk_mq_finish_init(struct request_queue *q) +{ + percpu_ref_switch_to_percpu(&q->mq_usage_counter); +} + int blk_mq_register_disk(struct gendisk *disk) { struct device *dev = disk_to_dev(disk); diff --git a/block/blk-mq.c b/block/blk-mq.c index df8e1e09dd17..38f4a165640d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -119,16 +119,7 @@ void blk_mq_freeze_queue(struct request_queue *q) spin_unlock_irq(q->queue_lock); if (freeze) { - /* - * XXX: Temporary kludge to work around SCSI blk-mq stall. - * SCSI synchronously creates and destroys many queues - * back-to-back during probe leading to lengthy stalls. - * This will be fixed by keeping ->mq_usage_counter in - * atomic mode until genhd registration, but, for now, - * let's work around using expedited synchronization. - */ - __percpu_ref_kill_expedited(&q->mq_usage_counter); - + percpu_ref_kill(&q->mq_usage_counter); blk_mq_run_queues(q, false); } wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); @@ -1804,7 +1795,12 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!q) goto err_hctxs; - if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release)) + /* + * Init percpu_ref in atomic mode so that it's faster to shutdown. + * See blk_register_queue() for details. + */ + if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, + PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) goto err_map; setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 17f5c84ce7bf..521ae9089c50 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -551,12 +551,19 @@ int blk_register_queue(struct gendisk *disk) return -ENXIO; /* - * Initialization must be complete by now. Finish the initial - * bypass from queue allocation. + * SCSI probing may synchronously create and destroy a lot of + * request_queues for non-existent devices. Shutting down a fully + * functional queue takes measureable wallclock time as RCU grace + * periods are involved. To avoid excessive latency in these + * cases, a request_queue starts out in a degraded mode which is + * faster to shut down and is made fully functional here as + * request_queues for non-existent devices never get registered. */ if (!blk_queue_init_done(q)) { queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); blk_queue_bypass_end(q); + if (q->mq_ops) + blk_mq_finish_init(q); } ret = blk_trace_init_sysfs(dev); diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c index fddfae61222f..be783f717f19 100644 --- a/drivers/target/target_core_tpg.c +++ b/drivers/target/target_core_tpg.c @@ -819,7 +819,8 @@ int core_tpg_add_lun( { int ret; - ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release); + ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release, 0, + GFP_KERNEL); if (ret < 0) return ret; @@ -661,10 +661,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) INIT_LIST_HEAD(&ctx->active_reqs); - if (percpu_ref_init(&ctx->users, free_ioctx_users)) + if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL)) goto err; - if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL)) goto err; ctx->cpu = alloc_percpu(struct kioctx_cpu); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a1d36e62179c..d0d78dc07792 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1183,7 +1183,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) if (!writers) return ERR_PTR(-ENOMEM); - ret = percpu_counter_init(&writers->counter, 0); + ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL); if (ret < 0) { kfree(writers); return ERR_PTR(ret); @@ -2188,7 +2188,7 @@ int open_ctree(struct super_block *sb, goto fail_srcu; } - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_bdi; @@ -2196,13 +2196,13 @@ int open_ctree(struct super_block *sb, fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * (1 + ilog2(nr_cpu_ids)); - ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); + ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_dirty_metadata_bytes; } - ret = percpu_counter_init(&fs_info->bio_counter, 0); + ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_delalloc_bytes; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3efe1c3877bf..caaf015d6e4b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3494,7 +3494,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (!found) return -ENOMEM; - ret = percpu_counter_init(&found->total_bytes_pinned, 0); + ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); if (ret) { kfree(found); return ret; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index b88edc05c230..170dc41e8bf4 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1067,14 +1067,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ext2_rsv_window_add(sb, &sbi->s_rsv_window_head); err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext2_count_free_blocks(sb)); + ext2_count_free_blocks(sb), GFP_KERNEL); if (!err) { err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext2_count_free_inodes(sb)); + ext2_count_free_inodes(sb), GFP_KERNEL); } if (!err) { err = percpu_counter_init(&sbi->s_dirs_counter, - ext2_count_dirs(sb)); + ext2_count_dirs(sb), GFP_KERNEL); } if (err) { ext2_msg(sb, KERN_ERR, "error: insufficient memory"); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 622e88249024..bb0fdacad058 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2039,14 +2039,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount2; } err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext3_count_free_blocks(sb)); + ext3_count_free_blocks(sb), GFP_KERNEL); if (!err) { err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext3_count_free_inodes(sb)); + ext3_count_free_inodes(sb), GFP_KERNEL); } if (!err) { err = percpu_counter_init(&sbi->s_dirs_counter, - ext3_count_dirs(sb)); + ext3_count_dirs(sb), GFP_KERNEL); } if (err) { ext3_msg(sb, KERN_ERR, "error: insufficient memory"); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0b28b36e7915..05c159218bc2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3892,7 +3892,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Register extent status tree shrinker */ ext4_es_register_shrinker(sbi); - if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) { + err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL); + if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount3; } @@ -4106,17 +4107,20 @@ no_journal: block = ext4_count_free_clusters(sb); ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); - err = percpu_counter_init(&sbi->s_freeclusters_counter, block); + err = percpu_counter_init(&sbi->s_freeclusters_counter, block, + GFP_KERNEL); if (!err) { unsigned long freei = ext4_count_free_inodes(sb); sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); - err = percpu_counter_init(&sbi->s_freeinodes_counter, freei); + err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, + GFP_KERNEL); } if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); + ext4_count_dirs(sb), GFP_KERNEL); if (!err) - err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, + GFP_KERNEL); if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount6; diff --git a/fs/file_table.c b/fs/file_table.c index 385bfd31512a..0bab12b20460 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -331,5 +331,5 @@ void __init files_init(unsigned long mempages) n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); - percpu_counter_init(&nr_files, 0); + percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index f2d0eee9d1f1..8b663b2d9562 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2725,7 +2725,7 @@ static int __init dquot_init(void) panic("Cannot create dquot hash table"); for (i = 0; i < _DQST_DQSTAT_LAST; i++) { - ret = percpu_counter_init(&dqstats.counter[i], 0); + ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL); if (ret) panic("Cannot create dquot stat counters"); } diff --git a/fs/super.c b/fs/super.c index b9a214d2fe98..1b836107acee 100644 --- a/fs/super.c +++ b/fs/super.c @@ -175,7 +175,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { - if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0) + if (percpu_counter_init(&s->s_writers.counter[i], 0, + GFP_KERNEL) < 0) goto fail; lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], &type->s_writers_key[i], 0); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a1e31f274fcd..c13a0c09faea 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -140,6 +140,7 @@ enum { }; struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); +void blk_mq_finish_init(struct request_queue *q); int blk_mq_register_disk(struct gendisk *); void blk_mq_unregister_disk(struct gendisk *); diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h index 4ebc49fae391..0d348e011a6e 100644 --- a/include/linux/flex_proportions.h +++ b/include/linux/flex_proportions.h @@ -10,6 +10,7 @@ #include <linux/percpu_counter.h> #include <linux/spinlock.h> #include <linux/seqlock.h> +#include <linux/gfp.h> /* * When maximum proportion of some event type is specified, this is the @@ -32,7 +33,7 @@ struct fprop_global { seqcount_t sequence; }; -int fprop_global_init(struct fprop_global *p); +int fprop_global_init(struct fprop_global *p, gfp_t gfp); void fprop_global_destroy(struct fprop_global *p); bool fprop_new_period(struct fprop_global *p, int periods); @@ -79,7 +80,7 @@ struct fprop_local_percpu { raw_spinlock_t lock; /* Protect period and numerator */ }; -int fprop_local_init_percpu(struct fprop_local_percpu *pl); +int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp); void fprop_local_destroy_percpu(struct fprop_local_percpu *pl); void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl); void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl, diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 68a64f11ce02..d5c89e0dd0e6 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -13,7 +13,7 @@ * * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less * than an atomic_t - this is because of the way shutdown works, see - * percpu_ref_kill()/PCPU_COUNT_BIAS. + * percpu_ref_kill()/PERCPU_COUNT_BIAS. * * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill() @@ -49,29 +49,60 @@ #include <linux/kernel.h> #include <linux/percpu.h> #include <linux/rcupdate.h> +#include <linux/gfp.h> struct percpu_ref; typedef void (percpu_ref_func_t)(struct percpu_ref *); +/* flags set in the lower bits of percpu_ref->percpu_count_ptr */ +enum { + __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */ + __PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */ + __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD, + + __PERCPU_REF_FLAG_BITS = 2, +}; + +/* @flags for percpu_ref_init() */ +enum { + /* + * Start w/ ref == 1 in atomic mode. Can be switched to percpu + * operation using percpu_ref_switch_to_percpu(). If initialized + * with this flag, the ref will stay in atomic mode until + * percpu_ref_switch_to_percpu() is invoked on it. + */ + PERCPU_REF_INIT_ATOMIC = 1 << 0, + + /* + * Start dead w/ ref == 0 in atomic mode. Must be revived with + * percpu_ref_reinit() before used. Implies INIT_ATOMIC. + */ + PERCPU_REF_INIT_DEAD = 1 << 1, +}; + struct percpu_ref { - atomic_t count; + atomic_long_t count; /* * The low bit of the pointer indicates whether the ref is in percpu * mode; if set, then get/put will manipulate the atomic_t. */ - unsigned long pcpu_count_ptr; + unsigned long percpu_count_ptr; percpu_ref_func_t *release; - percpu_ref_func_t *confirm_kill; + percpu_ref_func_t *confirm_switch; + bool force_atomic:1; struct rcu_head rcu; }; int __must_check percpu_ref_init(struct percpu_ref *ref, - percpu_ref_func_t *release); -void percpu_ref_reinit(struct percpu_ref *ref); + percpu_ref_func_t *release, unsigned int flags, + gfp_t gfp); void percpu_ref_exit(struct percpu_ref *ref); +void percpu_ref_switch_to_atomic(struct percpu_ref *ref, + percpu_ref_func_t *confirm_switch); +void percpu_ref_switch_to_percpu(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); -void __percpu_ref_kill_expedited(struct percpu_ref *ref); +void percpu_ref_reinit(struct percpu_ref *ref); /** * percpu_ref_kill - drop the initial ref @@ -88,26 +119,24 @@ static inline void percpu_ref_kill(struct percpu_ref *ref) return percpu_ref_kill_and_confirm(ref, NULL); } -#define PCPU_REF_DEAD 1 - /* * Internal helper. Don't use outside percpu-refcount proper. The * function doesn't return the pointer and let the caller test it for NULL * because doing so forces the compiler to generate two conditional - * branches as it can't assume that @ref->pcpu_count is not NULL. + * branches as it can't assume that @ref->percpu_count is not NULL. */ -static inline bool __pcpu_ref_alive(struct percpu_ref *ref, - unsigned __percpu **pcpu_countp) +static inline bool __ref_is_percpu(struct percpu_ref *ref, + unsigned long __percpu **percpu_countp) { - unsigned long pcpu_ptr = ACCESS_ONCE(ref->pcpu_count_ptr); + unsigned long percpu_ptr = ACCESS_ONCE(ref->percpu_count_ptr); /* paired with smp_store_release() in percpu_ref_reinit() */ smp_read_barrier_depends(); - if (unlikely(pcpu_ptr & PCPU_REF_DEAD)) + if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC)) return false; - *pcpu_countp = (unsigned __percpu *)pcpu_ptr; + *percpu_countp = (unsigned long __percpu *)percpu_ptr; return true; } @@ -115,18 +144,20 @@ static inline bool __pcpu_ref_alive(struct percpu_ref *ref, * percpu_ref_get - increment a percpu refcount * @ref: percpu_ref to get * - * Analagous to atomic_inc(). - */ + * Analagous to atomic_long_inc(). + * + * This function is safe to call as long as @ref is between init and exit. + */ static inline void percpu_ref_get(struct percpu_ref *ref) { - unsigned __percpu *pcpu_count; + unsigned long __percpu *percpu_count; rcu_read_lock_sched(); - if (__pcpu_ref_alive(ref, &pcpu_count)) - this_cpu_inc(*pcpu_count); + if (__ref_is_percpu(ref, &percpu_count)) + this_cpu_inc(*percpu_count); else - atomic_inc(&ref->count); + atomic_long_inc(&ref->count); rcu_read_unlock_sched(); } @@ -138,20 +169,20 @@ static inline void percpu_ref_get(struct percpu_ref *ref) * Increment a percpu refcount unless its count already reached zero. * Returns %true on success; %false on failure. * - * The caller is responsible for ensuring that @ref stays accessible. + * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget(struct percpu_ref *ref) { - unsigned __percpu *pcpu_count; - int ret = false; + unsigned long __percpu *percpu_count; + int ret; rcu_read_lock_sched(); - if (__pcpu_ref_alive(ref, &pcpu_count)) { - this_cpu_inc(*pcpu_count); + if (__ref_is_percpu(ref, &percpu_count)) { + this_cpu_inc(*percpu_count); ret = true; } else { - ret = atomic_inc_not_zero(&ref->count); + ret = atomic_long_inc_not_zero(&ref->count); } rcu_read_unlock_sched(); @@ -166,23 +197,26 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) * Increment a percpu refcount unless it has already been killed. Returns * %true on success; %false on failure. * - * Completion of percpu_ref_kill() in itself doesn't guarantee that tryget - * will fail. For such guarantee, percpu_ref_kill_and_confirm() should be - * used. After the confirm_kill callback is invoked, it's guaranteed that - * no new reference will be given out by percpu_ref_tryget(). + * Completion of percpu_ref_kill() in itself doesn't guarantee that this + * function will fail. For such guarantee, percpu_ref_kill_and_confirm() + * should be used. After the confirm_kill callback is invoked, it's + * guaranteed that no new reference will be given out by + * percpu_ref_tryget_live(). * - * The caller is responsible for ensuring that @ref stays accessible. + * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) { - unsigned __percpu *pcpu_count; + unsigned long __percpu *percpu_count; int ret = false; rcu_read_lock_sched(); - if (__pcpu_ref_alive(ref, &pcpu_count)) { - this_cpu_inc(*pcpu_count); + if (__ref_is_percpu(ref, &percpu_count)) { + this_cpu_inc(*percpu_count); ret = true; + } else if (!(ACCESS_ONCE(ref->percpu_count_ptr) & __PERCPU_REF_DEAD)) { + ret = atomic_long_inc_not_zero(&ref->count); } rcu_read_unlock_sched(); @@ -196,16 +230,18 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) * * Decrement the refcount, and if 0, call the release function (which was passed * to percpu_ref_init()) + * + * This function is safe to call as long as @ref is between init and exit. */ static inline void percpu_ref_put(struct percpu_ref *ref) { - unsigned __percpu *pcpu_count; + unsigned long __percpu *percpu_count; rcu_read_lock_sched(); - if (__pcpu_ref_alive(ref, &pcpu_count)) - this_cpu_dec(*pcpu_count); - else if (unlikely(atomic_dec_and_test(&ref->count))) + if (__ref_is_percpu(ref, &percpu_count)) + this_cpu_dec(*percpu_count); + else if (unlikely(atomic_long_dec_and_test(&ref->count))) ref->release(ref); rcu_read_unlock_sched(); @@ -216,14 +252,16 @@ static inline void percpu_ref_put(struct percpu_ref *ref) * @ref: percpu_ref to test * * Returns %true if @ref reached zero. + * + * This function is safe to call as long as @ref is between init and exit. */ static inline bool percpu_ref_is_zero(struct percpu_ref *ref) { - unsigned __percpu *pcpu_count; + unsigned long __percpu *percpu_count; - if (__pcpu_ref_alive(ref, &pcpu_count)) + if (__ref_is_percpu(ref, &percpu_count)) return false; - return !atomic_read(&ref->count); + return !atomic_long_read(&ref->count); } #endif diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 6f61b61b7996..a3aa63e47637 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -48,9 +48,9 @@ * intelligent way to determine this would be nice. */ #if BITS_PER_LONG > 32 -#define PERCPU_DYNAMIC_RESERVE (20 << 10) +#define PERCPU_DYNAMIC_RESERVE (28 << 10) #else -#define PERCPU_DYNAMIC_RESERVE (12 << 10) +#define PERCPU_DYNAMIC_RESERVE (20 << 10) #endif extern void *pcpu_base_addr; @@ -122,11 +122,16 @@ extern void __init setup_per_cpu_areas(void); #endif extern void __init percpu_init_late(void); +extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); extern void __percpu *__alloc_percpu(size_t size, size_t align); extern void free_percpu(void __percpu *__pdata); extern phys_addr_t per_cpu_ptr_to_phys(void *addr); -#define alloc_percpu(type) \ - (typeof(type) __percpu *)__alloc_percpu(sizeof(type), __alignof__(type)) +#define alloc_percpu_gfp(type, gfp) \ + (typeof(type) __percpu *)__alloc_percpu_gfp(sizeof(type), \ + __alignof__(type), gfp) +#define alloc_percpu(type) \ + (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ + __alignof__(type)) #endif /* __LINUX_PERCPU_H */ diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index d5dd4657c8d6..50e50095c8d1 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -12,6 +12,7 @@ #include <linux/threads.h> #include <linux/percpu.h> #include <linux/types.h> +#include <linux/gfp.h> #ifdef CONFIG_SMP @@ -26,14 +27,14 @@ struct percpu_counter { extern int percpu_counter_batch; -int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, +int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, struct lock_class_key *key); -#define percpu_counter_init(fbc, value) \ +#define percpu_counter_init(fbc, value, gfp) \ ({ \ static struct lock_class_key __key; \ \ - __percpu_counter_init(fbc, value, &__key); \ + __percpu_counter_init(fbc, value, gfp, &__key); \ }) void percpu_counter_destroy(struct percpu_counter *fbc); @@ -89,7 +90,8 @@ struct percpu_counter { s64 count; }; -static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount) +static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount, + gfp_t gfp) { fbc->count = amount; return 0; diff --git a/include/linux/proportions.h b/include/linux/proportions.h index 26a8a4ed9b07..00e8e8fa7358 100644 --- a/include/linux/proportions.h +++ b/include/linux/proportions.h @@ -12,6 +12,7 @@ #include <linux/percpu_counter.h> #include <linux/spinlock.h> #include <linux/mutex.h> +#include <linux/gfp.h> struct prop_global { /* @@ -40,7 +41,7 @@ struct prop_descriptor { struct mutex mutex; /* serialize the prop_global switch */ }; -int prop_descriptor_init(struct prop_descriptor *pd, int shift); +int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp); void prop_change_shift(struct prop_descriptor *pd, int new_shift); /* @@ -61,7 +62,7 @@ struct prop_local_percpu { raw_spinlock_t lock; /* protect the snapshot state */ }; -int prop_local_init_percpu(struct prop_local_percpu *pl); +int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp); void prop_local_destroy_percpu(struct prop_local_percpu *pl); void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl); void prop_fraction_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl, diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 2f26dfb8450e..1f99a1de0e4f 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -63,7 +63,7 @@ static inline void dst_entries_add(struct dst_ops *dst, int val) static inline int dst_entries_init(struct dst_ops *dst) { - return percpu_counter_init(&dst->pcpuc_entries, 0); + return percpu_counter_init(&dst->pcpuc_entries, 0, GFP_KERNEL); } static inline void dst_entries_destroy(struct dst_ops *dst) diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 65a8855e99fe..8d1765577acc 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -151,7 +151,7 @@ static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i) static inline void init_frag_mem_limit(struct netns_frags *nf) { - percpu_counter_init(&nf->mem, 0); + percpu_counter_init(&nf->mem, 0, GFP_KERNEL); } static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cab7dc4284dc..136eceadeed1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1607,7 +1607,8 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) goto out; root_cgrp->id = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, + GFP_KERNEL); if (ret) goto out; @@ -4482,7 +4483,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, init_and_link_css(css, ss, cgrp); - err = percpu_ref_init(&css->refcnt, css_release); + err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); if (err) goto err_free_css; @@ -4555,7 +4556,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, goto out_unlock; } - ret = percpu_ref_init(&cgrp->self.refcnt, css_release); + ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) goto out_free_cgrp; diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c index ebf3bac460b0..8f25652f40d4 100644 --- a/lib/flex_proportions.c +++ b/lib/flex_proportions.c @@ -34,13 +34,13 @@ */ #include <linux/flex_proportions.h> -int fprop_global_init(struct fprop_global *p) +int fprop_global_init(struct fprop_global *p, gfp_t gfp) { int err; p->period = 0; /* Use 1 to avoid dealing with periods with 0 events... */ - err = percpu_counter_init(&p->events, 1); + err = percpu_counter_init(&p->events, 1, gfp); if (err) return err; seqcount_init(&p->sequence); @@ -168,11 +168,11 @@ void fprop_fraction_single(struct fprop_global *p, */ #define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) -int fprop_local_init_percpu(struct fprop_local_percpu *pl) +int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp) { int err; - err = percpu_counter_init(&pl->events, 0); + err = percpu_counter_init(&pl->events, 0, gfp); if (err) return err; pl->period = 0; diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index a89cf09a8268..6111bcb28376 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -1,6 +1,8 @@ #define pr_fmt(fmt) "%s: " fmt "\n", __func__ #include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/wait.h> #include <linux/percpu-refcount.h> /* @@ -11,8 +13,8 @@ * percpu counters will all sum to the correct value * * (More precisely: because moduler arithmatic is commutative the sum of all the - * pcpu_count vars will be equal to what it would have been if all the gets and - * puts were done to a single integer, even if some of the percpu integers + * percpu_count vars will be equal to what it would have been if all the gets + * and puts were done to a single integer, even if some of the percpu integers * overflow or underflow). * * The real trick to implementing percpu refcounts is shutdown. We can't detect @@ -25,75 +27,64 @@ * works. * * Converting to non percpu mode is done with some RCUish stuff in - * percpu_ref_kill. Additionally, we need a bias value so that the atomic_t - * can't hit 0 before we've added up all the percpu refs. + * percpu_ref_kill. Additionally, we need a bias value so that the + * atomic_long_t can't hit 0 before we've added up all the percpu refs. */ -#define PCPU_COUNT_BIAS (1U << 31) +#define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) -static unsigned __percpu *pcpu_count_ptr(struct percpu_ref *ref) +static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq); + +static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) { - return (unsigned __percpu *)(ref->pcpu_count_ptr & ~PCPU_REF_DEAD); + return (unsigned long __percpu *) + (ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD); } /** * percpu_ref_init - initialize a percpu refcount * @ref: percpu_ref to initialize * @release: function which will be called when refcount hits 0 + * @flags: PERCPU_REF_INIT_* flags + * @gfp: allocation mask to use * - * Initializes the refcount in single atomic counter mode with a refcount of 1; - * analagous to atomic_set(ref, 1). + * Initializes @ref. If @flags is zero, @ref starts in percpu mode with a + * refcount of 1; analagous to atomic_long_set(ref, 1). See the + * definitions of PERCPU_REF_INIT_* flags for flag behaviors. * * Note that @release must not sleep - it may potentially be called from RCU * callback context by percpu_ref_kill(). */ -int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release) +int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, + unsigned int flags, gfp_t gfp) { - atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS); + size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS, + __alignof__(unsigned long)); + unsigned long start_count = 0; - ref->pcpu_count_ptr = (unsigned long)alloc_percpu(unsigned); - if (!ref->pcpu_count_ptr) + ref->percpu_count_ptr = (unsigned long) + __alloc_percpu_gfp(sizeof(unsigned long), align, gfp); + if (!ref->percpu_count_ptr) return -ENOMEM; - ref->release = release; - return 0; -} -EXPORT_SYMBOL_GPL(percpu_ref_init); - -/** - * percpu_ref_reinit - re-initialize a percpu refcount - * @ref: perpcu_ref to re-initialize - * - * Re-initialize @ref so that it's in the same state as when it finished - * percpu_ref_init(). @ref must have been initialized successfully, killed - * and reached 0 but not exited. - * - * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while - * this function is in progress. - */ -void percpu_ref_reinit(struct percpu_ref *ref) -{ - unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); - int cpu; + ref->force_atomic = flags & PERCPU_REF_INIT_ATOMIC; - BUG_ON(!pcpu_count); - WARN_ON(!percpu_ref_is_zero(ref)); + if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) + ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; + else + start_count += PERCPU_COUNT_BIAS; - atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS); + if (flags & PERCPU_REF_INIT_DEAD) + ref->percpu_count_ptr |= __PERCPU_REF_DEAD; + else + start_count++; - /* - * Restore per-cpu operation. smp_store_release() is paired with - * smp_read_barrier_depends() in __pcpu_ref_alive() and guarantees - * that the zeroing is visible to all percpu accesses which can see - * the following PCPU_REF_DEAD clearing. - */ - for_each_possible_cpu(cpu) - *per_cpu_ptr(pcpu_count, cpu) = 0; + atomic_long_set(&ref->count, start_count); - smp_store_release(&ref->pcpu_count_ptr, - ref->pcpu_count_ptr & ~PCPU_REF_DEAD); + ref->release = release; + return 0; } -EXPORT_SYMBOL_GPL(percpu_ref_reinit); +EXPORT_SYMBOL_GPL(percpu_ref_init); /** * percpu_ref_exit - undo percpu_ref_init() @@ -107,26 +98,39 @@ EXPORT_SYMBOL_GPL(percpu_ref_reinit); */ void percpu_ref_exit(struct percpu_ref *ref) { - unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); + unsigned long __percpu *percpu_count = percpu_count_ptr(ref); - if (pcpu_count) { - free_percpu(pcpu_count); - ref->pcpu_count_ptr = PCPU_REF_DEAD; + if (percpu_count) { + free_percpu(percpu_count); + ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD; } } EXPORT_SYMBOL_GPL(percpu_ref_exit); -static void percpu_ref_kill_rcu(struct rcu_head *rcu) +static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu) +{ + struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu); + + ref->confirm_switch(ref); + ref->confirm_switch = NULL; + wake_up_all(&percpu_ref_switch_waitq); + + /* drop ref from percpu_ref_switch_to_atomic() */ + percpu_ref_put(ref); +} + +static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu) { struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu); - unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); - unsigned count = 0; + unsigned long __percpu *percpu_count = percpu_count_ptr(ref); + unsigned long count = 0; int cpu; for_each_possible_cpu(cpu) - count += *per_cpu_ptr(pcpu_count, cpu); + count += *per_cpu_ptr(percpu_count, cpu); - pr_debug("global %i pcpu %i", atomic_read(&ref->count), (int) count); + pr_debug("global %ld percpu %ld", + atomic_long_read(&ref->count), (long)count); /* * It's crucial that we sum the percpu counters _before_ adding the sum @@ -140,21 +144,137 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) * reaching 0 before we add the percpu counts. But doing it at the same * time is equivalent and saves us atomic operations: */ + atomic_long_add((long)count - PERCPU_COUNT_BIAS, &ref->count); + + WARN_ONCE(atomic_long_read(&ref->count) <= 0, + "percpu ref (%pf) <= 0 (%ld) after switching to atomic", + ref->release, atomic_long_read(&ref->count)); + + /* @ref is viewed as dead on all CPUs, send out switch confirmation */ + percpu_ref_call_confirm_rcu(rcu); +} + +static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref) +{ +} + +static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, + percpu_ref_func_t *confirm_switch) +{ + if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) { + /* switching from percpu to atomic */ + ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; + + /* + * Non-NULL ->confirm_switch is used to indicate that + * switching is in progress. Use noop one if unspecified. + */ + WARN_ON_ONCE(ref->confirm_switch); + ref->confirm_switch = + confirm_switch ?: percpu_ref_noop_confirm_switch; + + percpu_ref_get(ref); /* put after confirmation */ + call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu); + } else if (confirm_switch) { + /* + * Somebody already set ATOMIC. Switching may still be in + * progress. @confirm_switch must be invoked after the + * switching is complete and a full sched RCU grace period + * has passed. Wait synchronously for the previous + * switching and schedule @confirm_switch invocation. + */ + wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); + ref->confirm_switch = confirm_switch; - atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count); + percpu_ref_get(ref); /* put after confirmation */ + call_rcu_sched(&ref->rcu, percpu_ref_call_confirm_rcu); + } +} + +/** + * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode + * @ref: percpu_ref to switch to atomic mode + * @confirm_switch: optional confirmation callback + * + * There's no reason to use this function for the usual reference counting. + * Use percpu_ref_kill[_and_confirm](). + * + * Schedule switching of @ref to atomic mode. All its percpu counts will + * be collected to the main atomic counter. On completion, when all CPUs + * are guaraneed to be in atomic mode, @confirm_switch, which may not + * block, is invoked. This function may be invoked concurrently with all + * the get/put operations and can safely be mixed with kill and reinit + * operations. Note that @ref will stay in atomic mode across kill/reinit + * cycles until percpu_ref_switch_to_percpu() is called. + * + * This function normally doesn't block and can be called from any context + * but it may block if @confirm_kill is specified and @ref is already in + * the process of switching to atomic mode. In such cases, @confirm_switch + * will be invoked after the switching is complete. + * + * Due to the way percpu_ref is implemented, @confirm_switch will be called + * after at least one full sched RCU grace period has passed but this is an + * implementation detail and must not be depended upon. + */ +void percpu_ref_switch_to_atomic(struct percpu_ref *ref, + percpu_ref_func_t *confirm_switch) +{ + ref->force_atomic = true; + __percpu_ref_switch_to_atomic(ref, confirm_switch); +} - WARN_ONCE(atomic_read(&ref->count) <= 0, "percpu ref <= 0 (%i)", - atomic_read(&ref->count)); +static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) +{ + unsigned long __percpu *percpu_count = percpu_count_ptr(ref); + int cpu; + + BUG_ON(!percpu_count); - /* @ref is viewed as dead on all CPUs, send out kill confirmation */ - if (ref->confirm_kill) - ref->confirm_kill(ref); + if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) + return; + + wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); + + atomic_long_add(PERCPU_COUNT_BIAS, &ref->count); /* - * Now we're in single atomic_t mode with a consistent refcount, so it's - * safe to drop our initial ref: + * Restore per-cpu operation. smp_store_release() is paired with + * smp_read_barrier_depends() in __ref_is_percpu() and guarantees + * that the zeroing is visible to all percpu accesses which can see + * the following __PERCPU_REF_ATOMIC clearing. */ - percpu_ref_put(ref); + for_each_possible_cpu(cpu) + *per_cpu_ptr(percpu_count, cpu) = 0; + + smp_store_release(&ref->percpu_count_ptr, + ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); +} + +/** + * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode + * @ref: percpu_ref to switch to percpu mode + * + * There's no reason to use this function for the usual reference counting. + * To re-use an expired ref, use percpu_ref_reinit(). + * + * Switch @ref to percpu mode. This function may be invoked concurrently + * with all the get/put operations and can safely be mixed with kill and + * reinit operations. This function reverses the sticky atomic state set + * by PERCPU_REF_INIT_ATOMIC or percpu_ref_switch_to_atomic(). If @ref is + * dying or dead, the actual switching takes place on the following + * percpu_ref_reinit(). + * + * This function normally doesn't block and can be called from any context + * but it may block if @ref is in the process of switching to atomic mode + * by percpu_ref_switch_atomic(). + */ +void percpu_ref_switch_to_percpu(struct percpu_ref *ref) +{ + ref->force_atomic = false; + + /* a dying or dead ref can't be switched to percpu mode w/o reinit */ + if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) + __percpu_ref_switch_to_percpu(ref); } /** @@ -164,39 +284,48 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) * * Equivalent to percpu_ref_kill() but also schedules kill confirmation if * @confirm_kill is not NULL. @confirm_kill, which may not block, will be - * called after @ref is seen as dead from all CPUs - all further - * invocations of percpu_ref_tryget() will fail. See percpu_ref_tryget() - * for more details. + * called after @ref is seen as dead from all CPUs at which point all + * further invocations of percpu_ref_tryget_live() will fail. See + * percpu_ref_tryget_live() for details. + * + * This function normally doesn't block and can be called from any context + * but it may block if @confirm_kill is specified and @ref is in the + * process of switching to atomic mode by percpu_ref_switch_atomic(). * - * Due to the way percpu_ref is implemented, @confirm_kill will be called - * after at least one full RCU grace period has passed but this is an - * implementation detail and callers must not depend on it. + * Due to the way percpu_ref is implemented, @confirm_switch will be called + * after at least one full sched RCU grace period has passed but this is an + * implementation detail and must not be depended upon. */ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill) { - WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD, - "percpu_ref_kill() called more than once!\n"); + WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD, + "%s called more than once on %pf!", __func__, ref->release); - ref->pcpu_count_ptr |= PCPU_REF_DEAD; - ref->confirm_kill = confirm_kill; - - call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); + ref->percpu_count_ptr |= __PERCPU_REF_DEAD; + __percpu_ref_switch_to_atomic(ref, confirm_kill); + percpu_ref_put(ref); } EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); -/* - * XXX: Temporary kludge to work around SCSI blk-mq stall. Used only by - * block/blk-mq.c::blk_mq_freeze_queue(). Will be removed during v3.18 - * devel cycle. Do not use anywhere else. +/** + * percpu_ref_reinit - re-initialize a percpu refcount + * @ref: perpcu_ref to re-initialize + * + * Re-initialize @ref so that it's in the same state as when it finished + * percpu_ref_init() ignoring %PERCPU_REF_INIT_DEAD. @ref must have been + * initialized successfully and reached 0 but not exited. + * + * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while + * this function is in progress. */ -void __percpu_ref_kill_expedited(struct percpu_ref *ref) +void percpu_ref_reinit(struct percpu_ref *ref) { - WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD, - "percpu_ref_kill() called more than once on %pf!", - ref->release); + WARN_ON_ONCE(!percpu_ref_is_zero(ref)); - ref->pcpu_count_ptr |= PCPU_REF_DEAD; - synchronize_sched_expedited(); - percpu_ref_kill_rcu(&ref->rcu); + ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; + percpu_ref_get(ref); + if (!ref->force_atomic) + __percpu_ref_switch_to_percpu(ref); } +EXPORT_SYMBOL_GPL(percpu_ref_reinit); diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 7dd33577b905..48144cdae819 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -112,13 +112,15 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc) } EXPORT_SYMBOL(__percpu_counter_sum); -int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, +int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, struct lock_class_key *key) { + unsigned long flags __maybe_unused; + raw_spin_lock_init(&fbc->lock); lockdep_set_class(&fbc->lock, key); fbc->count = amount; - fbc->counters = alloc_percpu(s32); + fbc->counters = alloc_percpu_gfp(s32, gfp); if (!fbc->counters) return -ENOMEM; @@ -126,9 +128,9 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, #ifdef CONFIG_HOTPLUG_CPU INIT_LIST_HEAD(&fbc->list); - spin_lock(&percpu_counters_lock); + spin_lock_irqsave(&percpu_counters_lock, flags); list_add(&fbc->list, &percpu_counters); - spin_unlock(&percpu_counters_lock); + spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif return 0; } @@ -136,15 +138,17 @@ EXPORT_SYMBOL(__percpu_counter_init); void percpu_counter_destroy(struct percpu_counter *fbc) { + unsigned long flags __maybe_unused; + if (!fbc->counters) return; debug_percpu_counter_deactivate(fbc); #ifdef CONFIG_HOTPLUG_CPU - spin_lock(&percpu_counters_lock); + spin_lock_irqsave(&percpu_counters_lock, flags); list_del(&fbc->list); - spin_unlock(&percpu_counters_lock); + spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif free_percpu(fbc->counters); fbc->counters = NULL; @@ -173,7 +177,7 @@ static int percpu_counter_hotcpu_callback(struct notifier_block *nb, return NOTIFY_OK; cpu = (unsigned long)hcpu; - spin_lock(&percpu_counters_lock); + spin_lock_irq(&percpu_counters_lock); list_for_each_entry(fbc, &percpu_counters, list) { s32 *pcount; unsigned long flags; @@ -184,7 +188,7 @@ static int percpu_counter_hotcpu_callback(struct notifier_block *nb, *pcount = 0; raw_spin_unlock_irqrestore(&fbc->lock, flags); } - spin_unlock(&percpu_counters_lock); + spin_unlock_irq(&percpu_counters_lock); #endif return NOTIFY_OK; } diff --git a/lib/proportions.c b/lib/proportions.c index 05df84801b56..6f724298f67a 100644 --- a/lib/proportions.c +++ b/lib/proportions.c @@ -73,7 +73,7 @@ #include <linux/proportions.h> #include <linux/rcupdate.h> -int prop_descriptor_init(struct prop_descriptor *pd, int shift) +int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp) { int err; @@ -83,11 +83,11 @@ int prop_descriptor_init(struct prop_descriptor *pd, int shift) pd->index = 0; pd->pg[0].shift = shift; mutex_init(&pd->mutex); - err = percpu_counter_init(&pd->pg[0].events, 0); + err = percpu_counter_init(&pd->pg[0].events, 0, gfp); if (err) goto out; - err = percpu_counter_init(&pd->pg[1].events, 0); + err = percpu_counter_init(&pd->pg[1].events, 0, gfp); if (err) percpu_counter_destroy(&pd->pg[0].events); @@ -188,12 +188,12 @@ prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift) #define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) -int prop_local_init_percpu(struct prop_local_percpu *pl) +int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp) { raw_spin_lock_init(&pl->lock); pl->shift = 0; pl->period = 0; - return percpu_counter_init(&pl->events, 0); + return percpu_counter_init(&pl->events, 0, gfp); } void prop_local_destroy_percpu(struct prop_local_percpu *pl) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index b27714f1b40f..12a992b62576 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -455,7 +455,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi_wb_init(&bdi->wb, bdi); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { - err = percpu_counter_init(&bdi->bdi_stat[i], 0); + err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL); if (err) goto err; } @@ -470,7 +470,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->write_bandwidth = INIT_BW; bdi->avg_write_bandwidth = INIT_BW; - err = fprop_local_init_percpu(&bdi->completions); + err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL); if (err) { err: diff --git a/mm/mmap.c b/mm/mmap.c index 16d19b48e2ad..93d28c7e5420 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3202,7 +3202,7 @@ void __init mmap_init(void) { int ret; - ret = percpu_counter_init(&vm_committed_as, 0); + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); } diff --git a/mm/nommu.c b/mm/nommu.c index a881d9673c6b..bd1808e194a7 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -539,7 +539,7 @@ void __init mmap_init(void) { int ret; - ret = percpu_counter_init(&vm_committed_as, 0); + ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 35ca7102d421..ff24c9d83112 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1777,7 +1777,7 @@ void __init page_writeback_init(void) writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); - fprop_global_init(&writeout_completions); + fprop_global_init(&writeout_completions, GFP_KERNEL); } /** diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 89633fefc6a2..10e3d0b8a86d 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -33,17 +33,14 @@ #include <linux/log2.h> -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { - unsigned int cpu; - - for_each_possible_cpu(cpu) - memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); - return 0; } -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { /* nada */ } @@ -70,6 +67,11 @@ static struct pcpu_chunk *pcpu_create_chunk(void) chunk->data = pages; chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; + + spin_lock_irq(&pcpu_lock); + pcpu_chunk_populated(chunk, 0, nr_pages); + spin_unlock_irq(&pcpu_lock); + return chunk; } diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 51108165f829..538998a137d2 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -20,46 +20,25 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, } /** - * pcpu_get_pages_and_bitmap - get temp pages array and bitmap + * pcpu_get_pages - get temp pages array * @chunk: chunk of interest - * @bitmapp: output parameter for bitmap - * @may_alloc: may allocate the array * - * Returns pointer to array of pointers to struct page and bitmap, - * both of which can be indexed with pcpu_page_idx(). The returned - * array is cleared to zero and *@bitmapp is copied from - * @chunk->populated. Note that there is only one array and bitmap - * and access exclusion is the caller's responsibility. - * - * CONTEXT: - * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. - * Otherwise, don't care. + * Returns pointer to array of pointers to struct page which can be indexed + * with pcpu_page_idx(). Note that there is only one array and accesses + * should be serialized by pcpu_alloc_mutex. * * RETURNS: - * Pointer to temp pages array on success, NULL on failure. + * Pointer to temp pages array on success. */ -static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, - unsigned long **bitmapp, - bool may_alloc) +static struct page **pcpu_get_pages(struct pcpu_chunk *chunk_alloc) { static struct page **pages; - static unsigned long *bitmap; size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); - size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * - sizeof(unsigned long); - - if (!pages || !bitmap) { - if (may_alloc && !pages) - pages = pcpu_mem_zalloc(pages_size); - if (may_alloc && !bitmap) - bitmap = pcpu_mem_zalloc(bitmap_size); - if (!pages || !bitmap) - return NULL; - } - bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); + lockdep_assert_held(&pcpu_alloc_mutex); - *bitmapp = bitmap; + if (!pages) + pages = pcpu_mem_zalloc(pages_size); return pages; } @@ -67,7 +46,6 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, * pcpu_free_pages - free pages which were allocated for @chunk * @chunk: chunk pages were allocated for * @pages: array of pages to be freed, indexed by pcpu_page_idx() - * @populated: populated bitmap * @page_start: page index of the first page to be freed * @page_end: page index of the last page to be freed + 1 * @@ -75,8 +53,7 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, * The pages were allocated for @chunk. */ static void pcpu_free_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; @@ -95,7 +72,6 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, * pcpu_alloc_pages - allocates pages for @chunk * @chunk: target chunk * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() - * @populated: populated bitmap * @page_start: page index of the first page to be allocated * @page_end: page index of the last page to be allocated + 1 * @@ -104,8 +80,7 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, * content of @pages and will pass it verbatim to pcpu_map_pages(). */ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; unsigned int cpu, tcpu; @@ -164,7 +139,6 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) * pcpu_unmap_pages - unmap pages out of a pcpu_chunk * @chunk: chunk of interest * @pages: pages array which can be used to pass information to free - * @populated: populated bitmap * @page_start: page index of the first page to unmap * @page_end: page index of the last page to unmap + 1 * @@ -175,8 +149,7 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) * proper pre/post flush functions. */ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { unsigned int cpu; int i; @@ -192,8 +165,6 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), page_end - page_start); } - - bitmap_clear(populated, page_start, page_end - page_start); } /** @@ -228,7 +199,6 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, * pcpu_map_pages - map pages into a pcpu_chunk * @chunk: chunk of interest * @pages: pages array containing pages to be mapped - * @populated: populated bitmap * @page_start: page index of the first page to map * @page_end: page index of the last page to map + 1 * @@ -236,13 +206,11 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, * caller is responsible for calling pcpu_post_map_flush() after all * mappings are complete. * - * This function is responsible for setting corresponding bits in - * @chunk->populated bitmap and whatever is necessary for reverse - * lookup (addr -> chunk). + * This function is responsible for setting up whatever is necessary for + * reverse lookup (addr -> chunk). */ static int pcpu_map_pages(struct pcpu_chunk *chunk, - struct page **pages, unsigned long *populated, - int page_start, int page_end) + struct page **pages, int page_start, int page_end) { unsigned int cpu, tcpu; int i, err; @@ -253,18 +221,12 @@ static int pcpu_map_pages(struct pcpu_chunk *chunk, page_end - page_start); if (err < 0) goto err; - } - /* mapping successful, link chunk and mark populated */ - for (i = page_start; i < page_end; i++) { - for_each_possible_cpu(cpu) + for (i = page_start; i < page_end; i++) pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], chunk); - __set_bit(i, populated); } - return 0; - err: for_each_possible_cpu(tcpu) { if (tcpu == cpu) @@ -299,123 +261,69 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, /** * pcpu_populate_chunk - populate and map an area of a pcpu_chunk * @chunk: chunk of interest - * @off: offset to the area to populate - * @size: size of the area to populate in bytes + * @page_start: the start page + * @page_end: the end page * * For each cpu, populate and map pages [@page_start,@page_end) into - * @chunk. The area is cleared on return. + * @chunk. * * CONTEXT: * pcpu_alloc_mutex, does GFP_KERNEL allocation. */ -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); - int free_end = page_start, unmap_end = page_start; struct page **pages; - unsigned long *populated; - unsigned int cpu; - int rs, re, rc; - - /* quick path, check whether all pages are already there */ - rs = page_start; - pcpu_next_pop(chunk, &rs, &re, page_end); - if (rs == page_start && re == page_end) - goto clear; - /* need to allocate and map pages, this chunk can't be immutable */ - WARN_ON(chunk->immutable); - - pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); + pages = pcpu_get_pages(chunk); if (!pages) return -ENOMEM; - /* alloc and map */ - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); - if (rc) - goto err_free; - free_end = re; - } + if (pcpu_alloc_pages(chunk, pages, page_start, page_end)) + return -ENOMEM; - pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { - rc = pcpu_map_pages(chunk, pages, populated, rs, re); - if (rc) - goto err_unmap; - unmap_end = re; + if (pcpu_map_pages(chunk, pages, page_start, page_end)) { + pcpu_free_pages(chunk, pages, page_start, page_end); + return -ENOMEM; } pcpu_post_map_flush(chunk, page_start, page_end); - /* commit new bitmap */ - bitmap_copy(chunk->populated, populated, pcpu_unit_pages); -clear: - for_each_possible_cpu(cpu) - memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); return 0; - -err_unmap: - pcpu_pre_unmap_flush(chunk, page_start, unmap_end); - pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) - pcpu_unmap_pages(chunk, pages, populated, rs, re); - pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); -err_free: - pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) - pcpu_free_pages(chunk, pages, populated, rs, re); - return rc; } /** * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk * @chunk: chunk to depopulate - * @off: offset to the area to depopulate - * @size: size of the area to depopulate in bytes + * @page_start: the start page + * @page_end: the end page * * For each cpu, depopulate and unmap pages [@page_start,@page_end) - * from @chunk. If @flush is true, vcache is flushed before unmapping - * and tlb after. + * from @chunk. * * CONTEXT: * pcpu_alloc_mutex. */ -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end) { - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); struct page **pages; - unsigned long *populated; - int rs, re; - - /* quick path, check whether it's empty already */ - rs = page_start; - pcpu_next_unpop(chunk, &rs, &re, page_end); - if (rs == page_start && re == page_end) - return; - - /* immutable chunks can't be depopulated */ - WARN_ON(chunk->immutable); /* * If control reaches here, there must have been at least one * successful population attempt so the temp pages array must * be available now. */ - pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); + pages = pcpu_get_pages(chunk); BUG_ON(!pages); /* unmap and free */ pcpu_pre_unmap_flush(chunk, page_start, page_end); - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) - pcpu_unmap_pages(chunk, pages, populated, rs, re); + pcpu_unmap_pages(chunk, pages, page_start, page_end); /* no need to flush tlb, vmalloc will handle it lazily */ - pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) - pcpu_free_pages(chunk, pages, populated, rs, re); - - /* commit new bitmap */ - bitmap_copy(chunk->populated, populated, pcpu_unit_pages); + pcpu_free_pages(chunk, pages, page_start, page_end); } static struct pcpu_chunk *pcpu_create_chunk(void) diff --git a/mm/percpu.c b/mm/percpu.c index da997f9800bd..014bab65e0ff 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -76,6 +76,10 @@ #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ +#define PCPU_ATOMIC_MAP_MARGIN_LOW 32 +#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 +#define PCPU_EMPTY_POP_PAGES_LOW 2 +#define PCPU_EMPTY_POP_PAGES_HIGH 4 #ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ @@ -102,12 +106,16 @@ struct pcpu_chunk { int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ void *base_addr; /* base address of this chunk */ + int map_used; /* # of map entries used before the sentry */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ + struct work_struct map_extend_work;/* async ->map[] extension */ + void *data; /* chunk data */ int first_free; /* no free below this */ bool immutable; /* no [de]population allowed */ + int nr_populated; /* # of populated pages */ unsigned long populated[]; /* populated bitmap */ }; @@ -151,38 +159,33 @@ static struct pcpu_chunk *pcpu_first_chunk; static struct pcpu_chunk *pcpu_reserved_chunk; static int pcpu_reserved_chunk_limit; +static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ +static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ + +static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ + /* - * Synchronization rules. - * - * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former - * protects allocation/reclaim paths, chunks, populated bitmap and - * vmalloc mapping. The latter is a spinlock and protects the index - * data structures - chunk slots, chunks and area maps in chunks. - * - * During allocation, pcpu_alloc_mutex is kept locked all the time and - * pcpu_lock is grabbed and released as necessary. All actual memory - * allocations are done using GFP_KERNEL with pcpu_lock released. In - * general, percpu memory can't be allocated with irq off but - * irqsave/restore are still used in alloc path so that it can be used - * from early init path - sched_init() specifically. - * - * Free path accesses and alters only the index data structures, so it - * can be safely called from atomic context. When memory needs to be - * returned to the system, free path schedules reclaim_work which - * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be - * reclaimed, release both locks and frees the chunks. Note that it's - * necessary to grab both locks to remove a chunk from circulation as - * allocation path might be referencing the chunk with only - * pcpu_alloc_mutex locked. + * The number of empty populated pages, protected by pcpu_lock. The + * reserved chunk doesn't contribute to the count. */ -static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ -static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ +static int pcpu_nr_empty_pop_pages; -static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ +/* + * Balance work is used to populate or destroy chunks asynchronously. We + * try to keep the number of populated free pages between + * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one + * empty chunk. + */ +static void pcpu_balance_workfn(struct work_struct *work); +static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); +static bool pcpu_async_enabled __read_mostly; +static bool pcpu_atomic_alloc_failed; -/* reclaim work to release fully free chunks, scheduled from free path */ -static void pcpu_reclaim(struct work_struct *work); -static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); +static void pcpu_schedule_balance_work(void) +{ + if (pcpu_async_enabled) + schedule_work(&pcpu_balance_work); +} static bool pcpu_addr_in_first_chunk(void *addr) { @@ -315,6 +318,38 @@ static void pcpu_mem_free(void *ptr, size_t size) } /** + * pcpu_count_occupied_pages - count the number of pages an area occupies + * @chunk: chunk of interest + * @i: index of the area in question + * + * Count the number of pages chunk's @i'th area occupies. When the area's + * start and/or end address isn't aligned to page boundary, the straddled + * page is included in the count iff the rest of the page is free. + */ +static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i) +{ + int off = chunk->map[i] & ~1; + int end = chunk->map[i + 1] & ~1; + + if (!PAGE_ALIGNED(off) && i > 0) { + int prev = chunk->map[i - 1]; + + if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE)) + off = round_down(off, PAGE_SIZE); + } + + if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) { + int next = chunk->map[i + 1]; + int nend = chunk->map[i + 2] & ~1; + + if (!(next & 1) && nend >= round_up(end, PAGE_SIZE)) + end = round_up(end, PAGE_SIZE); + } + + return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0); +} + +/** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest * @oslot: the previous slot it was on @@ -342,9 +377,14 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) /** * pcpu_need_to_extend - determine whether chunk area map needs to be extended * @chunk: chunk of interest + * @is_atomic: the allocation context * - * Determine whether area map of @chunk needs to be extended to - * accommodate a new allocation. + * Determine whether area map of @chunk needs to be extended. If + * @is_atomic, only the amount necessary for a new allocation is + * considered; however, async extension is scheduled if the left amount is + * low. If !@is_atomic, it aims for more empty space. Combined, this + * ensures that the map is likely to have enough available space to + * accomodate atomic allocations which can't extend maps directly. * * CONTEXT: * pcpu_lock. @@ -353,15 +393,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) * New target map allocation length if extension is necessary, 0 * otherwise. */ -static int pcpu_need_to_extend(struct pcpu_chunk *chunk) +static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) { - int new_alloc; + int margin, new_alloc; + + if (is_atomic) { + margin = 3; + + if (chunk->map_alloc < + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW && + pcpu_async_enabled) + schedule_work(&chunk->map_extend_work); + } else { + margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; + } - if (chunk->map_alloc >= chunk->map_used + 3) + if (chunk->map_alloc >= chunk->map_used + margin) return 0; new_alloc = PCPU_DFL_MAP_ALLOC; - while (new_alloc < chunk->map_used + 3) + while (new_alloc < chunk->map_used + margin) new_alloc *= 2; return new_alloc; @@ -418,11 +469,76 @@ out_unlock: return 0; } +static void pcpu_map_extend_workfn(struct work_struct *work) +{ + struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk, + map_extend_work); + int new_alloc; + + spin_lock_irq(&pcpu_lock); + new_alloc = pcpu_need_to_extend(chunk, false); + spin_unlock_irq(&pcpu_lock); + + if (new_alloc) + pcpu_extend_area_map(chunk, new_alloc); +} + +/** + * pcpu_fit_in_area - try to fit the requested allocation in a candidate area + * @chunk: chunk the candidate area belongs to + * @off: the offset to the start of the candidate area + * @this_size: the size of the candidate area + * @size: the size of the target allocation + * @align: the alignment of the target allocation + * @pop_only: only allocate from already populated region + * + * We're trying to allocate @size bytes aligned at @align. @chunk's area + * at @off sized @this_size is a candidate. This function determines + * whether the target allocation fits in the candidate area and returns the + * number of bytes to pad after @off. If the target area doesn't fit, -1 + * is returned. + * + * If @pop_only is %true, this function only considers the already + * populated part of the candidate area. + */ +static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, + int size, int align, bool pop_only) +{ + int cand_off = off; + + while (true) { + int head = ALIGN(cand_off, align) - off; + int page_start, page_end, rs, re; + + if (this_size < head + size) + return -1; + + if (!pop_only) + return head; + + /* + * If the first unpopulated page is beyond the end of the + * allocation, the whole allocation is populated; + * otherwise, retry from the end of the unpopulated area. + */ + page_start = PFN_DOWN(head + off); + page_end = PFN_UP(head + off + size); + + rs = page_start; + pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size)); + if (rs >= page_end) + return head; + cand_off = re * PAGE_SIZE; + } +} + /** * pcpu_alloc_area - allocate area from a pcpu_chunk * @chunk: chunk of interest * @size: wanted size in bytes * @align: wanted align + * @pop_only: allocate only from the populated area + * @occ_pages_p: out param for the number of pages the area occupies * * Try to allocate @size bytes area aligned at @align from @chunk. * Note that this function only allocates the offset. It doesn't @@ -437,7 +553,8 @@ out_unlock: * Allocated offset in @chunk on success, -1 if no matching area is * found. */ -static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) +static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, + bool pop_only, int *occ_pages_p) { int oslot = pcpu_chunk_slot(chunk); int max_contig = 0; @@ -453,11 +570,11 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) if (off & 1) continue; - /* extra for alignment requirement */ - head = ALIGN(off, align) - off; - this_size = (p[1] & ~1) - off; - if (this_size < head + size) { + + head = pcpu_fit_in_area(chunk, off, this_size, size, align, + pop_only); + if (head < 0) { if (!seen_free) { chunk->first_free = i; seen_free = true; @@ -526,6 +643,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) chunk->free_size -= size; *p |= 1; + *occ_pages_p = pcpu_count_occupied_pages(chunk, i); pcpu_chunk_relocate(chunk, oslot); return off; } @@ -541,6 +659,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) * pcpu_free_area - free area to a pcpu_chunk * @chunk: chunk of interest * @freeme: offset of area to free + * @occ_pages_p: out param for the number of pages the area occupies * * Free area starting from @freeme to @chunk. Note that this function * only modifies the allocation map. It doesn't depopulate or unmap @@ -549,7 +668,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) * CONTEXT: * pcpu_lock. */ -static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) +static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, + int *occ_pages_p) { int oslot = pcpu_chunk_slot(chunk); int off = 0; @@ -580,6 +700,8 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) *p = off &= ~1; chunk->free_size += (p[1] & ~1) - off; + *occ_pages_p = pcpu_count_occupied_pages(chunk, i); + /* merge with next? */ if (!(p[1] & 1)) to_free++; @@ -620,6 +742,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map_used = 1; INIT_LIST_HEAD(&chunk->list); + INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn); chunk->free_size = pcpu_unit_size; chunk->contig_hint = pcpu_unit_size; @@ -634,6 +757,50 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) pcpu_mem_free(chunk, pcpu_chunk_struct_size); } +/** + * pcpu_chunk_populated - post-population bookkeeping + * @chunk: pcpu_chunk which got populated + * @page_start: the start page + * @page_end: the end page + * + * Pages in [@page_start,@page_end) have been populated to @chunk. Update + * the bookkeeping information accordingly. Must be called after each + * successful population. + */ +static void pcpu_chunk_populated(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + int nr = page_end - page_start; + + lockdep_assert_held(&pcpu_lock); + + bitmap_set(chunk->populated, page_start, nr); + chunk->nr_populated += nr; + pcpu_nr_empty_pop_pages += nr; +} + +/** + * pcpu_chunk_depopulated - post-depopulation bookkeeping + * @chunk: pcpu_chunk which got depopulated + * @page_start: the start page + * @page_end: the end page + * + * Pages in [@page_start,@page_end) have been depopulated from @chunk. + * Update the bookkeeping information accordingly. Must be called after + * each successful depopulation. + */ +static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + int nr = page_end - page_start; + + lockdep_assert_held(&pcpu_lock); + + bitmap_clear(chunk->populated, page_start, nr); + chunk->nr_populated -= nr; + pcpu_nr_empty_pop_pages -= nr; +} + /* * Chunk management implementation. * @@ -695,21 +862,23 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * @reserved: allocate from the reserved chunk if available + * @gfp: allocation flags * - * Allocate percpu area of @size bytes aligned at @align. - * - * CONTEXT: - * Does GFP_KERNEL allocation. + * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't + * contain %GFP_KERNEL, the allocation is atomic. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) +static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, + gfp_t gfp) { static int warn_limit = 10; struct pcpu_chunk *chunk; const char *err; - int slot, off, new_alloc; + bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; + int occ_pages = 0; + int slot, off, new_alloc, cpu, ret; unsigned long flags; void __percpu *ptr; @@ -728,7 +897,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) return NULL; } - mutex_lock(&pcpu_alloc_mutex); spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ @@ -740,16 +908,18 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) goto fail_unlock; } - while ((new_alloc = pcpu_need_to_extend(chunk))) { + while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) { spin_unlock_irqrestore(&pcpu_lock, flags); - if (pcpu_extend_area_map(chunk, new_alloc) < 0) { + if (is_atomic || + pcpu_extend_area_map(chunk, new_alloc) < 0) { err = "failed to extend area map of reserved chunk"; - goto fail_unlock_mutex; + goto fail; } spin_lock_irqsave(&pcpu_lock, flags); } - off = pcpu_alloc_area(chunk, size, align); + off = pcpu_alloc_area(chunk, size, align, is_atomic, + &occ_pages); if (off >= 0) goto area_found; @@ -764,13 +934,15 @@ restart: if (size > chunk->contig_hint) continue; - new_alloc = pcpu_need_to_extend(chunk); + new_alloc = pcpu_need_to_extend(chunk, is_atomic); if (new_alloc) { + if (is_atomic) + continue; spin_unlock_irqrestore(&pcpu_lock, flags); if (pcpu_extend_area_map(chunk, new_alloc) < 0) { err = "failed to extend area map"; - goto fail_unlock_mutex; + goto fail; } spin_lock_irqsave(&pcpu_lock, flags); /* @@ -780,74 +952,134 @@ restart: goto restart; } - off = pcpu_alloc_area(chunk, size, align); + off = pcpu_alloc_area(chunk, size, align, is_atomic, + &occ_pages); if (off >= 0) goto area_found; } } - /* hmmm... no space left, create a new chunk */ spin_unlock_irqrestore(&pcpu_lock, flags); - chunk = pcpu_create_chunk(); - if (!chunk) { - err = "failed to allocate new chunk"; - goto fail_unlock_mutex; + /* + * No space left. Create a new chunk. We don't want multiple + * tasks to create chunks simultaneously. Serialize and create iff + * there's still no empty chunk after grabbing the mutex. + */ + if (is_atomic) + goto fail; + + mutex_lock(&pcpu_alloc_mutex); + + if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { + chunk = pcpu_create_chunk(); + if (!chunk) { + mutex_unlock(&pcpu_alloc_mutex); + err = "failed to allocate new chunk"; + goto fail; + } + + spin_lock_irqsave(&pcpu_lock, flags); + pcpu_chunk_relocate(chunk, -1); + } else { + spin_lock_irqsave(&pcpu_lock, flags); } - spin_lock_irqsave(&pcpu_lock, flags); - pcpu_chunk_relocate(chunk, -1); + mutex_unlock(&pcpu_alloc_mutex); goto restart; area_found: spin_unlock_irqrestore(&pcpu_lock, flags); - /* populate, map and clear the area */ - if (pcpu_populate_chunk(chunk, off, size)) { - spin_lock_irqsave(&pcpu_lock, flags); - pcpu_free_area(chunk, off); - err = "failed to populate"; - goto fail_unlock; + /* populate if not all pages are already there */ + if (!is_atomic) { + int page_start, page_end, rs, re; + + mutex_lock(&pcpu_alloc_mutex); + + page_start = PFN_DOWN(off); + page_end = PFN_UP(off + size); + + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + WARN_ON(chunk->immutable); + + ret = pcpu_populate_chunk(chunk, rs, re); + + spin_lock_irqsave(&pcpu_lock, flags); + if (ret) { + mutex_unlock(&pcpu_alloc_mutex); + pcpu_free_area(chunk, off, &occ_pages); + err = "failed to populate"; + goto fail_unlock; + } + pcpu_chunk_populated(chunk, rs, re); + spin_unlock_irqrestore(&pcpu_lock, flags); + } + + mutex_unlock(&pcpu_alloc_mutex); } - mutex_unlock(&pcpu_alloc_mutex); + if (chunk != pcpu_reserved_chunk) + pcpu_nr_empty_pop_pages -= occ_pages; + + if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) + pcpu_schedule_balance_work(); + + /* clear the areas and return address relative to base address */ + for_each_possible_cpu(cpu) + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); - /* return address relative to base address */ ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); kmemleak_alloc_percpu(ptr, size); return ptr; fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); -fail_unlock_mutex: - mutex_unlock(&pcpu_alloc_mutex); - if (warn_limit) { - pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " - "%s\n", size, align, err); +fail: + if (!is_atomic && warn_limit) { + pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n", + size, align, is_atomic, err); dump_stack(); if (!--warn_limit) pr_info("PERCPU: limit reached, disable warning\n"); } + if (is_atomic) { + /* see the flag handling in pcpu_blance_workfn() */ + pcpu_atomic_alloc_failed = true; + pcpu_schedule_balance_work(); + } return NULL; } /** - * __alloc_percpu - allocate dynamic percpu area + * __alloc_percpu_gfp - allocate dynamic percpu area * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) + * @gfp: allocation flags * - * Allocate zero-filled percpu area of @size bytes aligned at @align. - * Might sleep. Might trigger writeouts. - * - * CONTEXT: - * Does GFP_KERNEL allocation. + * Allocate zero-filled percpu area of @size bytes aligned at @align. If + * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can + * be called from any context but is a lot more likely to fail. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ +void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) +{ + return pcpu_alloc(size, align, false, gfp); +} +EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); + +/** + * __alloc_percpu - allocate dynamic percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). + */ void __percpu *__alloc_percpu(size_t size, size_t align) { - return pcpu_alloc(size, align, false); + return pcpu_alloc(size, align, false, GFP_KERNEL); } EXPORT_SYMBOL_GPL(__alloc_percpu); @@ -869,44 +1101,121 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); */ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) { - return pcpu_alloc(size, align, true); + return pcpu_alloc(size, align, true, GFP_KERNEL); } /** - * pcpu_reclaim - reclaim fully free chunks, workqueue function + * pcpu_balance_workfn - manage the amount of free chunks and populated pages * @work: unused * * Reclaim all fully free chunks except for the first one. - * - * CONTEXT: - * workqueue context. */ -static void pcpu_reclaim(struct work_struct *work) +static void pcpu_balance_workfn(struct work_struct *work) { - LIST_HEAD(todo); - struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; + LIST_HEAD(to_free); + struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; + int slot, nr_to_pop, ret; + /* + * There's no reason to keep around multiple unused chunks and VM + * areas can be scarce. Destroy all free chunks except for one. + */ mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); - list_for_each_entry_safe(chunk, next, head, list) { + list_for_each_entry_safe(chunk, next, free_head, list) { WARN_ON(chunk->immutable); /* spare the first one */ - if (chunk == list_first_entry(head, struct pcpu_chunk, list)) + if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue; - list_move(&chunk->list, &todo); + list_move(&chunk->list, &to_free); } spin_unlock_irq(&pcpu_lock); - list_for_each_entry_safe(chunk, next, &todo, list) { - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); + list_for_each_entry_safe(chunk, next, &to_free, list) { + int rs, re; + + pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { + pcpu_depopulate_chunk(chunk, rs, re); + spin_lock_irq(&pcpu_lock); + pcpu_chunk_depopulated(chunk, rs, re); + spin_unlock_irq(&pcpu_lock); + } pcpu_destroy_chunk(chunk); } + /* + * Ensure there are certain number of free populated pages for + * atomic allocs. Fill up from the most packed so that atomic + * allocs don't increase fragmentation. If atomic allocation + * failed previously, always populate the maximum amount. This + * should prevent atomic allocs larger than PAGE_SIZE from keeping + * failing indefinitely; however, large atomic allocs are not + * something we support properly and can be highly unreliable and + * inefficient. + */ +retry_pop: + if (pcpu_atomic_alloc_failed) { + nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; + /* best effort anyway, don't worry about synchronization */ + pcpu_atomic_alloc_failed = false; + } else { + nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - + pcpu_nr_empty_pop_pages, + 0, PCPU_EMPTY_POP_PAGES_HIGH); + } + + for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) { + int nr_unpop = 0, rs, re; + + if (!nr_to_pop) + break; + + spin_lock_irq(&pcpu_lock); + list_for_each_entry(chunk, &pcpu_slot[slot], list) { + nr_unpop = pcpu_unit_pages - chunk->nr_populated; + if (nr_unpop) + break; + } + spin_unlock_irq(&pcpu_lock); + + if (!nr_unpop) + continue; + + /* @chunk can't go away while pcpu_alloc_mutex is held */ + pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) { + int nr = min(re - rs, nr_to_pop); + + ret = pcpu_populate_chunk(chunk, rs, rs + nr); + if (!ret) { + nr_to_pop -= nr; + spin_lock_irq(&pcpu_lock); + pcpu_chunk_populated(chunk, rs, rs + nr); + spin_unlock_irq(&pcpu_lock); + } else { + nr_to_pop = 0; + } + + if (!nr_to_pop) + break; + } + } + + if (nr_to_pop) { + /* ran out of chunks to populate, create a new one and retry */ + chunk = pcpu_create_chunk(); + if (chunk) { + spin_lock_irq(&pcpu_lock); + pcpu_chunk_relocate(chunk, -1); + spin_unlock_irq(&pcpu_lock); + goto retry_pop; + } + } + mutex_unlock(&pcpu_alloc_mutex); } @@ -924,7 +1233,7 @@ void free_percpu(void __percpu *ptr) void *addr; struct pcpu_chunk *chunk; unsigned long flags; - int off; + int off, occ_pages; if (!ptr) return; @@ -938,7 +1247,10 @@ void free_percpu(void __percpu *ptr) chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->base_addr; - pcpu_free_area(chunk, off); + pcpu_free_area(chunk, off, &occ_pages); + + if (chunk != pcpu_reserved_chunk) + pcpu_nr_empty_pop_pages += occ_pages; /* if there are more than one fully free chunks, wake up grim reaper */ if (chunk->free_size == pcpu_unit_size) { @@ -946,7 +1258,7 @@ void free_percpu(void __percpu *ptr) list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { - schedule_work(&pcpu_reclaim_work); + pcpu_schedule_balance_work(); break; } } @@ -1336,11 +1648,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, */ schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&schunk->list); + INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn); schunk->base_addr = base_addr; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); schunk->immutable = true; bitmap_fill(schunk->populated, pcpu_unit_pages); + schunk->nr_populated = pcpu_unit_pages; if (ai->reserved_size) { schunk->free_size = ai->reserved_size; @@ -1364,11 +1678,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (dyn_size) { dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&dchunk->list); + INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn); dchunk->base_addr = base_addr; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); dchunk->immutable = true; bitmap_fill(dchunk->populated, pcpu_unit_pages); + dchunk->nr_populated = pcpu_unit_pages; dchunk->contig_hint = dchunk->free_size = dyn_size; dchunk->map[0] = 1; @@ -1379,6 +1695,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* link the first chunk in */ pcpu_first_chunk = dchunk ?: schunk; + pcpu_nr_empty_pop_pages += + pcpu_count_occupied_pages(pcpu_first_chunk, 1); pcpu_chunk_relocate(pcpu_first_chunk, -1); /* we're done */ @@ -1932,8 +2250,6 @@ void __init setup_per_cpu_areas(void) if (pcpu_setup_first_chunk(ai, fc) < 0) panic("Failed to initialize percpu areas."); - - pcpu_free_alloc_info(ai); } #endif /* CONFIG_SMP */ @@ -1967,3 +2283,15 @@ void __init percpu_init_late(void) spin_unlock_irqrestore(&pcpu_lock, flags); } } + +/* + * Percpu allocator is initialized early during boot when neither slab or + * workqueue is available. Plug async management until everything is up + * and running. + */ +static int __init percpu_enable_async(void) +{ + pcpu_async_enabled = true; + return 0; +} +subsys_initcall(percpu_enable_async); diff --git a/mm/shmem.c b/mm/shmem.c index 4fad61bb41e5..cd6fc7590e54 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2995,7 +2995,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) #endif spin_lock_init(&sbinfo->stat_lock); - if (percpu_counter_init(&sbinfo->used_blocks, 0)) + if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; sbinfo->free_inodes = sbinfo->max_inodes; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 97b0fcc79547..5ab6627cf370 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1115,7 +1115,7 @@ static int __init dccp_init(void) BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); - rc = percpu_counter_init(&dccp_orphan_count, 0); + rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL); if (rc) goto out_fail; rc = -ENOBUFS; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 461003d258ba..86023b9be47f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3071,8 +3071,8 @@ void __init tcp_init(void) BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); - percpu_counter_init(&tcp_sockets_allocated, 0); - percpu_counter_init(&tcp_orphan_count, 0); + percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); + percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 3af522622fad..1d191357bf88 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -32,7 +32,7 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) res_parent = &parent_cg->memory_allocated; res_counter_init(&cg_proto->memory_allocated, res_parent); - percpu_counter_init(&cg_proto->sockets_allocated, 0); + percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL); return 0; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 9d2c6c9facb6..8f34b27d5775 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1341,7 +1341,7 @@ static __init int sctp_init(void) if (!sctp_chunk_cachep) goto err_chunk_cachep; - status = percpu_counter_init(&sctp_sockets_allocated, 0); + status = percpu_counter_init(&sctp_sockets_allocated, 0, GFP_KERNEL); if (status) goto err_percpu_counter_init; |