-rw-r--r--   Documentation/block/queue-sysfs.txt   |  63
-rw-r--r--   block/blk-core.c                       |   6
-rw-r--r--   block/blk.h                            |   8
-rw-r--r--   drivers/virtio/virtio_pci.c            |   2
-rw-r--r--   include/linux/bio.h                    |  10
-rw-r--r--   include/linux/blkdev.h                 |   2
-rw-r--r--   include/linux/module.h                 |  25
-rw-r--r--   kernel/module.c                        |  35
-rw-r--r--   mm/mlock.c                             |  47
9 files changed, 125 insertions(+), 73 deletions(-)
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
new file mode 100644
index 000000000000..e164403f60e1
--- /dev/null
+++ b/Documentation/block/queue-sysfs.txt
@@ -0,0 +1,63 @@
+Queue sysfs files
+=================
+
+This text file will detail the queue files that are located in the sysfs tree
+for each block device. Note that stacked devices typically do not export
+any settings, since their queue merely functions as a remapping target.
+These files are the ones found in the /sys/block/xxx/queue/ directory.
+
+Files denoted with a RO postfix are readonly and the RW postfix means
+read-write.
+
+hw_sector_size (RO)
+-------------------
+This is the hardware sector size of the device, in bytes.
+
+max_hw_sectors_kb (RO)
+----------------------
+This is the maximum number of kilobytes supported in a single data transfer.
+
+max_sectors_kb (RW)
+-------------------
+This is the maximum number of kilobytes that the block layer will allow
+for a filesystem request. Must be smaller than or equal to the maximum
+size allowed by the hardware.
+
+nomerges (RW)
+-------------
+This enables the user to disable the lookup logic involved with IO merging
+requests in the block layer. Merging may still occur through a direct
+1-hit cache, since that comes for (almost) free. The IO scheduler will not
+waste cycles doing tree/hash lookups for merges if nomerges is 1. Defaults
+to 0, enabling all merges.
+
+nr_requests (RW)
+----------------
+This controls how many requests may be allocated in the block layer for
+read or write requests. Note that the total allocated number may be twice
+this amount, since it applies only to reads or writes (not the accumulated
+sum).
+
+read_ahead_kb (RW)
+------------------
+Maximum number of kilobytes to read-ahead for filesystems on this block
+device.
+
+rq_affinity (RW)
+----------------
+If this option is enabled, the block layer will migrate request completions
+to the CPU that originally submitted the request. For some workloads
+this provides a significant reduction in CPU cycles due to caching effects.
+
+scheduler (RW)
+--------------
+When read, this file will display the current and available IO schedulers
+for this block device. The currently active IO scheduler will be enclosed
+in [] brackets. Writing an IO scheduler name to this file will switch
+control of this block device to that new IO scheduler. Note that writing
+an IO scheduler name to this file will attempt to load that IO scheduler
+module, if it isn't already present in the system.
+
+
+
+Jens Axboe <jens.axboe@oracle.com>, February 2009
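The queue attributes documented above are plain text files under sysfs and can be exercised from ordinary userspace code. Below is a minimal illustrative sketch, not part of the patch: it assumes a disk named sda (so /sys/block/sda/queue/ exists), the scheduler write needs root, and "deadline" is just an example scheduler name.

#include <stdio.h>

static int read_attr(const char *path)
{
        char buf[256];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);
                return -1;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("%s: %s", path, buf);
        fclose(f);
        return 0;
}

int main(void)
{
        FILE *f;

        /* RO attribute: hardware sector size in bytes */
        read_attr("/sys/block/sda/queue/hw_sector_size");
        /* RW attribute: lists schedulers, the active one in [] brackets */
        read_attr("/sys/block/sda/queue/scheduler");

        /* Writing a scheduler name switches the queue over (root only);
         * the kernel may load the scheduler module on demand. */
        f = fopen("/sys/block/sda/queue/scheduler", "w");
        if (!f) {
                perror("scheduler (write)");
                return 1;
        }
        fputs("deadline", f);
        fclose(f);
        return 0;
}

From a shell, echo deadline > /sys/block/sda/queue/scheduler has the same effect as the write above.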
diff --git a/block/blk-core.c b/block/blk-core.c
index ca69f3d94100..29bcfac6c688 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -69,7 +69,7 @@ static void drive_stat_acct(struct request *rq, int new_io)
 	int rw = rq_data_dir(rq);
 	int cpu;
 
-	if (!blk_fs_request(rq) || !disk || !blk_queue_io_stat(disk->queue))
+	if (!blk_fs_request(rq) || !disk || !blk_do_io_stat(disk->queue))
 		return;
 
 	cpu = part_stat_lock();
@@ -1667,7 +1667,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
 {
 	struct gendisk *disk = req->rq_disk;
 
-	if (!disk || !blk_queue_io_stat(disk->queue))
+	if (!disk || !blk_do_io_stat(disk->queue))
 		return;
 
 	if (blk_fs_request(req)) {
@@ -1686,7 +1686,7 @@ static void blk_account_io_done(struct request *req)
 {
 	struct gendisk *disk = req->rq_disk;
 
-	if (!disk || !blk_queue_io_stat(disk->queue))
+	if (!disk || !blk_do_io_stat(disk->queue))
 		return;
 
 	/*
diff --git a/block/blk.h b/block/blk.h
index 6e1ed40534e9..0dce92c37496 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -108,4 +108,12 @@ static inline int blk_cpu_to_group(int cpu)
 #endif
 }
 
+static inline int blk_do_io_stat(struct request_queue *q)
+{
+	if (q)
+		return blk_queue_io_stat(q);
+
+	return 0;
+}
+
 #endif
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index bef6b45e8a5c..330aacbdec1f 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -192,7 +192,7 @@ static irqreturn_t vp_interrupt(int irq, void *opaque)
 		drv = container_of(vp_dev->vdev.dev.driver,
 				   struct virtio_driver, driver);
 
-		if (drv->config_changed)
+		if (drv && drv->config_changed)
 			drv->config_changed(&vp_dev->vdev);
 	}
 
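The blk_do_io_stat() helper and the virtio interrupt fix above apply the same defensive idea: put the pointer check in front of (or inside) the accessor, so that a missing queue, or a driver that is not bound yet, is never dereferenced. A standalone sketch of that pattern with made-up stand-in types (the real kernel structs are not reproduced here):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-in for struct request_queue and its io_stat flag. */
struct queue {
        bool io_stat;
};

/* NULL-tolerant accessor: callers no longer need their own q check
 * before asking whether accounting is enabled. */
static bool do_io_stat(const struct queue *q)
{
        return q ? q->io_stat : false;
}

int main(void)
{
        struct queue q = { .io_stat = true };

        printf("%d\n", do_io_stat(&q));         /* 1: accounting enabled */
        printf("%d\n", do_io_stat(NULL));       /* 0: no queue, safely skipped */
        return 0;
}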
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 0942765cf8c0..2aa283ab062b 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -451,12 +451,13 @@ extern struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly;
 
 #ifdef CONFIG_HIGHMEM
 /*
- * remember to add offset! and never ever reenable interrupts between a
- * bvec_kmap_irq and bvec_kunmap_irq!!
+ * remember never ever reenable interrupts between a bvec_kmap_irq and
+ * bvec_kunmap_irq!
  *
  * This function MUST be inlined - it plays with the CPU interrupt flags.
  */
-static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
+static __always_inline char *bvec_kmap_irq(struct bio_vec *bvec,
+					   unsigned long *flags)
 {
 	unsigned long addr;
 
@@ -472,7 +473,8 @@ static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
 	return (char *) addr + bvec->bv_offset;
 }
 
-static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
+static __always_inline void bvec_kunmap_irq(char *buffer,
+					    unsigned long *flags)
 {
 	unsigned long ptr = (unsigned long) buffer & PAGE_MASK;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d08c4b8219a6..dcaa0fd84b02 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -455,7 +455,7 @@ struct request_queue
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |	\
 				 (1 << QUEUE_FLAG_CLUSTER) |	\
-				 1 << QUEUE_FLAG_STACKABLE)
+				 (1 << QUEUE_FLAG_STACKABLE))
 
 static inline int queue_is_locked(struct request_queue *q)
 {
diff --git a/include/linux/module.h b/include/linux/module.h
index 4f7ea12463d3..f3b8329eb5b8 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -219,11 +219,6 @@ void *__symbol_get_gpl(const char *symbol);
 
 #endif
 
-struct module_ref
-{
-	local_t count;
-} ____cacheline_aligned;
-
 enum module_state
 {
 	MODULE_STATE_LIVE,
@@ -344,8 +339,11 @@ struct module
 	/* Destruction function. */
 	void (*exit)(void);
 
-	/* Reference counts */
-	struct module_ref ref[NR_CPUS];
+#ifdef CONFIG_SMP
+	char *refptr;
+#else
+	local_t ref;
+#endif
 #endif
 };
 
 #ifndef MODULE_ARCH_INIT
@@ -395,13 +393,22 @@ void __symbol_put(const char *symbol);
 #define symbol_put(x) __symbol_put(MODULE_SYMBOL_PREFIX #x)
 void symbol_put_addr(void *addr);
 
+static inline local_t *__module_ref_addr(struct module *mod, int cpu)
+{
+#ifdef CONFIG_SMP
+	return (local_t *) (mod->refptr + per_cpu_offset(cpu));
+#else
+	return &mod->ref;
+#endif
+}
+
 /* Sometimes we know we already have a refcount, and it's easier not
    to handle the error case (which only happens with rmmod --wait). */
 static inline void __module_get(struct module *module)
 {
 	if (module) {
 		BUG_ON(module_refcount(module) == 0);
-		local_inc(&module->ref[get_cpu()].count);
+		local_inc(__module_ref_addr(module, get_cpu()));
 		put_cpu();
 	}
 }
@@ -413,7 +420,7 @@ static inline int try_module_get(struct module *module)
 	if (module) {
 		unsigned int cpu = get_cpu();
 		if (likely(module_is_live(module)))
-			local_inc(&module->ref[cpu].count);
+			local_inc(__module_ref_addr(module, cpu));
 		else
 			ret = 0;
 		put_cpu();
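The module.h change replaces the NR_CPUS-sized ref[] array with a per-cpu allocation on SMP (or a single counter on UP) behind __module_ref_addr(): gets and puts only touch the current CPU's slot, and the true count is obtained by summing all slots. A rough userspace approximation of that scheme is sketched below; it uses a fixed array and C11 atomics instead of the kernel's local_t and per_cpu_offset(), and all names are invented.

#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 4		/* stand-in for the set of possible CPUs */

struct obj {
        /* one counter slot per CPU, analogous to the per-cpu refptr area */
        atomic_long ref[NCPUS];
};

/* Fast path: touch only this CPU's slot, no cross-CPU contention. */
static void obj_get(struct obj *o, int cpu)
{
        atomic_fetch_add(&o->ref[cpu], 1);
}

static void obj_put(struct obj *o, int cpu)
{
        atomic_fetch_sub(&o->ref[cpu], 1);
}

/* Slow path, rare: sum every slot, like module_refcount() above.
 * A single slot may go negative when a put runs on a different CPU
 * than the matching get; only the sum is meaningful. */
static long obj_refcount(struct obj *o)
{
        long total = 0;
        int cpu;

        for (cpu = 0; cpu < NCPUS; cpu++)
                total += atomic_load(&o->ref[cpu]);
        return total;
}

int main(void)
{
        struct obj o = { 0 };

        obj_get(&o, 0);
        obj_get(&o, 2);
        obj_put(&o, 1);
        printf("refcount = %ld\n", obj_refcount(&o));   /* prints 1 */
        return 0;
}

The kernel variant can use the cheaper local_t operations because get_cpu() disables preemption, so each slot is only ever modified from its own CPU.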
diff --git a/kernel/module.c b/kernel/module.c
index e8b51d41dd72..ba22484a987e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -573,13 +573,13 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
 /* Init the unload section of the module. */
 static void module_unload_init(struct module *mod)
 {
-	unsigned int i;
+	int cpu;
 
 	INIT_LIST_HEAD(&mod->modules_which_use_me);
-	for (i = 0; i < NR_CPUS; i++)
-		local_set(&mod->ref[i].count, 0);
+	for_each_possible_cpu(cpu)
+		local_set(__module_ref_addr(mod, cpu), 0);
 	/* Hold reference count during initialization. */
-	local_set(&mod->ref[raw_smp_processor_id()].count, 1);
+	local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1);
 	/* Backwards compatibility macros put refcount during init. */
 	mod->waiter = current;
 }
@@ -717,10 +717,11 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
 
 unsigned int module_refcount(struct module *mod)
 {
-	unsigned int i, total = 0;
+	unsigned int total = 0;
+	int cpu;
 
-	for (i = 0; i < NR_CPUS; i++)
-		total += local_read(&mod->ref[i].count);
+	for_each_possible_cpu(cpu)
+		total += local_read(__module_ref_addr(mod, cpu));
 	return total;
 }
 EXPORT_SYMBOL(module_refcount);
@@ -894,7 +895,7 @@ void module_put(struct module *module)
 {
 	if (module) {
 		unsigned int cpu = get_cpu();
-		local_dec(&module->ref[cpu].count);
+		local_dec(__module_ref_addr(module, cpu));
 		/* Maybe they're waiting for us to drop reference? */
 		if (unlikely(!module_is_live(module)))
 			wake_up_process(module->waiter);
@@ -1464,7 +1465,10 @@ static void free_module(struct module *mod)
 	kfree(mod->args);
 	if (mod->percpu)
 		percpu_modfree(mod->percpu);
-
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+	if (mod->refptr)
+		percpu_modfree(mod->refptr);
+#endif
 	/* Free lock-classes: */
 	lockdep_free_key_range(mod->module_core, mod->core_size);
 
@@ -2011,6 +2015,14 @@ static noinline struct module *load_module(void __user *umod,
 	if (err < 0)
 		goto free_mod;
 
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+	mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
+				      mod->name);
+	if (!mod->refptr) {
+		err = -ENOMEM;
+		goto free_mod;
+	}
+#endif
 	if (pcpuindex) {
 		/* We have a special allocation for this section. */
 		percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
@@ -2018,7 +2030,7 @@ static noinline struct module *load_module(void __user *umod,
 					 mod->name);
 		if (!percpu) {
 			err = -ENOMEM;
-			goto free_mod;
+			goto free_percpu;
 		}
 		sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
 		mod->percpu = percpu;
@@ -2282,6 +2294,9 @@ static noinline struct module *load_module(void __user *umod,
 free_percpu:
 	if (percpu)
 		percpu_modfree(percpu);
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+	percpu_modfree(mod->refptr);
+#endif
 free_mod:
 	kfree(args);
 free_hdr:
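The load_module() hunks also rearrange the error unwinding: a failure to allocate the new refptr area still jumps to free_mod, while a later percpu allocation failure now jumps to free_percpu, whose cleanup block has been taught to free refptr as well. The underlying goto-ladder pattern, where each later failure jumps to a label that releases everything acquired so far, looks roughly like this (hypothetical resources, not the kernel code):

#include <stdlib.h>

/* Hypothetical resources standing in for mod->refptr and mod->percpu. */
struct thing {
        void *refs;
        void *percpu;
};

/* Returns 0 on success; on failure everything acquired so far is freed. */
static int thing_setup(struct thing *t)
{
        t->refs = malloc(64);
        if (!t->refs)
                goto fail;              /* nothing to undo yet */

        t->percpu = malloc(128);
        if (!t->percpu)
                goto free_refs;         /* undo the earlier allocation */

        return 0;

free_refs:
        free(t->refs);
        t->refs = NULL;
fail:
        return -1;
}

int main(void)
{
        struct thing t = { 0 };

        return thing_setup(&t) ? 1 : 0;
}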
diff --git a/mm/mlock.c b/mm/mlock.c
index 2904a347e476..028ec482fdd4 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -294,14 +294,10 @@ static inline int __mlock_posix_error_return(long retval)
  *
  * return number of pages [> 0] to be removed from locked_vm on success
  * of "special" vmas.
- *
- * return negative error if vma spanning @start-@range disappears while
- * mmap semaphore is dropped. Unlikely?
  */
 long mlock_vma_pages_range(struct vm_area_struct *vma,
 			unsigned long start, unsigned long end)
 {
-	struct mm_struct *mm = vma->vm_mm;
 	int nr_pages = (end - start) / PAGE_SIZE;
 
 	BUG_ON(!(vma->vm_flags & VM_LOCKED));
@@ -314,20 +310,8 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
 	if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
 			is_vm_hugetlb_page(vma) ||
 			vma == get_gate_vma(current))) {
-		long error;
-		downgrade_write(&mm->mmap_sem);
-
-		error = __mlock_vma_pages_range(vma, start, end, 1);
-		up_read(&mm->mmap_sem);
-		/* vma can change or disappear */
-		down_write(&mm->mmap_sem);
-		vma = find_vma(mm, start);
-		/* non-NULL vma must contain @start, but need to check @end */
-		if (!vma || end > vma->vm_end)
-			return -ENOMEM;
-
-		return 0;	/* hide other errors from mmap(), et al */
+		return __mlock_vma_pages_range(vma, start, end, 1);
 	}
 
 	/*
@@ -438,41 +422,14 @@ success:
 	vma->vm_flags = newflags;
 
 	if (lock) {
-		/*
-		 * mmap_sem is currently held for write.  Downgrade the write
-		 * lock to a read lock so that other faults, mmap scans, ...
-		 * while we fault in all pages.
-		 */
-		downgrade_write(&mm->mmap_sem);
-		ret = __mlock_vma_pages_range(vma, start, end, 1);
-		/*
-		 * Need to reacquire mmap sem in write mode, as our callers
-		 * expect this.  We have no support for atomically upgrading
-		 * a sem to write, so we need to check for ranges while sem
-		 * is unlocked.
-		 */
-		up_read(&mm->mmap_sem);
-		/* vma can change or disappear */
-		down_write(&mm->mmap_sem);
-		*prev = find_vma(mm, start);
-		/* non-NULL *prev must contain @start, but need to check @end */
-		if (!(*prev) || end > (*prev)->vm_end)
-			ret = -ENOMEM;
-		else if (ret > 0) {
+		if (ret > 0) {
 			mm->locked_vm -= ret;
 			ret = 0;
 		} else
 			ret = __mlock_posix_error_return(ret); /* translate if needed */
 	} else {
-		/*
-		 * TODO: for unlocking, pages will already be resident, so
-		 * we don't need to wait for allocations/reclaim/pagein, ...
-		 * However, unlocking a very large region can still take a
-		 * while.  Should we downgrade the semaphore for both lock
-		 * AND unlock ?
-		 */
 		__mlock_vma_pages_range(vma, start, end, 0);
 	}
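The mm/mlock.c hunks back the mlock() family of system calls; with the downgrade/re-lookup dance removed, the caller's mmap_sem write lock is simply held while the range is populated. For context only, a minimal userspace program that exercises this path (not part of the patch; the 1 MiB size is arbitrary, and the mlock() call may fail with ENOMEM or EPERM if RLIMIT_MEMLOCK is too low):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 1 << 20;   /* 1 MiB, arbitrary */
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        /* mlock() populates and pins the range; in the kernel this ends up
         * in __mlock_vma_pages_range() with mmap_sem held. */
        if (mlock(p, len) != 0) {
                perror("mlock");
                munmap(p, len);
                return 1;
        }

        memset(p, 0, len);      /* pages are already resident */

        munlock(p, len);
        munmap(p, len);
        return 0;
}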