 Documentation/block/queue-sysfs.txt | 63
 block/blk-core.c                    |  6
 block/blk.h                         |  8
 drivers/virtio/virtio_pci.c         |  2
 include/linux/bio.h                 | 10
 include/linux/blkdev.h              |  2
 include/linux/module.h              | 25
 kernel/module.c                     | 35
 mm/mlock.c                          | 47
 9 files changed, 125 insertions(+), 73 deletions(-)
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
new file mode 100644
index 000000000000..e164403f60e1
--- /dev/null
+++ b/Documentation/block/queue-sysfs.txt
@@ -0,0 +1,63 @@
+Queue sysfs files
+=================
+
+This text file will detail the queue files that are located in the sysfs tree
+for each block device. Note that stacked devices typically do not export
+any settings, since their queue merely functions as a remapping target.
+These files are the ones found in the /sys/block/xxx/queue/ directory.
+
+Files denoted with an RO postfix are read-only; files with an RW postfix are
+read-write.
+
+hw_sector_size (RO)
+-------------------
+This is the hardware sector size of the device, in bytes.
+
+max_hw_sectors_kb (RO)
+----------------------
+This is the maximum number of kilobytes supported in a single data transfer.
+
+max_sectors_kb (RW)
+-------------------
+This is the maximum number of kilobytes that the block layer will allow
+for a filesystem request. Must be smaller than or equal to the maximum
+size allowed by the hardware.
+
+nomerges (RW)
+-------------
+This enables the user to disable the lookup logic involved with merging IO
+requests in the block layer. Merging may still occur through a direct
+1-hit cache, since that comes for (almost) free. The IO scheduler will not
+waste cycles doing tree/hash lookups for merges if nomerges is 1. Defaults
+to 0, enabling all merges.
+
+nr_requests (RW)
+----------------
+This controls how many requests may be allocated in the block layer for
+read or write requests. Note that the total allocated number may be twice
+this amount, since it applies only to reads or writes (not the accumulated
+sum).
+
+read_ahead_kb (RW)
+------------------
+Maximum number of kilobytes to read-ahead for filesystems on this block
+device.
+
+rq_affinity (RW)
+----------------
+If this option is enabled, the block layer will migrate request completions
+to the CPU that originally submitted the request. For some workloads
+this provides a significant reduction in CPU cycles due to caching effects.
+
+scheduler (RW)
+--------------
+When read, this file will display the current and available IO schedulers
+for this block device. The currently active IO scheduler will be enclosed
+in [] brackets. Writing an IO scheduler name to this file will switch
+control of this block device to that new IO scheduler. Note that writing
+an IO scheduler name to this file will attempt to load that IO scheduler
+module, if it isn't already present in the system.
+
+
+
+Jens Axboe <jens.axboe@oracle.com>, February 2009
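
A minimal userspace sketch of the interface documented above (illustrative
only, not part of this patch): it reads the scheduler file for one device and
then writes a new scheduler name back. The device name "sda" and the
scheduler "noop" are assumptions for the example, and the write requires
appropriate privileges.

/*
 * Illustrative sketch: query and switch the I/O scheduler via sysfs.
 * "sda" and "noop" are assumptions; adjust for the target system.
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/block/sda/queue/scheduler";
	char line[256];
	FILE *f;

	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	/* The currently active scheduler is shown in [] brackets. */
	if (fgets(line, sizeof(line), f))
		printf("schedulers: %s", line);
	fclose(f);

	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}
	/* The block layer will try to load the scheduler module if needed. */
	fputs("noop\n", f);
	fclose(f);
	return 0;
}
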
diff --git a/block/blk-core.c b/block/blk-core.c
index ca69f3d94100..29bcfac6c688 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -69,7 +69,7 @@ static void drive_stat_acct(struct request *rq, int new_io)
int rw = rq_data_dir(rq);
int cpu;
- if (!blk_fs_request(rq) || !disk || !blk_queue_io_stat(disk->queue))
+ if (!blk_fs_request(rq) || !disk || !blk_do_io_stat(disk->queue))
return;
cpu = part_stat_lock();
@@ -1667,7 +1667,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
struct gendisk *disk = req->rq_disk;
- if (!disk || !blk_queue_io_stat(disk->queue))
+ if (!disk || !blk_do_io_stat(disk->queue))
return;
if (blk_fs_request(req)) {
@@ -1686,7 +1686,7 @@ static void blk_account_io_done(struct request *req)
{
struct gendisk *disk = req->rq_disk;
- if (!disk || !blk_queue_io_stat(disk->queue))
+ if (!disk || !blk_do_io_stat(disk->queue))
return;
/*
diff --git a/block/blk.h b/block/blk.h
index 6e1ed40534e9..0dce92c37496 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -108,4 +108,12 @@ static inline int blk_cpu_to_group(int cpu)
#endif
}
+static inline int blk_do_io_stat(struct request_queue *q)
+{
+ if (q)
+ return blk_queue_io_stat(q);
+
+ return 0;
+}
+
#endif
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index bef6b45e8a5c..330aacbdec1f 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -192,7 +192,7 @@ static irqreturn_t vp_interrupt(int irq, void *opaque)
drv = container_of(vp_dev->vdev.dev.driver,
struct virtio_driver, driver);
- if (drv->config_changed)
+ if (drv && drv->config_changed)
drv->config_changed(&vp_dev->vdev);
}
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 0942765cf8c0..2aa283ab062b 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -451,12 +451,13 @@ extern struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly;
#ifdef CONFIG_HIGHMEM
/*
- * remember to add offset! and never ever reenable interrupts between a
- * bvec_kmap_irq and bvec_kunmap_irq!!
+ * remember never ever reenable interrupts between a bvec_kmap_irq and
+ * bvec_kunmap_irq!
*
* This function MUST be inlined - it plays with the CPU interrupt flags.
*/
-static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
+static __always_inline char *bvec_kmap_irq(struct bio_vec *bvec,
+ unsigned long *flags)
{
unsigned long addr;
@@ -472,7 +473,8 @@ static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
return (char *) addr + bvec->bv_offset;
}
-static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
+static __always_inline void bvec_kunmap_irq(char *buffer,
+ unsigned long *flags)
{
unsigned long ptr = (unsigned long) buffer & PAGE_MASK;
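
A short sketch of the calling convention the comment above describes
(hypothetical helper, not part of this patch): bvec_kmap_irq() disables
interrupts and returns a mapping that already includes bv_offset, so the
critical section must stay short and must not re-enable interrupts before
bvec_kunmap_irq() is called with the same flags.

#include <linux/bio.h>
#include <linux/string.h>

/* Hypothetical helper: copy the contents of a bio_vec into a buffer. */
static void copy_from_bvec(void *dst, struct bio_vec *bvec)
{
	unsigned long flags;
	char *buf;

	buf = bvec_kmap_irq(bvec, &flags);	/* irqs off, page mapped */
	memcpy(dst, buf, bvec->bv_len);		/* keep this section short */
	bvec_kunmap_irq(buf, &flags);		/* unmap, irqs restored */
}
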
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d08c4b8219a6..dcaa0fd84b02 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -455,7 +455,7 @@ struct request_queue
#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
(1 << QUEUE_FLAG_CLUSTER) | \
- 1 << QUEUE_FLAG_STACKABLE)
+ (1 << QUEUE_FLAG_STACKABLE))
static inline int queue_is_locked(struct request_queue *q)
{
diff --git a/include/linux/module.h b/include/linux/module.h
index 4f7ea12463d3..f3b8329eb5b8 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -219,11 +219,6 @@ void *__symbol_get_gpl(const char *symbol);
#endif
-struct module_ref
-{
- local_t count;
-} ____cacheline_aligned;
-
enum module_state
{
MODULE_STATE_LIVE,
@@ -344,8 +339,11 @@ struct module
/* Destruction function. */
void (*exit)(void);
- /* Reference counts */
- struct module_ref ref[NR_CPUS];
+#ifdef CONFIG_SMP
+ char *refptr;
+#else
+ local_t ref;
+#endif
#endif
};
#ifndef MODULE_ARCH_INIT
@@ -395,13 +393,22 @@ void __symbol_put(const char *symbol);
#define symbol_put(x) __symbol_put(MODULE_SYMBOL_PREFIX #x)
void symbol_put_addr(void *addr);
+static inline local_t *__module_ref_addr(struct module *mod, int cpu)
+{
+#ifdef CONFIG_SMP
+ return (local_t *) (mod->refptr + per_cpu_offset(cpu));
+#else
+ return &mod->ref;
+#endif
+}
+
/* Sometimes we know we already have a refcount, and it's easier not
to handle the error case (which only happens with rmmod --wait). */
static inline void __module_get(struct module *module)
{
if (module) {
BUG_ON(module_refcount(module) == 0);
- local_inc(&module->ref[get_cpu()].count);
+ local_inc(__module_ref_addr(module, get_cpu()));
put_cpu();
}
}
@@ -413,7 +420,7 @@ static inline int try_module_get(struct module *module)
if (module) {
unsigned int cpu = get_cpu();
if (likely(module_is_live(module)))
- local_inc(&module->ref[cpu].count);
+ local_inc(__module_ref_addr(module, cpu));
else
ret = 0;
put_cpu();
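
An illustrative calling pattern for the reference-count API these hunks
rework (not part of the patch): pin a module while its code and data are in
use, then drop the reference. struct some_device and its owner field are
hypothetical; the per-CPU counter reached through __module_ref_addr() stays
hidden behind the get/put helpers.

#include <linux/module.h>
#include <linux/errno.h>

struct some_device {
	struct module *owner;		/* module providing this device */
};

static int some_device_open(struct some_device *dev)
{
	if (!try_module_get(dev->owner))	/* fails while unloading */
		return -ENODEV;
	/* ... safe to call into the owning module here ... */
	return 0;
}

static void some_device_release(struct some_device *dev)
{
	module_put(dev->owner);			/* drop the per-CPU reference */
}
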
diff --git a/kernel/module.c b/kernel/module.c
index e8b51d41dd72..ba22484a987e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -573,13 +573,13 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
/* Init the unload section of the module. */
static void module_unload_init(struct module *mod)
{
- unsigned int i;
+ int cpu;
INIT_LIST_HEAD(&mod->modules_which_use_me);
- for (i = 0; i < NR_CPUS; i++)
- local_set(&mod->ref[i].count, 0);
+ for_each_possible_cpu(cpu)
+ local_set(__module_ref_addr(mod, cpu), 0);
/* Hold reference count during initialization. */
- local_set(&mod->ref[raw_smp_processor_id()].count, 1);
+ local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1);
/* Backwards compatibility macros put refcount during init. */
mod->waiter = current;
}
@@ -717,10 +717,11 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
unsigned int module_refcount(struct module *mod)
{
- unsigned int i, total = 0;
+ unsigned int total = 0;
+ int cpu;
- for (i = 0; i < NR_CPUS; i++)
- total += local_read(&mod->ref[i].count);
+ for_each_possible_cpu(cpu)
+ total += local_read(__module_ref_addr(mod, cpu));
return total;
}
EXPORT_SYMBOL(module_refcount);
@@ -894,7 +895,7 @@ void module_put(struct module *module)
{
if (module) {
unsigned int cpu = get_cpu();
- local_dec(&module->ref[cpu].count);
+ local_dec(__module_ref_addr(module, cpu));
/* Maybe they're waiting for us to drop reference? */
if (unlikely(!module_is_live(module)))
wake_up_process(module->waiter);
@@ -1464,7 +1465,10 @@ static void free_module(struct module *mod)
kfree(mod->args);
if (mod->percpu)
percpu_modfree(mod->percpu);
-
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ if (mod->refptr)
+ percpu_modfree(mod->refptr);
+#endif
/* Free lock-classes: */
lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -2011,6 +2015,14 @@ static noinline struct module *load_module(void __user *umod,
if (err < 0)
goto free_mod;
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
+ mod->name);
+ if (!mod->refptr) {
+ err = -ENOMEM;
+ goto free_mod;
+ }
+#endif
if (pcpuindex) {
/* We have a special allocation for this section. */
percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
@@ -2018,7 +2030,7 @@ static noinline struct module *load_module(void __user *umod,
mod->name);
if (!percpu) {
err = -ENOMEM;
- goto free_mod;
+ goto free_percpu;
}
sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
mod->percpu = percpu;
@@ -2282,6 +2294,9 @@ static noinline struct module *load_module(void __user *umod,
free_percpu:
if (percpu)
percpu_modfree(percpu);
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ percpu_modfree(mod->refptr);
+#endif
free_mod:
kfree(args);
free_hdr:
diff --git a/mm/mlock.c b/mm/mlock.c
index 2904a347e476..028ec482fdd4 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -294,14 +294,10 @@ static inline int __mlock_posix_error_return(long retval)
*
* return number of pages [> 0] to be removed from locked_vm on success
* of "special" vmas.
- *
- * return negative error if vma spanning @start-@range disappears while
- * mmap semaphore is dropped. Unlikely?
*/
long mlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
- struct mm_struct *mm = vma->vm_mm;
int nr_pages = (end - start) / PAGE_SIZE;
BUG_ON(!(vma->vm_flags & VM_LOCKED));
@@ -314,20 +310,8 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current))) {
- long error;
- downgrade_write(&mm->mmap_sem);
-
- error = __mlock_vma_pages_range(vma, start, end, 1);
- up_read(&mm->mmap_sem);
- /* vma can change or disappear */
- down_write(&mm->mmap_sem);
- vma = find_vma(mm, start);
- /* non-NULL vma must contain @start, but need to check @end */
- if (!vma || end > vma->vm_end)
- return -ENOMEM;
-
- return 0; /* hide other errors from mmap(), et al */
+ return __mlock_vma_pages_range(vma, start, end, 1);
}
/*
@@ -438,41 +422,14 @@ success:
vma->vm_flags = newflags;
if (lock) {
- /*
- * mmap_sem is currently held for write. Downgrade the write
- * lock to a read lock so that other faults, mmap scans, ...
- * while we fault in all pages.
- */
- downgrade_write(&mm->mmap_sem);
-
ret = __mlock_vma_pages_range(vma, start, end, 1);
- /*
- * Need to reacquire mmap sem in write mode, as our callers
- * expect this. We have no support for atomically upgrading
- * a sem to write, so we need to check for ranges while sem
- * is unlocked.
- */
- up_read(&mm->mmap_sem);
- /* vma can change or disappear */
- down_write(&mm->mmap_sem);
- *prev = find_vma(mm, start);
- /* non-NULL *prev must contain @start, but need to check @end */
- if (!(*prev) || end > (*prev)->vm_end)
- ret = -ENOMEM;
- else if (ret > 0) {
+ if (ret > 0) {
mm->locked_vm -= ret;
ret = 0;
} else
ret = __mlock_posix_error_return(ret); /* translate if needed */
} else {
- /*
- * TODO: for unlocking, pages will already be resident, so
- * we don't need to wait for allocations/reclaim/pagein, ...
- * However, unlocking a very large region can still take a
- * while. Should we downgrade the semaphore for both lock
- * AND unlock ?
- */
__mlock_vma_pages_range(vma, start, end, 0);
}