From 5167695753c63444a9e6cbbef136200a16c7a225 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 7 Dec 2010 14:18:20 +0100 Subject: perf: Fix duplicate events with multiple-pmu vs software events Because the multi-pmu bits can share contexts between struct pmu instances we could get duplicate events by iterating the pmu list. Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index de2c41758e29..4f1279e105ee 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -887,6 +887,7 @@ struct perf_cpu_context { int exclusive; struct list_head rotation_list; int jiffies_interval; + struct pmu *active_pmu; }; struct perf_output_handle { -- cgit v1.2.3 From 0f004f5a696a9434b7214d0d3cbd0525ee77d428 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 30 Nov 2010 19:48:45 +0100 Subject: sched: Cure more NO_HZ load average woes There's a long-running regression that proved difficult to fix and which is hitting certain people and is rather annoying in its effects. Damien reported that after 74f5187ac8 (sched: Cure load average vs NO_HZ woes) his load average is unnaturally high, he also noted that even with that patch reverted the load avgerage numbers are not correct. The problem is that the previous patch only solved half the NO_HZ problem, it addressed the part of going into NO_HZ mode, not of comming out of NO_HZ mode. This patch implements that missing half. When comming out of NO_HZ mode there are two important things to take care of: - Folding the pending idle delta into the global active count. - Correctly aging the averages for the idle-duration. So with this patch the NO_HZ interaction should be complete and behaviour between CONFIG_NO_HZ=[yn] should be equivalent. Furthermore, this patch slightly changes the load average computation by adding a rounding term to the fixed point multiplication. Reported-by: Damien Wyart Reported-by: Tim McGrath Tested-by: Damien Wyart Tested-by: Orion Poplawski Tested-by: Kyle McMartin Signed-off-by: Peter Zijlstra Cc: stable@kernel.org Cc: Chase Douglas LKML-Reference: <1291129145.32004.874.camel@laptop> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++---- kernel/timer.c | 2 +- 3 files changed, 141 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c79e921a68b..223874538b33 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -143,7 +143,7 @@ extern unsigned long nr_iowait_cpu(int cpu); extern unsigned long this_cpu_load(void); -extern void calc_global_load(void); +extern void calc_global_load(unsigned long ticks); extern unsigned long get_parent_ip(unsigned long addr); diff --git a/kernel/sched.c b/kernel/sched.c index dc91a4d09ac3..6b7c26a1a097 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3119,6 +3119,15 @@ static long calc_load_fold_active(struct rq *this_rq) return delta; } +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) +{ + load *= exp; + load += active * (FIXED_1 - exp); + load += 1UL << (FSHIFT - 1); + return load >> FSHIFT; +} + #ifdef CONFIG_NO_HZ /* * For NO_HZ we delay the active fold to the next LOAD_FREQ update. @@ -3148,6 +3157,128 @@ static long calc_load_fold_idle(void) return delta; } + +/** + * fixed_power_int - compute: x^n, in O(log n) time + * + * @x: base of the power + * @frac_bits: fractional bits of @x + * @n: power to raise @x to. + * + * By exploiting the relation between the definition of the natural power + * function: x^n := x*x*...*x (x multiplied by itself for n times), and + * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, + * (where: n_i \elem {0, 1}, the binary vector representing n), + * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is + * of course trivially computable in O(log_2 n), the length of our binary + * vector. + */ +static unsigned long +fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) +{ + unsigned long result = 1UL << frac_bits; + + if (n) for (;;) { + if (n & 1) { + result *= x; + result += 1UL << (frac_bits - 1); + result >>= frac_bits; + } + n >>= 1; + if (!n) + break; + x *= x; + x += 1UL << (frac_bits - 1); + x >>= frac_bits; + } + + return result; +} + +/* + * a1 = a0 * e + a * (1 - e) + * + * a2 = a1 * e + a * (1 - e) + * = (a0 * e + a * (1 - e)) * e + a * (1 - e) + * = a0 * e^2 + a * (1 - e) * (1 + e) + * + * a3 = a2 * e + a * (1 - e) + * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) + * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) + * + * ... + * + * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +static unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); +} + +/* + * NO_HZ can leave us missing all per-cpu ticks calling + * calc_load_account_active(), but since an idle CPU folds its delta into + * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold + * in the pending idle delta if our idle period crossed a load cycle boundary. + * + * Once we've updated the global active value, we need to apply the exponential + * weights adjusted to the number of cycles missed. + */ +static void calc_global_nohz(unsigned long ticks) +{ + long delta, active, n; + + if (time_before(jiffies, calc_load_update)) + return; + + /* + * If we crossed a calc_load_update boundary, make sure to fold + * any pending idle changes, the respective CPUs might have + * missed the tick driven calc_load_account_active() update + * due to NO_HZ. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + /* + * If we were idle for multiple load cycles, apply them. + */ + if (ticks >= LOAD_FREQ) { + n = ticks / LOAD_FREQ; + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + + calc_load_update += n * LOAD_FREQ; + } + + /* + * Its possible the remainder of the above division also crosses + * a LOAD_FREQ period, the regular check in calc_global_load() + * which comes after this will take care of that. + * + * Consider us being 11 ticks before a cycle completion, and us + * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will + * age us 4 cycles, and the test in calc_global_load() will + * pick up the final one. + */ +} #else static void calc_load_account_idle(struct rq *this_rq) { @@ -3157,6 +3288,10 @@ static inline long calc_load_fold_idle(void) { return 0; } + +static void calc_global_nohz(unsigned long ticks) +{ +} #endif /** @@ -3174,24 +3309,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) loads[2] = (avenrun[2] + offset) << shift; } -static unsigned long -calc_load(unsigned long load, unsigned long exp, unsigned long active) -{ - load *= exp; - load += active * (FIXED_1 - exp); - return load >> FSHIFT; -} - /* * calc_load - update the avenrun load estimates 10 ticks after the * CPUs have updated calc_load_tasks. */ -void calc_global_load(void) +void calc_global_load(unsigned long ticks) { - unsigned long upd = calc_load_update + 10; long active; - if (time_before(jiffies, upd)) + calc_global_nohz(ticks); + + if (time_before(jiffies, calc_load_update + 10)) return; active = atomic_long_read(&calc_load_tasks); diff --git a/kernel/timer.c b/kernel/timer.c index 68a9ae7679b7..7bd715fda974 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1319,7 +1319,7 @@ void do_timer(unsigned long ticks) { jiffies_64 += ticks; update_wall_time(); - calc_global_load(); + calc_global_load(ticks); } #ifdef __ARCH_WANT_SYS_ALARM -- cgit v1.2.3 From 53dde5f385bc56e312f78b7cb25ffaf8efd4735d Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 16 Nov 2010 13:23:50 -0800 Subject: bootmem: Add alloc_bootmem_align() Add an alloc_bootmem_align() interface to allocate bootmem with specified alignment. This is necessary to be able to allocate the xsave area in a subsequent patch. Signed-off-by: Suresh Siddha LKML-Reference: <20101116212441.977574826@sbsiddha-MOBL3.sc.intel.com> Acked-by: H. Peter Anvin Signed-off-by: H. Peter Anvin Cc: --- include/linux/bootmem.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 266ab9291232..499dfe982a0e 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -105,6 +105,8 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, #define alloc_bootmem(x) \ __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) +#define alloc_bootmem_align(x, align) \ + __alloc_bootmem(x, align, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_nopanic(x) \ __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)) #define alloc_bootmem_pages(x) \ -- cgit v1.2.3 From e692cb668fdd5a712c6ed2a2d6f2a36ee83997b4 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 1 Dec 2010 19:41:49 +0100 Subject: block: Deprecate QUEUE_FLAG_CLUSTER and use queue_limits instead When stacking devices, a request_queue is not always available. This forced us to have a no_cluster flag in the queue_limits that could be used as a carrier until the request_queue had been set up for a metadevice. There were several problems with that approach. First of all it was up to the stacking device to remember to set queue flag after stacking had completed. Also, the queue flag and the queue limits had to be kept in sync at all times. We got that wrong, which could lead to us issuing commands that went beyond the max scatterlist limit set by the driver. The proper fix is to avoid having two flags for tracking the same thing. We deprecate QUEUE_FLAG_CLUSTER and use the queue limit directly in the block layer merging functions. The queue_limit 'no_cluster' is turned into 'cluster' to avoid double negatives and to ease stacking. Clustering defaults to being enabled as before. The queue flag logic is removed from the stacking function, and explicitly setting the cluster flag is no longer necessary in DM and MD. Reported-by: Ed Lin Signed-off-by: Martin K. Petersen Acked-by: Mike Snitzer Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-merge.c | 6 +++--- block/blk-settings.c | 25 ++----------------------- block/blk-sysfs.c | 2 +- drivers/md/dm-table.c | 5 ----- drivers/md/md.c | 3 --- drivers/scsi/scsi_lib.c | 3 +-- include/linux/blkdev.h | 9 ++++++--- 7 files changed, 13 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/block/blk-merge.c b/block/blk-merge.c index 77b7c26df6b5..74bc4a768f32 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -21,7 +21,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, return 0; fbio = bio; - cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); + cluster = blk_queue_cluster(q); seg_size = 0; nr_phys_segs = 0; for_each_bio(bio) { @@ -87,7 +87,7 @@ EXPORT_SYMBOL(blk_recount_segments); static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, struct bio *nxt) { - if (!test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) + if (!blk_queue_cluster(q)) return 0; if (bio->bi_seg_back_size + nxt->bi_seg_front_size > @@ -123,7 +123,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, int nsegs, cluster; nsegs = 0; - cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); + cluster = blk_queue_cluster(q); /* * for each bio in rq diff --git a/block/blk-settings.c b/block/blk-settings.c index 701859fb9647..e55f5fc4ca22 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -126,7 +126,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->alignment_offset = 0; lim->io_opt = 0; lim->misaligned = 0; - lim->no_cluster = 0; + lim->cluster = 1; } EXPORT_SYMBOL(blk_set_default_limits); @@ -464,15 +464,6 @@ EXPORT_SYMBOL(blk_queue_io_opt); void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) { blk_stack_limits(&t->limits, &b->limits, 0); - - if (!t->queue_lock) - WARN_ON_ONCE(1); - else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { - unsigned long flags; - spin_lock_irqsave(t->queue_lock, flags); - queue_flag_clear(QUEUE_FLAG_CLUSTER, t); - spin_unlock_irqrestore(t->queue_lock, flags); - } } EXPORT_SYMBOL(blk_queue_stack_limits); @@ -545,7 +536,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->io_min = max(t->io_min, b->io_min); t->io_opt = lcm(t->io_opt, b->io_opt); - t->no_cluster |= b->no_cluster; + t->cluster &= b->cluster; t->discard_zeroes_data &= b->discard_zeroes_data; /* Physical block size a multiple of the logical block size? */ @@ -641,7 +632,6 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset) { struct request_queue *t = disk->queue; - struct request_queue *b = bdev_get_queue(bdev); if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) { char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; @@ -652,17 +642,6 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", top, bottom); } - - if (!t->queue_lock) - WARN_ON_ONCE(1); - else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { - unsigned long flags; - - spin_lock_irqsave(t->queue_lock, flags); - if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) - queue_flag_clear(QUEUE_FLAG_CLUSTER, t); - spin_unlock_irqrestore(t->queue_lock, flags); - } } EXPORT_SYMBOL(disk_stack_limits); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 013457f47fdc..41fb69150b4d 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -119,7 +119,7 @@ static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char * static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) { - if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) + if (blk_queue_cluster(q)) return queue_var_show(queue_max_segment_size(q), (page)); return queue_var_show(PAGE_CACHE_SIZE, (page)); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 90267f8d64ee..e2da1912a2cb 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1131,11 +1131,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, */ q->limits = *limits; - if (limits->no_cluster) - queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); - else - queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q); - if (!dm_table_supports_discards(t)) queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); else diff --git a/drivers/md/md.c b/drivers/md/md.c index 84c46a161927..52694d29663d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4296,9 +4296,6 @@ static int md_alloc(dev_t dev, char *name) goto abort; mddev->queue->queuedata = mddev; - /* Can be unlocked because the queue is new: no concurrency */ - queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue); - blk_queue_make_request(mddev->queue, md_make_request); disk = alloc_disk(1 << shift); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index eafeeda6e194..9d7ba07dc5ef 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1642,9 +1642,8 @@ struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost, blk_queue_max_segment_size(q, dma_get_max_seg_size(dev)); - /* New queue, no concurrency on queue_flags */ if (!shost->use_clustering) - queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); + q->limits.cluster = 0; /* * set a reasonable default alignment on word boundaries: the diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index aae86fd10c4f..95aeeeb49e8b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -250,7 +250,7 @@ struct queue_limits { unsigned char misaligned; unsigned char discard_misaligned; - unsigned char no_cluster; + unsigned char cluster; signed char discard_zeroes_data; }; @@ -380,7 +380,6 @@ struct request_queue #endif }; -#define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */ #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ #define QUEUE_FLAG_STOPPED 2 /* queue is stopped */ #define QUEUE_FLAG_SYNCFULL 3 /* read queue has been filled */ @@ -403,7 +402,6 @@ struct request_queue #define QUEUE_FLAG_SECDISCARD 19 /* supports SECDISCARD */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ - (1 << QUEUE_FLAG_CLUSTER) | \ (1 << QUEUE_FLAG_STACKABLE) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_ADD_RANDOM)) @@ -510,6 +508,11 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) #define rq_data_dir(rq) ((rq)->cmd_flags & 1) +static inline unsigned int blk_queue_cluster(struct request_queue *q) +{ + return q->limits.cluster; +} + /* * We regard a request as sync, if either a read or a sync write */ -- cgit v1.2.3 From 72d4cd9f38b5ed96b75df4c622be25e1c2648dd3 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 17 Dec 2010 08:34:20 +0100 Subject: block: max hardware sectors limit wrapper Implement blk_limits_max_hw_sectors() and make blk_queue_max_hw_sectors() a wrapper around it. DM needs this to avoid setting queue_limits' max_hw_sectors and max_sectors directly. dm_set_device_limits() now leverages blk_limits_max_hw_sectors() logic to establish the appropriate max_hw_sectors minimum (PAGE_SIZE). Fixes issue where DM was incorrectly setting max_sectors rather than max_hw_sectors (which caused dm_merge_bvec()'s max_hw_sectors check to be ineffective). Signed-off-by: Mike Snitzer Cc: stable@kernel.org Acked-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 26 ++++++++++++++++++++------ drivers/md/dm-table.c | 5 ++--- include/linux/blkdev.h | 1 + 3 files changed, 23 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/block/blk-settings.c b/block/blk-settings.c index e55f5fc4ca22..36c8c1f2af18 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -229,8 +229,8 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) EXPORT_SYMBOL(blk_queue_bounce_limit); /** - * blk_queue_max_hw_sectors - set max sectors for a request for this queue - * @q: the request queue for the device + * blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request + * @limits: the queue limits * @max_hw_sectors: max hardware sectors in the usual 512b unit * * Description: @@ -244,7 +244,7 @@ EXPORT_SYMBOL(blk_queue_bounce_limit); * per-device basis in /sys/block//queue/max_sectors_kb. * The soft limit can not exceed max_hw_sectors. **/ -void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) +void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors) { if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); @@ -252,9 +252,23 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto __func__, max_hw_sectors); } - q->limits.max_hw_sectors = max_hw_sectors; - q->limits.max_sectors = min_t(unsigned int, max_hw_sectors, - BLK_DEF_MAX_SECTORS); + limits->max_hw_sectors = max_hw_sectors; + limits->max_sectors = min_t(unsigned int, max_hw_sectors, + BLK_DEF_MAX_SECTORS); +} +EXPORT_SYMBOL(blk_limits_max_hw_sectors); + +/** + * blk_queue_max_hw_sectors - set max sectors for a request for this queue + * @q: the request queue for the device + * @max_hw_sectors: max hardware sectors in the usual 512b unit + * + * Description: + * See description for blk_limits_max_hw_sectors(). + **/ +void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) +{ + blk_limits_max_hw_sectors(&q->limits, max_hw_sectors); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e2da1912a2cb..4d705cea0f8c 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -517,9 +517,8 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, */ if (q->merge_bvec_fn && !ti->type->merge) - limits->max_sectors = - min_not_zero(limits->max_sectors, - (unsigned int) (PAGE_SIZE >> 9)); + blk_limits_max_hw_sectors(limits, + (unsigned int) (PAGE_SIZE >> 9)); return 0; } EXPORT_SYMBOL_GPL(dm_set_device_limits); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 95aeeeb49e8b..36ab42c9bb99 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -808,6 +808,7 @@ extern struct request_queue *blk_init_allocated_queue(struct request_queue *, extern void blk_cleanup_queue(struct request_queue *); extern void blk_queue_make_request(struct request_queue *, make_request_fn *); extern void blk_queue_bounce_limit(struct request_queue *, u64); +extern void blk_limits_max_hw_sectors(struct queue_limits *, unsigned int); extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_segments(struct request_queue *, unsigned short); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); -- cgit v1.2.3 From b6aa5901c7a2bd90d0b6b9866300d2648b2568f3 Mon Sep 17 00:00:00 2001 From: Henry C Chang Date: Wed, 15 Dec 2010 20:45:41 -0800 Subject: ceph: mark user pages dirty on direct-io reads For read operation, we have to set the argument _write_ of get_user_pages to 1 since we will write data to pages. Also, we need to SetPageDirty before releasing these pages. Signed-off-by: Henry C Chang Signed-off-by: Sage Weil --- fs/ceph/file.c | 8 ++++---- include/linux/ceph/libceph.h | 6 ++++-- net/ceph/pagevec.c | 11 +++++++---- 3 files changed, 15 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index e860d8f1bb45..7d0e4a82d898 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -384,7 +384,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, if (file->f_flags & O_DIRECT) { num_pages = calc_pages_for((unsigned long)data, len); - pages = ceph_get_direct_page_vector(data, num_pages); + pages = ceph_get_direct_page_vector(data, num_pages, true); } else { num_pages = calc_pages_for(off, len); pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); @@ -413,7 +413,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, done: if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages); + ceph_put_page_vector(pages, num_pages, true); else ceph_release_page_vector(pages, num_pages); dout("sync_read result %d\n", ret); @@ -522,7 +522,7 @@ more: return -ENOMEM; if (file->f_flags & O_DIRECT) { - pages = ceph_get_direct_page_vector(data, num_pages); + pages = ceph_get_direct_page_vector(data, num_pages, false); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; @@ -572,7 +572,7 @@ more: } if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages); + ceph_put_page_vector(pages, num_pages, false); else if (file->f_flags & O_SYNC) ceph_release_page_vector(pages, num_pages); diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 9e76d35670d2..72c72bfccb88 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -227,8 +227,10 @@ extern int ceph_open_session(struct ceph_client *client); extern void ceph_release_page_vector(struct page **pages, int num_pages); extern struct page **ceph_get_direct_page_vector(const char __user *data, - int num_pages); -extern void ceph_put_page_vector(struct page **pages, int num_pages); + int num_pages, + bool write_page); +extern void ceph_put_page_vector(struct page **pages, int num_pages, + bool dirty); extern void ceph_release_page_vector(struct page **pages, int num_pages); extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); extern int ceph_copy_user_to_page_vector(struct page **pages, diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index ac34feeb2b3a..01947a5d03b7 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -13,7 +13,7 @@ * build a vector of user pages */ struct page **ceph_get_direct_page_vector(const char __user *data, - int num_pages) + int num_pages, bool write_page) { struct page **pages; int rc; @@ -24,7 +24,7 @@ struct page **ceph_get_direct_page_vector(const char __user *data, down_read(¤t->mm->mmap_sem); rc = get_user_pages(current, current->mm, (unsigned long)data, - num_pages, 0, 0, pages, NULL); + num_pages, write_page, 0, pages, NULL); up_read(¤t->mm->mmap_sem); if (rc < 0) goto fail; @@ -36,12 +36,15 @@ fail: } EXPORT_SYMBOL(ceph_get_direct_page_vector); -void ceph_put_page_vector(struct page **pages, int num_pages) +void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty) { int i; - for (i = 0; i < num_pages; i++) + for (i = 0; i < num_pages; i++) { + if (dirty) + set_page_dirty_lock(pages[i]); put_page(pages[i]); + } kfree(pages); } EXPORT_SYMBOL(ceph_put_page_vector); -- cgit v1.2.3 From c0f5ac5426f7fd82b23dd5c6a1e633b290294a08 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 16 Dec 2010 10:38:41 -0700 Subject: Revert "resources: support allocating space within a region from the top down" This reverts commit e7f8567db9a7f6b3151b0b275e245c1cef0d9c70. Acked-by: H. Peter Anvin Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- Documentation/kernel-parameters.txt | 5 -- include/linux/ioport.h | 1 - kernel/resource.c | 98 ++----------------------------------- 3 files changed, 4 insertions(+), 100 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index cdd2a6e8a3b7..8b61c9360999 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2175,11 +2175,6 @@ and is between 256 and 4096 characters. It is defined in the file reset_devices [KNL] Force drivers to reset the underlying device during initialization. - resource_alloc_from_bottom - Allocate new resources from the beginning of available - space, not the end. If you need to use this, please - report a bug. - resume= [SWSUSP] Specify the partition device for software suspend diff --git a/include/linux/ioport.h b/include/linux/ioport.h index d377ea815d45..b22790268b64 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -112,7 +112,6 @@ struct resource_list { /* PC/ISA/whatever - the normal PC address spaces: IO and memory */ extern struct resource ioport_resource; extern struct resource iomem_resource; -extern int resource_alloc_from_bottom; extern struct resource *request_resource_conflict(struct resource *root, struct resource *new); extern int request_resource(struct resource *root, struct resource *new); diff --git a/kernel/resource.c b/kernel/resource.c index 9fad33efd0db..560659f7baef 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -40,23 +40,6 @@ EXPORT_SYMBOL(iomem_resource); static DEFINE_RWLOCK(resource_lock); -/* - * By default, we allocate free space bottom-up. The architecture can request - * top-down by clearing this flag. The user can override the architecture's - * choice with the "resource_alloc_from_bottom" kernel boot option, but that - * should only be a debugging tool. - */ -int resource_alloc_from_bottom = 1; - -static __init int setup_alloc_from_bottom(char *s) -{ - printk(KERN_INFO - "resource: allocating from bottom-up; please report a bug\n"); - resource_alloc_from_bottom = 1; - return 0; -} -early_param("resource_alloc_from_bottom", setup_alloc_from_bottom); - static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; @@ -396,75 +379,8 @@ static bool resource_contains(struct resource *res1, struct resource *res2) return res1->start <= res2->start && res1->end >= res2->end; } -/* - * Find the resource before "child" in the sibling list of "root" children. - */ -static struct resource *find_sibling_prev(struct resource *root, struct resource *child) -{ - struct resource *this; - - for (this = root->child; this; this = this->sibling) - if (this->sibling == child) - return this; - - return NULL; -} - -/* - * Find empty slot in the resource tree given range and alignment. - * This version allocates from the end of the root resource first. - */ -static int find_resource_from_top(struct resource *root, struct resource *new, - resource_size_t size, resource_size_t min, - resource_size_t max, resource_size_t align, - resource_size_t (*alignf)(void *, - const struct resource *, - resource_size_t, - resource_size_t), - void *alignf_data) -{ - struct resource *this; - struct resource tmp, avail, alloc; - - tmp.start = root->end; - tmp.end = root->end; - - this = find_sibling_prev(root, NULL); - for (;;) { - if (this) { - if (this->end < root->end) - tmp.start = this->end + 1; - } else - tmp.start = root->start; - - resource_clip(&tmp, min, max); - - /* Check for overflow after ALIGN() */ - avail = *new; - avail.start = ALIGN(tmp.start, align); - avail.end = tmp.end; - if (avail.start >= tmp.start) { - alloc.start = alignf(alignf_data, &avail, size, align); - alloc.end = alloc.start + size - 1; - if (resource_contains(&avail, &alloc)) { - new->start = alloc.start; - new->end = alloc.end; - return 0; - } - } - - if (!this || this->start == root->start) - break; - - tmp.end = this->start - 1; - this = find_sibling_prev(root, this); - } - return -EBUSY; -} - /* * Find empty slot in the resource tree given range and alignment. - * This version allocates from the beginning of the root resource first. */ static int find_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, @@ -480,15 +396,14 @@ static int find_resource(struct resource *root, struct resource *new, tmp.start = root->start; /* - * Skip past an allocated resource that starts at 0, since the - * assignment of this->start - 1 to tmp->end below would cause an - * underflow. + * Skip past an allocated resource that starts at 0, since the assignment + * of this->start - 1 to tmp->end below would cause an underflow. */ if (this && this->start == 0) { tmp.start = this->end + 1; this = this->sibling; } - for (;;) { + for(;;) { if (this) tmp.end = this->start - 1; else @@ -509,10 +424,8 @@ static int find_resource(struct resource *root, struct resource *new, return 0; } } - if (!this) break; - tmp.start = this->end + 1; this = this->sibling; } @@ -545,10 +458,7 @@ int allocate_resource(struct resource *root, struct resource *new, alignf = simple_align_resource; write_lock(&resource_lock); - if (resource_alloc_from_bottom) - err = find_resource(root, new, size, min, max, align, alignf, alignf_data); - else - err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data); + err = find_resource(root, new, size, min, max, align, alignf, alignf_data); if (err >= 0 && __request_resource(root, new)) err = -EBUSY; write_unlock(&resource_lock); -- cgit v1.2.3 From fcb119183c73bf0781009713f303e28b1fb13d3e Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 16 Dec 2010 10:38:46 -0700 Subject: resources: add arch hook for preventing allocation in reserved areas This adds arch_remove_reservations(), which an arch can implement if it needs to protect part of the address space from allocation. Sometimes that can be done by just putting a region in the resource tree, but there are cases where that doesn't work well. For example, x86 BIOS E820 reservations are not related to devices, so they may overlap part of, all of, or more than a device resource, so they may not end up at the correct spot in the resource tree. Acked-by: H. Peter Anvin Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- include/linux/ioport.h | 1 + kernel/resource.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index b22790268b64..e9bb22cba764 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -123,6 +123,7 @@ extern void reserve_region_with_split(struct resource *root, extern struct resource *insert_resource_conflict(struct resource *parent, struct resource *new); extern int insert_resource(struct resource *parent, struct resource *new); extern void insert_resource_expand_to_fit(struct resource *root, struct resource *new); +extern void arch_remove_reservations(struct resource *avail); extern int allocate_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, resource_size_t max, resource_size_t align, diff --git a/kernel/resource.c b/kernel/resource.c index 560659f7baef..798e2fae2a06 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -357,6 +357,10 @@ int __weak page_is_ram(unsigned long pfn) return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1; } +void __weak arch_remove_reservations(struct resource *avail) +{ +} + static resource_size_t simple_align_resource(void *data, const struct resource *avail, resource_size_t size, @@ -394,6 +398,7 @@ static int find_resource(struct resource *root, struct resource *new, struct resource *this = root->child; struct resource tmp = *new, avail, alloc; + tmp.flags = new->flags; tmp.start = root->start; /* * Skip past an allocated resource that starts at 0, since the assignment @@ -410,6 +415,7 @@ static int find_resource(struct resource *root, struct resource *new, tmp.end = root->end; resource_clip(&tmp, min, max); + arch_remove_reservations(&tmp); /* Check for overflow after ALIGN() */ avail = *new; -- cgit v1.2.3 From 4b8fe66300acb2fba8b16d62606e0d30204022fc Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Fri, 17 Dec 2010 12:03:14 -0800 Subject: netlink: fix gcc -Wconversion compilation warning $ cat << EOF | gcc -Wconversion -xc -S -o/dev/null - unsigned f(void) {return NLMSG_HDRLEN;} EOF : In function 'f': :3:26: warning: negative integer implicitly converted to unsigned type Signed-off-by: Dmitry V. Levin Signed-off-by: Kirill A. Shutemov Signed-off-by: David S. Miller --- include/linux/netlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 123566912d73..e2b9e63afa68 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -70,7 +70,7 @@ struct nlmsghdr { Check NLM_F_EXCL */ -#define NLMSG_ALIGNTO 4 +#define NLMSG_ALIGNTO 4U #define NLMSG_ALIGN(len) ( ((len)+NLMSG_ALIGNTO-1) & ~(NLMSG_ALIGNTO-1) ) #define NLMSG_HDRLEN ((int) NLMSG_ALIGN(sizeof(struct nlmsghdr))) #define NLMSG_LENGTH(len) ((len)+NLMSG_ALIGN(NLMSG_HDRLEN)) -- cgit v1.2.3 From b8da46d3d55807037b58f14621a0949f18053bde Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 20 Dec 2010 00:29:32 -0500 Subject: clarify a usage constraint for cnt32_to_63() The cnt32_to_63 algorithm relies on proper counter data evaluation ordering to work properly. This was missing from the provided documentation. Let's augment the documentation with the missing usage constraint and fix the only instance that got it wrong. Signed-off-by: Nicolas Pitre Acked-by: David Howells Signed-off-by: Linus Torvalds --- arch/mn10300/kernel/time.c | 10 +++------- include/linux/cnt32_to_63.h | 20 +++++++++++++++++++- 2 files changed, 22 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/mn10300/kernel/time.c b/arch/mn10300/kernel/time.c index f860a340acc9..75da468090b9 100644 --- a/arch/mn10300/kernel/time.c +++ b/arch/mn10300/kernel/time.c @@ -40,21 +40,17 @@ unsigned long long sched_clock(void) unsigned long long ll; unsigned l[2]; } tsc64, result; - unsigned long tsc, tmp; + unsigned long tmp; unsigned product[3]; /* 96-bit intermediate value */ /* cnt32_to_63() is not safe with preemption */ preempt_disable(); - /* read the TSC value - */ - tsc = get_cycles(); - - /* expand to 64-bits. + /* expand the tsc to 64-bits. * - sched_clock() must be called once a minute or better or the * following will go horribly wrong - see cnt32_to_63() */ - tsc64.ll = cnt32_to_63(tsc) & 0x7fffffffffffffffULL; + tsc64.ll = cnt32_to_63(get_cycles()) & 0x7fffffffffffffffULL; preempt_enable(); diff --git a/include/linux/cnt32_to_63.h b/include/linux/cnt32_to_63.h index 7605fdd1eb65..e3d8bf26e5eb 100644 --- a/include/linux/cnt32_to_63.h +++ b/include/linux/cnt32_to_63.h @@ -61,13 +61,31 @@ union cnt32_to_63 { * * 2) this code must not be preempted for a duration longer than the * 32-bit counter half period minus the longest period between two - * calls to this code. + * calls to this code; * * Those requirements ensure proper update to the state bit in memory. * This is usually not a problem in practice, but if it is then a kernel * timer should be scheduled to manage for this code to be executed often * enough. * + * And finally: + * + * 3) the cnt_lo argument must be seen as a globally incrementing value, + * meaning that it should be a direct reference to the counter data which + * can be evaluated according to a specific ordering within the macro, + * and not the result of a previous evaluation stored in a variable. + * + * For example, this is wrong: + * + * u32 partial = get_hw_count(); + * u64 full = cnt32_to_63(partial); + * return full; + * + * This is fine: + * + * u64 full = cnt32_to_63(get_hw_count()); + * return full; + * * Note that the top bit (bit 63) in the returned value should be considered * as garbage. It is not cleared here because callers are likely to use a * multiplier on the returned value which can get rid of the top bit -- cgit v1.2.3 From 4f32e9b1f812fd6c00cc85a127583fefbdedaedc Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Wed, 22 Dec 2010 10:27:53 +0100 Subject: kthread_work: make lockdep happy spinlock in kthread_worker and wait_queue_head in kthread_work both should be lockdep sensible, so change the interface to make it suiltable for CONFIG_LOCKDEP. tj: comment update Reported-by: Nicolas Signed-off-by: Yong Zhang Signed-off-by: Andy Walls Tested-by: Andy Walls Cc: Tejun Heo Cc: Andrew Morton Signed-off-by: Tejun Heo --- include/linux/kthread.h | 45 +++++++++++++++++++++++++++++++++++---------- kernel/kthread.c | 11 +++++++++++ 2 files changed, 46 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 685ea65eb803..ce0775aa64c3 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -81,16 +81,41 @@ struct kthread_work { #define DEFINE_KTHREAD_WORK(work, fn) \ struct kthread_work work = KTHREAD_WORK_INIT(work, fn) -static inline void init_kthread_worker(struct kthread_worker *worker) -{ - *worker = (struct kthread_worker)KTHREAD_WORKER_INIT(*worker); -} - -static inline void init_kthread_work(struct kthread_work *work, - kthread_work_func_t fn) -{ - *work = (struct kthread_work)KTHREAD_WORK_INIT(*work, fn); -} +/* + * kthread_worker.lock and kthread_work.done need their own lockdep class + * keys if they are defined on stack with lockdep enabled. Use the + * following macros when defining them on stack. + */ +#ifdef CONFIG_LOCKDEP +# define KTHREAD_WORKER_INIT_ONSTACK(worker) \ + ({ init_kthread_worker(&worker); worker; }) +# define DEFINE_KTHREAD_WORKER_ONSTACK(worker) \ + struct kthread_worker worker = KTHREAD_WORKER_INIT_ONSTACK(worker) +# define KTHREAD_WORK_INIT_ONSTACK(work, fn) \ + ({ init_kthread_work((&work), fn); work; }) +# define DEFINE_KTHREAD_WORK_ONSTACK(work, fn) \ + struct kthread_work work = KTHREAD_WORK_INIT_ONSTACK(work, fn) +#else +# define DEFINE_KTHREAD_WORKER_ONSTACK(worker) DEFINE_KTHREAD_WORKER(worker) +# define DEFINE_KTHREAD_WORK_ONSTACK(work, fn) DEFINE_KTHREAD_WORK(work, fn) +#endif + +extern void __init_kthread_worker(struct kthread_worker *worker, + const char *name, struct lock_class_key *key); + +#define init_kthread_worker(worker) \ + do { \ + static struct lock_class_key __key; \ + __init_kthread_worker((worker), "("#worker")->lock", &__key); \ + } while (0) + +#define init_kthread_work(work, fn) \ + do { \ + memset((work), 0, sizeof(struct kthread_work)); \ + INIT_LIST_HEAD(&(work)->node); \ + (work)->func = (fn); \ + init_waitqueue_head(&(work)->done); \ + } while (0) int kthread_worker_fn(void *worker_ptr); diff --git a/kernel/kthread.c b/kernel/kthread.c index 2dc3786349d1..ca61bbdd44b2 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -265,6 +265,17 @@ int kthreadd(void *unused) return 0; } +void __init_kthread_worker(struct kthread_worker *worker, + const char *name, + struct lock_class_key *key) +{ + spin_lock_init(&worker->lock); + lockdep_set_class_and_name(&worker->lock, key, name); + INIT_LIST_HEAD(&worker->work_list); + worker->task = NULL; +} +EXPORT_SYMBOL_GPL(__init_kthread_worker); + /** * kthread_worker_fn - kthread function to process kthread_worker * @worker_ptr: pointer to initialized kthread_worker -- cgit v1.2.3 From 4e06fd14d5fa78826397c891654a37e5a36ee827 Mon Sep 17 00:00:00 2001 From: Will Newton Date: Tue, 21 Dec 2010 17:24:29 -0800 Subject: include/linux/unaligned: pack the whole struct rather than just the field The current packed struct implementation of unaligned access adds the packed attribute only to the field within the unaligned struct rather than to the struct as a whole. This is not sufficient to enforce proper behaviour on architectures with a default struct alignment of more than one byte. For example, the current implementation of __get_unaligned_cpu16 when compiled for arm with gcc -O1 -mstructure-size-boundary=32 assumes the struct is on a 4 byte boundary so performs the load of the 16bit packed field as if it were on a 4 byte boundary: __get_unaligned_cpu16: ldrh r0, [r0, #0] bx lr Moving the packed attribute to the struct rather than the field causes the proper unaligned access code to be generated: __get_unaligned_cpu16: ldrb r3, [r0, #0] @ zero_extendqisi2 ldrb r0, [r0, #1] @ zero_extendqisi2 orr r0, r3, r0, asl #8 bx lr Signed-off-by: Will Newton Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/unaligned/packed_struct.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/unaligned/packed_struct.h b/include/linux/unaligned/packed_struct.h index 2498bb9fe002..c9a6abd972a1 100644 --- a/include/linux/unaligned/packed_struct.h +++ b/include/linux/unaligned/packed_struct.h @@ -3,9 +3,9 @@ #include -struct __una_u16 { u16 x __attribute__((packed)); }; -struct __una_u32 { u32 x __attribute__((packed)); }; -struct __una_u64 { u64 x __attribute__((packed)); }; +struct __una_u16 { u16 x; } __attribute__((packed)); +struct __una_u32 { u32 x; } __attribute__((packed)); +struct __una_u64 { u64 x; } __attribute__((packed)); static inline u16 __get_unaligned_cpu16(const void *p) { -- cgit v1.2.3 From 4be2c95d1f7706ca0e74499f2bd118e1cee19669 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 21 Dec 2010 17:24:30 -0800 Subject: taskstats: pad taskstats netlink response for aligment issues on ia64 The taskstats structure is internally aligned on 8 byte boundaries but the layout of the aggregrate reply, with two NLA headers and the pid (each 4 bytes), actually force the entire structure to be unaligned. This causes the kernel to issue unaligned access warnings on some architectures like ia64. Unfortunately, some software out there doesn't properly unroll the NLA packet and assumes that the start of the taskstats structure will always be 20 bytes from the start of the netlink payload. Aligning the start of the taskstats structure breaks this software, which we don't want. So, for now the alignment only happens on architectures that require it and those users will have to update to fixed versions of those packages. Space is reserved in the packet only when needed. This ifdef should be removed in several years e.g. 2012 once we can be confident that fixed versions are installed on most systems. We add the padding before the aggregate since the aggregate is already a defined type. Commit 85893120 ("delayacct: align to 8 byte boundary on 64-bit systems") previously addressed the alignment issues by padding out the pid field. This was supposed to be a compatible change but the circumstances described above mean that it wasn't. This patch backs out that change, since it was a hack, and introduces a new NULL attribute type to provide the padding. Padding the response with 4 bytes avoids allocating an aligned taskstats structure and copying it back. Since the structure weighs in at 328 bytes, it's too big to do it on the stack. Signed-off-by: Jeff Mahoney Reported-by: Brian Rogers Cc: Jeff Mahoney Cc: Guillaume Chazarain Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/accounting/getdelays.c | 1 + include/linux/taskstats.h | 3 +- kernel/taskstats.c | 57 ++++++++++++++++++++++++++++-------- 3 files changed, 47 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/Documentation/accounting/getdelays.c b/Documentation/accounting/getdelays.c index a2976a6de033..e9c77788a39d 100644 --- a/Documentation/accounting/getdelays.c +++ b/Documentation/accounting/getdelays.c @@ -516,6 +516,7 @@ int main(int argc, char *argv[]) default: fprintf(stderr, "Unknown nla_type %d\n", na->nla_type); + case TASKSTATS_TYPE_NULL: break; } na = (struct nlattr *) (GENLMSG_DATA(&msg) + len); diff --git a/include/linux/taskstats.h b/include/linux/taskstats.h index 341dddb55090..2466e550a41d 100644 --- a/include/linux/taskstats.h +++ b/include/linux/taskstats.h @@ -33,7 +33,7 @@ */ -#define TASKSTATS_VERSION 7 +#define TASKSTATS_VERSION 8 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ @@ -188,6 +188,7 @@ enum { TASKSTATS_TYPE_STATS, /* taskstats structure */ TASKSTATS_TYPE_AGGR_PID, /* contains pid + stats */ TASKSTATS_TYPE_AGGR_TGID, /* contains tgid + stats */ + TASKSTATS_TYPE_NULL, /* contains nothing */ __TASKSTATS_TYPE_MAX, }; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index c8231fb15708..3308fd7f1b52 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -349,25 +349,47 @@ static int parse(struct nlattr *na, struct cpumask *mask) return ret; } +#ifdef CONFIG_IA64 +#define TASKSTATS_NEEDS_PADDING 1 +#endif + static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) { struct nlattr *na, *ret; int aggr; - /* If we don't pad, we end up with alignment on a 4 byte boundary. - * This causes lots of runtime warnings on systems requiring 8 byte - * alignment */ - u32 pids[2] = { pid, 0 }; - int pid_size = ALIGN(sizeof(pid), sizeof(long)); - aggr = (type == TASKSTATS_TYPE_PID) ? TASKSTATS_TYPE_AGGR_PID : TASKSTATS_TYPE_AGGR_TGID; + /* + * The taskstats structure is internally aligned on 8 byte + * boundaries but the layout of the aggregrate reply, with + * two NLA headers and the pid (each 4 bytes), actually + * force the entire structure to be unaligned. This causes + * the kernel to issue unaligned access warnings on some + * architectures like ia64. Unfortunately, some software out there + * doesn't properly unroll the NLA packet and assumes that the start + * of the taskstats structure will always be 20 bytes from the start + * of the netlink payload. Aligning the start of the taskstats + * structure breaks this software, which we don't want. So, for now + * the alignment only happens on architectures that require it + * and those users will have to update to fixed versions of those + * packages. Space is reserved in the packet only when needed. + * This ifdef should be removed in several years e.g. 2012 once + * we can be confident that fixed versions are installed on most + * systems. We add the padding before the aggregate since the + * aggregate is already a defined type. + */ +#ifdef TASKSTATS_NEEDS_PADDING + if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0) + goto err; +#endif na = nla_nest_start(skb, aggr); if (!na) goto err; - if (nla_put(skb, type, pid_size, pids) < 0) + + if (nla_put(skb, type, sizeof(pid), &pid) < 0) goto err; ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); if (!ret) @@ -456,6 +478,18 @@ out: return rc; } +static size_t taskstats_packet_size(void) +{ + size_t size; + + size = nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); +#ifdef TASKSTATS_NEEDS_PADDING + size += nla_total_size(0); /* Padding for alignment */ +#endif + return size; +} + static int cmd_attr_pid(struct genl_info *info) { struct taskstats *stats; @@ -464,8 +498,7 @@ static int cmd_attr_pid(struct genl_info *info) u32 pid; int rc; - size = nla_total_size(sizeof(u32)) + - nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + size = taskstats_packet_size(); rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); if (rc < 0) @@ -494,8 +527,7 @@ static int cmd_attr_tgid(struct genl_info *info) u32 tgid; int rc; - size = nla_total_size(sizeof(u32)) + - nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + size = taskstats_packet_size(); rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); if (rc < 0) @@ -570,8 +602,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) /* * Size includes space for nested attributes */ - size = nla_total_size(sizeof(u32)) + - nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + size = taskstats_packet_size(); is_thread_group = !!taskstats_tgid_alloc(tsk); if (is_thread_group) { -- cgit v1.2.3