From e03826d5045e81a66a4fad7be9a8ecdaeb7911cf Mon Sep 17 00:00:00 2001 From: Keerthy Date: Tue, 17 Mar 2015 15:56:04 +0530 Subject: regulator: palmas: Correct TPS659038 register definition for REGEN2 The register offset for REGEN2_CTRL in different for TPS659038 chip as when compared with other Palmas family PMICs. In the case of TPS659038 the wrong offset pointed to PLLEN_CTRL and was causing a hang. Correcting the same. Signed-off-by: Keerthy Signed-off-by: Mark Brown Cc: stable@vger.kernel.org --- include/linux/mfd/palmas.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/palmas.h b/include/linux/mfd/palmas.h index fb0390a1a498..ee7b1ce7a6f8 100644 --- a/include/linux/mfd/palmas.h +++ b/include/linux/mfd/palmas.h @@ -2999,6 +2999,9 @@ enum usb_irq_events { #define PALMAS_GPADC_TRIM15 0x0E #define PALMAS_GPADC_TRIM16 0x0F +/* TPS659038 regen2_ctrl offset iss different from palmas */ +#define TPS659038_REGEN2_CTRL 0x12 + /* TPS65917 Interrupt registers */ /* Registers for function INTERRUPT */ -- cgit v1.2.3 From a2f4870697a5bcf4a87073ec6b32dd2928c1211d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 17 Mar 2015 12:23:19 -0400 Subject: fs: make sure the timestamps for lazytime inodes eventually get written Jan Kara pointed out that if there is an inode which is constantly getting dirtied with I_DIRTY_PAGES, an inode with an updated timestamp will never be written since inode->dirtied_when is constantly getting updated. We fix this by adding an extra field to the inode, dirtied_time_when, so inodes with a stale dirtytime can get detected and handled. In addition, if we have a dirtytime inode caused by an atime update, and there is no write activity on the file system, we need to have a secondary system to make sure these inodes get written out. We do this by setting up a second delayed work structure which wakes up the CPU much more rarely compared to writeback_expire_centisecs. Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/fs-writeback.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++------- include/linux/fs.h | 1 + 2 files changed, 73 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e907052eeadb..2cfcd74faf87 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -53,6 +53,18 @@ struct wb_writeback_work { struct completion *done; /* set if the caller waits */ }; +/* + * If an inode is constantly having its pages dirtied, but then the + * updates stop dirtytime_expire_interval seconds in the past, it's + * possible for the worst case time between when an inode has its + * timestamps updated and when they finally get written out to be two + * dirtytime_expire_intervals. We set the default to 12 hours (in + * seconds), which means most of the time inodes will have their + * timestamps written to disk after 12 hours, but in the worst case a + * few inodes might not their timestamps updated for 24 hours. + */ +unsigned int dirtytime_expire_interval = 12 * 60 * 60; + /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. @@ -275,8 +287,8 @@ static int move_expired_inodes(struct list_head *delaying_queue, if ((flags & EXPIRE_DIRTY_ATIME) == 0) older_than_this = work->older_than_this; - else if ((work->reason == WB_REASON_SYNC) == 0) { - expire_time = jiffies - (HZ * 86400); + else if (!work->for_sync) { + expire_time = jiffies - (dirtytime_expire_interval * HZ); older_than_this = &expire_time; } while (!list_empty(delaying_queue)) { @@ -458,6 +470,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, */ redirty_tail(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { + inode->dirtied_when = jiffies; list_move(&inode->i_wb_list, &wb->b_dirty_time); } else { /* The inode is clean. Remove from writeback lists. */ @@ -505,12 +518,17 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; - if (((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) && - (inode->i_state & I_DIRTY_TIME)) || - (inode->i_state & I_DIRTY_TIME_EXPIRED)) { - dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED; - trace_writeback_lazytime(inode); - } + if (inode->i_state & I_DIRTY_TIME) { + if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || + unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) || + unlikely(time_after(jiffies, + (inode->dirtied_time_when + + dirtytime_expire_interval * HZ)))) { + dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED; + trace_writeback_lazytime(inode); + } + } else + inode->i_state &= ~I_DIRTY_TIME_EXPIRED; inode->i_state &= ~dirty; /* @@ -1131,6 +1149,45 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) rcu_read_unlock(); } +/* + * Wake up bdi's periodically to make sure dirtytime inodes gets + * written back periodically. We deliberately do *not* check the + * b_dirtytime list in wb_has_dirty_io(), since this would cause the + * kernel to be constantly waking up once there are any dirtytime + * inodes on the system. So instead we define a separate delayed work + * function which gets called much more rarely. (By default, only + * once every 12 hours.) + * + * If there is any other write activity going on in the file system, + * this function won't be necessary. But if the only thing that has + * happened on the file system is a dirtytime inode caused by an atime + * update, we need this infrastructure below to make sure that inode + * eventually gets pushed out to disk. + */ +static void wakeup_dirtytime_writeback(struct work_struct *w); +static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback); + +static void wakeup_dirtytime_writeback(struct work_struct *w) +{ + struct backing_dev_info *bdi; + + rcu_read_lock(); + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { + if (list_empty(&bdi->wb.b_dirty_time)) + continue; + bdi_wakeup_thread(bdi); + } + rcu_read_unlock(); + schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); +} + +static int __init start_dirtytime_writeback(void) +{ + schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); + return 0; +} +__initcall(start_dirtytime_writeback); + static noinline void block_dump___mark_inode_dirty(struct inode *inode) { if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { @@ -1269,8 +1326,13 @@ void __mark_inode_dirty(struct inode *inode, int flags) } inode->dirtied_when = jiffies; - list_move(&inode->i_wb_list, dirtytime ? - &bdi->wb.b_dirty_time : &bdi->wb.b_dirty); + if (dirtytime) + inode->dirtied_time_when = jiffies; + if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES)) + list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + else + list_move(&inode->i_wb_list, + &bdi->wb.b_dirty_time); spin_unlock(&bdi->wb.list_lock); trace_writeback_dirty_inode_enqueue(inode); diff --git a/include/linux/fs.h b/include/linux/fs.h index b4d71b5e1ff2..f4131e8ead74 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -604,6 +604,7 @@ struct inode { struct mutex i_mutex; unsigned long dirtied_when; /* jiffies of first dirtying */ + unsigned long dirtied_time_when; struct hlist_node i_hash; struct list_head i_wb_list; /* backing dev IO list */ -- cgit v1.2.3 From 1efff914afac8a965ad63817ecf8861a927c2ace Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 17 Mar 2015 12:23:32 -0400 Subject: fs: add dirtytime_expire_seconds sysctl Add a tuning knob so we can adjust the dirtytime expiration timeout, which is very useful for testing lazytime. Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/fs-writeback.c | 11 +++++++++++ include/linux/writeback.h | 3 +++ kernel/sysctl.c | 8 ++++++++ 3 files changed, 22 insertions(+) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2cfcd74faf87..32a8bbd7a9ad 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1188,6 +1188,17 @@ static int __init start_dirtytime_writeback(void) } __initcall(start_dirtytime_writeback); +int dirtytime_interval_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + mod_delayed_work(system_wq, &dirtytime_work, 0); + return ret; +} + static noinline void block_dump___mark_inode_dirty(struct inode *inode) { if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 00048339c23e..b2dd371ec0ca 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -130,6 +130,7 @@ extern int vm_dirty_ratio; extern unsigned long vm_dirty_bytes; extern unsigned int dirty_writeback_interval; extern unsigned int dirty_expire_interval; +extern unsigned int dirtytime_expire_interval; extern int vm_highmem_is_dirtyable; extern int block_dump; extern int laptop_mode; @@ -146,6 +147,8 @@ extern int dirty_ratio_handler(struct ctl_table *table, int write, extern int dirty_bytes_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +int dirtytime_interval_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); struct ctl_table; int dirty_writeback_centisecs_handler(struct ctl_table *, int, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 88ea2d6e0031..ce410bb9f2e1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1227,6 +1227,14 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &zero, }, + { + .procname = "dirtytime_expire_seconds", + .data = &dirtytime_expire_interval, + .maxlen = sizeof(dirty_expire_interval), + .mode = 0644, + .proc_handler = dirtytime_interval_handler, + .extra1 = &zero, + }, { .procname = "nr_pdflush_threads", .mode = 0444 /* read-only */, -- cgit v1.2.3 From cf39284d41f67964cf42b21bb386c012cf5b7f65 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Wed, 18 Mar 2015 08:57:41 +0800 Subject: regulator: Fix documentation for regmap in the config dev_get_regulator() does not exist, fix the typo. Signed-off-by: Axel Lin Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index d4ad5b5a02bb..045f709cb89b 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -316,7 +316,7 @@ struct regulator_desc { * @driver_data: private regulator data * @of_node: OpenFirmware node to parse for device tree bindings (may be * NULL). - * @regmap: regmap to use for core regmap helpers if dev_get_regulator() is + * @regmap: regmap to use for core regmap helpers if dev_get_regmap() is * insufficient. * @ena_gpio_initialized: GPIO controlling regulator enable was properly * initialized, meaning that >= 0 is a valid gpio -- cgit v1.2.3 From 5067c0469c643512f24786990e315f9c15cc7d24 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 12 Mar 2015 10:32:18 -0700 Subject: ata: Add a new flag to destinguish sas controller SAS controller has its own tag allocation, which doesn't directly match to ATA tag, so SAS and SATA have different code path for ata tags. Originally we use port->scsi_host (98bd4be1) to destinguish SAS controller, but libsas set ->scsi_host too, so we can't use it for the destinguish, we add a new flag for this purpose. Without this patch, the following oops can happen because scsi-mq uses a host-wide tag map shared among all devices with some integer tag values >= ATA_MAX_QUEUE. These unexpectedly high tag values cause __ata_qc_from_tag() to return NULL, which is then dereferenced in ata_qc_new_init(). BUG: unable to handle kernel NULL pointer dereference at 0000000000000058 IP: [] ata_qc_new_init+0x3e/0x120 PGD 32adf0067 PUD 32adf1067 PMD 0 Oops: 0002 [#1] SMP DEBUG_PAGEALLOC Modules linked in: iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi igb i2c_algo_bit ptp pps_core pm80xx libsas scsi_transport_sas sg coretemp eeprom w83795 i2c_i801 CPU: 4 PID: 1450 Comm: cydiskbench Not tainted 4.0.0-rc3 #1 Hardware name: Supermicro X8DTH-i/6/iF/6F/X8DTH, BIOS 2.1b 05/04/12 task: ffff8800ba86d500 ti: ffff88032a064000 task.ti: ffff88032a064000 RIP: 0010:[] [] ata_qc_new_init+0x3e/0x120 RSP: 0018:ffff88032a067858 EFLAGS: 00010046 RAX: 0000000000000000 RBX: ffff8800ba0d2230 RCX: 000000000000002a RDX: ffffffff80505ae0 RSI: 0000000000000020 RDI: ffff8800ba0d2230 RBP: ffff88032a067868 R08: 0000000000000201 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8800ba0d0000 R13: ffff8800ba0d2230 R14: ffffffff80505ae0 R15: ffff8800ba0d0000 FS: 0000000041223950(0063) GS:ffff88033e480000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000058 CR3: 000000032a0a3000 CR4: 00000000000006e0 Stack: ffff880329eee758 ffff880329eee758 ffff88032a0678a8 ffffffff80502dad ffff8800ba167978 ffff880329eee758 ffff88032bf9c520 ffff8800ba167978 ffff88032bf9c520 ffff88032bf9a290 ffff88032a0678b8 ffffffff80506909 Call Trace: [] ata_scsi_translate+0x3d/0x1b0 [] ata_sas_queuecmd+0x149/0x2a0 [] sas_queuecommand+0xa0/0x1f0 [libsas] [] scsi_dispatch_cmd+0xd4/0x1a0 [] scsi_queue_rq+0x66f/0x7f0 [] __blk_mq_run_hw_queue+0x208/0x3f0 [] blk_mq_run_hw_queue+0x88/0xc0 [] blk_mq_insert_request+0xc4/0x130 [] blk_execute_rq_nowait+0x73/0x160 [] sg_common_write+0x3da/0x720 [sg] [] sg_new_write+0x250/0x360 [sg] [] sg_write+0x13b/0x450 [sg] [] vfs_write+0xd1/0x1b0 [] SyS_write+0x54/0xc0 [] system_call_fastpath+0x12/0x17 tj: updated description. Fixes: 12cb5ce101ab ("libata: use blk taging") Reported-and-tested-by: Tony Battersby Signed-off-by: Shaohua Li Signed-off-by: Tejun Heo --- drivers/ata/libata-core.c | 4 ++-- drivers/scsi/ipr.c | 3 ++- drivers/scsi/libsas/sas_ata.c | 3 ++- include/linux/libata.h | 1 + 4 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 4c35f0822d06..ef150ebb4c30 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4737,7 +4737,7 @@ struct ata_queued_cmd *ata_qc_new_init(struct ata_device *dev, int tag) return NULL; /* libsas case */ - if (!ap->scsi_host) { + if (ap->flags & ATA_FLAG_SAS_HOST) { tag = ata_sas_allocate_tag(ap); if (tag < 0) return NULL; @@ -4776,7 +4776,7 @@ void ata_qc_free(struct ata_queued_cmd *qc) tag = qc->tag; if (likely(ata_tag_valid(tag))) { qc->tag = ATA_TAG_POISON; - if (!ap->scsi_host) + if (ap->flags & ATA_FLAG_SAS_HOST) ata_sas_free_tag(tag, ap); } } diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index 9219953ee949..d9afc51af7d3 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -6815,7 +6815,8 @@ static struct ata_port_operations ipr_sata_ops = { }; static struct ata_port_info sata_port_info = { - .flags = ATA_FLAG_SATA | ATA_FLAG_PIO_DMA, + .flags = ATA_FLAG_SATA | ATA_FLAG_PIO_DMA | + ATA_FLAG_SAS_HOST, .pio_mask = ATA_PIO4_ONLY, .mwdma_mask = ATA_MWDMA2, .udma_mask = ATA_UDMA6, diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c index 932d9cc98d2f..9c706d8c1441 100644 --- a/drivers/scsi/libsas/sas_ata.c +++ b/drivers/scsi/libsas/sas_ata.c @@ -547,7 +547,8 @@ static struct ata_port_operations sas_sata_ops = { }; static struct ata_port_info sata_port_info = { - .flags = ATA_FLAG_SATA | ATA_FLAG_PIO_DMA | ATA_FLAG_NCQ, + .flags = ATA_FLAG_SATA | ATA_FLAG_PIO_DMA | ATA_FLAG_NCQ | + ATA_FLAG_SAS_HOST, .pio_mask = ATA_PIO4, .mwdma_mask = ATA_MWDMA2, .udma_mask = ATA_UDMA6, diff --git a/include/linux/libata.h b/include/linux/libata.h index fc03efa64ffe..6b08cc106c21 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -232,6 +232,7 @@ enum { * led */ ATA_FLAG_NO_DIPM = (1 << 23), /* host not happy with DIPM */ ATA_FLAG_LOWTAG = (1 << 24), /* host wants lowest available tag */ + ATA_FLAG_SAS_HOST = (1 << 25), /* SAS host */ /* bits 24:31 of ap->flags are reserved for LLD specific flags */ -- cgit v1.2.3 From 074c238177a75f5e79af3b2cb6a84e54823ef950 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 25 Mar 2015 15:55:42 -0700 Subject: mm: numa: slow PTE scan rate if migration failures occur Dave Chinner reported the following on https://lkml.org/lkml/2015/3/1/226 Across the board the 4.0-rc1 numbers are much slower, and the degradation is far worse when using the large memory footprint configs. Perf points straight at the cause - this is from 4.0-rc1 on the "-o bhash=101073" config: - 56.07% 56.07% [kernel] [k] default_send_IPI_mask_sequence_phys - default_send_IPI_mask_sequence_phys - 99.99% physflat_send_IPI_mask - 99.37% native_send_call_func_ipi smp_call_function_many - native_flush_tlb_others - 99.85% flush_tlb_page ptep_clear_flush try_to_unmap_one rmap_walk try_to_unmap migrate_pages migrate_misplaced_page - handle_mm_fault - 99.73% __do_page_fault trace_do_page_fault do_async_page_fault + async_page_fault 0.63% native_send_call_func_single_ipi generic_exec_single smp_call_function_single This is showing excessive migration activity even though excessive migrations are meant to get throttled. Normally, the scan rate is tuned on a per-task basis depending on the locality of faults. However, if migrations fail for any reason then the PTE scanner may scan faster if the faults continue to be remote. This means there is higher system CPU overhead and fault trapping at exactly the time we know that migrations cannot happen. This patch tracks when migration failures occur and slows the PTE scanner. Signed-off-by: Mel Gorman Reported-by: Dave Chinner Tested-by: Dave Chinner Cc: Ingo Molnar Cc: Aneesh Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 9 +++++---- kernel/sched/fair.c | 8 ++++++-- mm/huge_memory.c | 3 ++- mm/memory.c | 3 ++- 4 files changed, 15 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6d77432e14ff..a419b65770d6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1625,11 +1625,11 @@ struct task_struct { /* * numa_faults_locality tracks if faults recorded during the last - * scan window were remote/local. The task scan period is adapted - * based on the locality of the faults with different weights - * depending on whether they were shared or private faults + * scan window were remote/local or failed to migrate. The task scan + * period is adapted based on the locality of the faults with different + * weights depending on whether they were shared or private faults */ - unsigned long numa_faults_locality[2]; + unsigned long numa_faults_locality[3]; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ @@ -1719,6 +1719,7 @@ struct task_struct { #define TNF_NO_GROUP 0x02 #define TNF_SHARED 0x04 #define TNF_FAULT_LOCAL 0x08 +#define TNF_MIGRATE_FAIL 0x10 #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, int flags); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7ce18f3c097a..bcfe32088b37 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1609,9 +1609,11 @@ static void update_task_scan_period(struct task_struct *p, /* * If there were no record hinting faults then either the task is * completely idle or all activity is areas that are not of interest - * to automatic numa balancing. Scan slower + * to automatic numa balancing. Related to that, if there were failed + * migration then it implies we are migrating too quickly or the local + * node is overloaded. In either case, scan slower */ - if (local + shared == 0) { + if (local + shared == 0 || p->numa_faults_locality[2]) { p->numa_scan_period = min(p->numa_scan_period_max, p->numa_scan_period << 1); @@ -2080,6 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; + if (flags & TNF_MIGRATE_FAIL) + p->numa_faults_locality[2] += pages; p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0a42d1521aa4..51b3e7c64622 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1350,7 +1350,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; - } + } else + flags |= TNF_MIGRATE_FAIL; goto out; clear_pmdnuma: diff --git a/mm/memory.c b/mm/memory.c index d20e12da3a3c..97839f5c8c30 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3103,7 +3103,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (migrated) { page_nid = target_nid; flags |= TNF_MIGRATED; - } + } else + flags |= TNF_MIGRATE_FAIL; out: if (page_nid != -1) -- cgit v1.2.3 From 1e9e39f4a29857a396ac7b669d109f697f66695e Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 26 Feb 2015 19:34:37 +0000 Subject: usbnet: Fix tx_packets stat for FLAG_MULTI_FRAME drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the usbnet core does not update the tx_packets statistic for drivers with FLAG_MULTI_PACKET and there is no hook in the TX completion path where they could do this. cdc_ncm and dependent drivers are bumping tx_packets stat on the transmit path while asix and sr9800 aren't updating it at all. Add a packet count in struct skb_data so these drivers can fill it in, initialise it to 1 for other drivers, and add the packet count to the tx_packets statistic on completion. Signed-off-by: Ben Hutchings Tested-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/asix_common.c | 2 ++ drivers/net/usb/cdc_ncm.c | 3 ++- drivers/net/usb/sr9800.c | 1 + drivers/net/usb/usbnet.c | 5 +++-- include/linux/usb/usbnet.h | 12 ++++++++++++ 5 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/asix_common.c b/drivers/net/usb/asix_common.c index 5c55f11572ba..724a9b50df7a 100644 --- a/drivers/net/usb/asix_common.c +++ b/drivers/net/usb/asix_common.c @@ -188,6 +188,8 @@ struct sk_buff *asix_tx_fixup(struct usbnet *dev, struct sk_buff *skb, memcpy(skb_tail_pointer(skb), &padbytes, sizeof(padbytes)); skb_put(skb, sizeof(padbytes)); } + + usbnet_set_skb_tx_stats(skb, 1); return skb; } diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 80a844e0ae03..70cbea551139 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -1172,7 +1172,6 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) /* return skb */ ctx->tx_curr_skb = NULL; - dev->net->stats.tx_packets += ctx->tx_curr_frame_num; /* keep private stats: framing overhead and number of NTBs */ ctx->tx_overhead += skb_out->len - ctx->tx_curr_frame_payload; @@ -1184,6 +1183,8 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) */ dev->net->stats.tx_bytes -= skb_out->len - ctx->tx_curr_frame_payload; + usbnet_set_skb_tx_stats(skb_out, n); + return skb_out; exit_no_skb: diff --git a/drivers/net/usb/sr9800.c b/drivers/net/usb/sr9800.c index b94a0fbb8b3b..7650cdc8fe6b 100644 --- a/drivers/net/usb/sr9800.c +++ b/drivers/net/usb/sr9800.c @@ -144,6 +144,7 @@ static struct sk_buff *sr_tx_fixup(struct usbnet *dev, struct sk_buff *skb, skb_put(skb, sizeof(padbytes)); } + usbnet_set_skb_tx_stats(skb, 1); return skb; } diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 449835f4331e..0f3ff285f6a1 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1188,8 +1188,7 @@ static void tx_complete (struct urb *urb) struct usbnet *dev = entry->dev; if (urb->status == 0) { - if (!(dev->driver_info->flags & FLAG_MULTI_PACKET)) - dev->net->stats.tx_packets++; + dev->net->stats.tx_packets += entry->packets; dev->net->stats.tx_bytes += entry->length; } else { dev->net->stats.tx_errors++; @@ -1348,6 +1347,8 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb, urb->transfer_flags |= URB_ZERO_PACKET; } entry->length = urb->transfer_buffer_length = length; + if (!(info->flags & FLAG_MULTI_PACKET)) + usbnet_set_skb_tx_stats(skb, 1); spin_lock_irqsave(&dev->txq.lock, flags); retval = usb_autopm_get_interface_async(dev->intf); diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index d9a4905e01d0..ff3fb2bd0e90 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -228,8 +228,20 @@ struct skb_data { /* skb->cb is one of these */ struct usbnet *dev; enum skb_state state; size_t length; + unsigned long packets; }; +/* Drivers that set FLAG_MULTI_PACKET must call this in their + * tx_fixup method before returning an skb. + */ +static inline void +usbnet_set_skb_tx_stats(struct sk_buff *skb, unsigned long packets) +{ + struct skb_data *entry = (struct skb_data *) skb->cb; + + entry->packets = packets; +} + extern int usbnet_open(struct net_device *net); extern int usbnet_stop(struct net_device *net); extern netdev_tx_t usbnet_start_xmit(struct sk_buff *skb, -- cgit v1.2.3 From 7a1e890e2168e33fb62d84528e996b8b4b478fea Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 25 Mar 2015 21:41:33 +0100 Subject: usbnet: Fix tx_bytes statistic running backward in cdc_ncm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cdc_ncm disagrees with usbnet about how much framing overhead should be counted in the tx_bytes statistics, and tries 'fix' this by decrementing tx_bytes on the transmit path. But statistics must never be decremented except due to roll-over; this will thoroughly confuse user-space. Also, tx_bytes is only incremented by usbnet in the completion path. Fix this by requiring drivers that set FLAG_MULTI_FRAME to set a tx_bytes delta along with the tx_packets count. Fixes: beeecd42c3b4 ("net: cdc_ncm/cdc_mbim: adding NCM protocol statistics") Signed-off-by: Ben Hutchings Signed-off-by: Bjørn Mork --- drivers/net/usb/asix_common.c | 2 +- drivers/net/usb/cdc_ncm.c | 7 +++---- drivers/net/usb/sr9800.c | 2 +- drivers/net/usb/usbnet.c | 16 +++++++++++++--- include/linux/usb/usbnet.h | 6 ++++-- 5 files changed, 22 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/usb/asix_common.c b/drivers/net/usb/asix_common.c index 724a9b50df7a..75d6f26729a3 100644 --- a/drivers/net/usb/asix_common.c +++ b/drivers/net/usb/asix_common.c @@ -189,7 +189,7 @@ struct sk_buff *asix_tx_fixup(struct usbnet *dev, struct sk_buff *skb, skb_put(skb, sizeof(padbytes)); } - usbnet_set_skb_tx_stats(skb, 1); + usbnet_set_skb_tx_stats(skb, 1, 0); return skb; } diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 70cbea551139..c3e4da9e79ca 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -1177,13 +1177,12 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign) ctx->tx_overhead += skb_out->len - ctx->tx_curr_frame_payload; ctx->tx_ntbs++; - /* usbnet has already counted all the framing overhead. + /* usbnet will count all the framing overhead by default. * Adjust the stats so that the tx_bytes counter show real * payload data instead. */ - dev->net->stats.tx_bytes -= skb_out->len - ctx->tx_curr_frame_payload; - - usbnet_set_skb_tx_stats(skb_out, n); + usbnet_set_skb_tx_stats(skb_out, n, + ctx->tx_curr_frame_payload - skb_out->len); return skb_out; diff --git a/drivers/net/usb/sr9800.c b/drivers/net/usb/sr9800.c index 7650cdc8fe6b..953de13267df 100644 --- a/drivers/net/usb/sr9800.c +++ b/drivers/net/usb/sr9800.c @@ -144,7 +144,7 @@ static struct sk_buff *sr_tx_fixup(struct usbnet *dev, struct sk_buff *skb, skb_put(skb, sizeof(padbytes)); } - usbnet_set_skb_tx_stats(skb, 1); + usbnet_set_skb_tx_stats(skb, 1, 0); return skb; } diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 0f3ff285f6a1..777757ae1973 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1346,9 +1346,19 @@ netdev_tx_t usbnet_start_xmit (struct sk_buff *skb, } else urb->transfer_flags |= URB_ZERO_PACKET; } - entry->length = urb->transfer_buffer_length = length; - if (!(info->flags & FLAG_MULTI_PACKET)) - usbnet_set_skb_tx_stats(skb, 1); + urb->transfer_buffer_length = length; + + if (info->flags & FLAG_MULTI_PACKET) { + /* Driver has set number of packets and a length delta. + * Calculate the complete length and ensure that it's + * positive. + */ + entry->length += length; + if (WARN_ON_ONCE(entry->length <= 0)) + entry->length = length; + } else { + usbnet_set_skb_tx_stats(skb, 1, length); + } spin_lock_irqsave(&dev->txq.lock, flags); retval = usb_autopm_get_interface_async(dev->intf); diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index ff3fb2bd0e90..6e0ce8c7b8cb 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -227,7 +227,7 @@ struct skb_data { /* skb->cb is one of these */ struct urb *urb; struct usbnet *dev; enum skb_state state; - size_t length; + long length; unsigned long packets; }; @@ -235,11 +235,13 @@ struct skb_data { /* skb->cb is one of these */ * tx_fixup method before returning an skb. */ static inline void -usbnet_set_skb_tx_stats(struct sk_buff *skb, unsigned long packets) +usbnet_set_skb_tx_stats(struct sk_buff *skb, + unsigned long packets, long bytes_delta) { struct skb_data *entry = (struct skb_data *) skb->cb; entry->packets = packets; + entry->length = bytes_delta; } extern int usbnet_open(struct net_device *net); -- cgit v1.2.3 From 4ad3e3634a6cbe916722c7113c5b488d52c7a3dc Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 27 Mar 2015 14:15:04 +0000 Subject: irqchip: gicv3-its: Fix PROP/PEND and BASE/CBASE confusion The ITS driver sometime mixes up the use of GICR_PROPBASE bitfields for the GICR_PENDBASE register, and GITS_BASER for GICR_CBASE. This does not lead to any observable bug because similar bits are at the same location, but this just make the code even harder to understand... This patch provides the required #defines and fixes the mixup. Signed-off-by: Marc Zyngier Link: https://lkml.kernel.org/r/1427465705-17126-4-git-send-email-marc.zyngier@arm.com Signed-off-by: Jason Cooper --- drivers/irqchip/irq-gic-v3-its.c | 6 +++--- include/linux/irqchip/arm-gic-v3.h | 13 +++++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index fa0c43660c8b..56353f6b5952 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -986,8 +986,8 @@ static void its_cpu_init_lpis(void) /* set PENDBASE */ val = (page_to_phys(pend_page) | - GICR_PROPBASER_InnerShareable | - GICR_PROPBASER_WaWb); + GICR_PENDBASER_InnerShareable | + GICR_PENDBASER_WaWb); writeq_relaxed(val, rbase + GICR_PENDBASER); @@ -1425,7 +1425,7 @@ static int its_probe(struct device_node *node, struct irq_domain *parent) writeq_relaxed(0, its->base + GITS_CWRITER); writel_relaxed(GITS_CTLR_ENABLE, its->base + GITS_CTLR); - if ((tmp ^ baser) & GITS_BASER_SHAREABILITY_MASK) { + if ((tmp ^ baser) & GITS_CBASER_SHAREABILITY_MASK) { pr_info("ITS: using cache flushing for cmd queue\n"); its->flags |= ITS_FLAGS_CMDQ_NEEDS_FLUSHING; } diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 781974afff9f..826a4bd63d4a 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -128,6 +128,19 @@ #define GICR_PROPBASER_RaWaWb (7U << 7) #define GICR_PROPBASER_IDBITS_MASK (0x1f) +#define GICR_PENDBASER_NonShareable (0U << 10) +#define GICR_PENDBASER_InnerShareable (1U << 10) +#define GICR_PENDBASER_OuterShareable (2U << 10) +#define GICR_PENDBASER_SHAREABILITY_MASK (3UL << 10) +#define GICR_PENDBASER_nCnB (0U << 7) +#define GICR_PENDBASER_nC (1U << 7) +#define GICR_PENDBASER_RaWt (2U << 7) +#define GICR_PENDBASER_RaWb (3U << 7) +#define GICR_PENDBASER_WaWt (4U << 7) +#define GICR_PENDBASER_WaWb (5U << 7) +#define GICR_PENDBASER_RaWaWt (6U << 7) +#define GICR_PENDBASER_RaWaWb (7U << 7) + /* * Re-Distributor registers, offsets from SGI_base */ -- cgit v1.2.3 From 241a386c7dbb8b0db400a1f92f2ebe3b10eb661d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 27 Mar 2015 14:15:05 +0000 Subject: irqchip: gicv3-its: Use non-cacheable accesses when no shareability If the ITS or the redistributors report their shareability as zero, then it is important to make sure they will no generate any cacheable traffic, as this is unlikely to produce the expected result. Signed-off-by: Marc Zyngier Link: https://lkml.kernel.org/r/1427465705-17126-5-git-send-email-marc.zyngier@arm.com Signed-off-by: Jason Cooper --- drivers/irqchip/irq-gic-v3-its.c | 47 ++++++++++++++++++++++++++++++++++---- include/linux/irqchip/arm-gic-v3.h | 4 ++++ 2 files changed, 47 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 56353f6b5952..9687f8afebff 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -802,6 +802,7 @@ static int its_alloc_tables(struct its_node *its) int i; int psz = SZ_64K; u64 shr = GITS_BASER_InnerShareable; + u64 cache = GITS_BASER_WaWb; for (i = 0; i < GITS_BASER_NR_REGS; i++) { u64 val = readq_relaxed(its->base + GITS_BASER + i * 8); @@ -848,7 +849,7 @@ retry_baser: val = (virt_to_phys(base) | (type << GITS_BASER_TYPE_SHIFT) | ((entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT) | - GITS_BASER_WaWb | + cache | shr | GITS_BASER_VALID); @@ -874,9 +875,12 @@ retry_baser: * Shareability didn't stick. Just use * whatever the read reported, which is likely * to be the only thing this redistributor - * supports. + * supports. If that's zero, make it + * non-cacheable as well. */ shr = tmp & GITS_BASER_SHAREABILITY_MASK; + if (!shr) + cache = GITS_BASER_nC; goto retry_baser; } @@ -980,6 +984,17 @@ static void its_cpu_init_lpis(void) tmp = readq_relaxed(rbase + GICR_PROPBASER); if ((tmp ^ val) & GICR_PROPBASER_SHAREABILITY_MASK) { + if (!(tmp & GICR_PROPBASER_SHAREABILITY_MASK)) { + /* + * The HW reports non-shareable, we must + * remove the cacheability attributes as + * well. + */ + val &= ~(GICR_PROPBASER_SHAREABILITY_MASK | + GICR_PROPBASER_CACHEABILITY_MASK); + val |= GICR_PROPBASER_nC; + writeq_relaxed(val, rbase + GICR_PROPBASER); + } pr_info_once("GIC: using cache flushing for LPI property table\n"); gic_rdists->flags |= RDIST_FLAGS_PROPBASE_NEEDS_FLUSHING; } @@ -990,6 +1005,18 @@ static void its_cpu_init_lpis(void) GICR_PENDBASER_WaWb); writeq_relaxed(val, rbase + GICR_PENDBASER); + tmp = readq_relaxed(rbase + GICR_PENDBASER); + + if (!(tmp & GICR_PENDBASER_SHAREABILITY_MASK)) { + /* + * The HW reports non-shareable, we must remove the + * cacheability attributes as well. + */ + val &= ~(GICR_PENDBASER_SHAREABILITY_MASK | + GICR_PENDBASER_CACHEABILITY_MASK); + val |= GICR_PENDBASER_nC; + writeq_relaxed(val, rbase + GICR_PENDBASER); + } /* Enable LPIs */ val = readl_relaxed(rbase + GICR_CTLR); @@ -1422,14 +1449,26 @@ static int its_probe(struct device_node *node, struct irq_domain *parent) writeq_relaxed(baser, its->base + GITS_CBASER); tmp = readq_relaxed(its->base + GITS_CBASER); - writeq_relaxed(0, its->base + GITS_CWRITER); - writel_relaxed(GITS_CTLR_ENABLE, its->base + GITS_CTLR); if ((tmp ^ baser) & GITS_CBASER_SHAREABILITY_MASK) { + if (!(tmp & GITS_CBASER_SHAREABILITY_MASK)) { + /* + * The HW reports non-shareable, we must + * remove the cacheability attributes as + * well. + */ + baser &= ~(GITS_CBASER_SHAREABILITY_MASK | + GITS_CBASER_CACHEABILITY_MASK); + baser |= GITS_CBASER_nC; + writeq_relaxed(baser, its->base + GITS_CBASER); + } pr_info("ITS: using cache flushing for cmd queue\n"); its->flags |= ITS_FLAGS_CMDQ_NEEDS_FLUSHING; } + writeq_relaxed(0, its->base + GITS_CWRITER); + writel_relaxed(GITS_CTLR_ENABLE, its->base + GITS_CTLR); + if (of_property_read_bool(its->msi_chip.of_node, "msi-controller")) { its->domain = irq_domain_add_tree(NULL, &its_domain_ops, its); if (!its->domain) { diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 826a4bd63d4a..ffbc034c8810 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -126,6 +126,7 @@ #define GICR_PROPBASER_WaWb (5U << 7) #define GICR_PROPBASER_RaWaWt (6U << 7) #define GICR_PROPBASER_RaWaWb (7U << 7) +#define GICR_PROPBASER_CACHEABILITY_MASK (7U << 7) #define GICR_PROPBASER_IDBITS_MASK (0x1f) #define GICR_PENDBASER_NonShareable (0U << 10) @@ -140,6 +141,7 @@ #define GICR_PENDBASER_WaWb (5U << 7) #define GICR_PENDBASER_RaWaWt (6U << 7) #define GICR_PENDBASER_RaWaWb (7U << 7) +#define GICR_PENDBASER_CACHEABILITY_MASK (7U << 7) /* * Re-Distributor registers, offsets from SGI_base @@ -195,6 +197,7 @@ #define GITS_CBASER_WaWb (5UL << 59) #define GITS_CBASER_RaWaWt (6UL << 59) #define GITS_CBASER_RaWaWb (7UL << 59) +#define GITS_CBASER_CACHEABILITY_MASK (7UL << 59) #define GITS_CBASER_NonShareable (0UL << 10) #define GITS_CBASER_InnerShareable (1UL << 10) #define GITS_CBASER_OuterShareable (2UL << 10) @@ -211,6 +214,7 @@ #define GITS_BASER_WaWb (5UL << 59) #define GITS_BASER_RaWaWt (6UL << 59) #define GITS_BASER_RaWaWb (7UL << 59) +#define GITS_BASER_CACHEABILITY_MASK (7UL << 59) #define GITS_BASER_TYPE_SHIFT (56) #define GITS_BASER_TYPE(r) (((r) >> GITS_BASER_TYPE_SHIFT) & 7) #define GITS_BASER_ENTRY_SIZE_SHIFT (48) -- cgit v1.2.3 From e9637415a92cf25ad800b7fdeddcd30cce7b44ab Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 30 Mar 2015 13:39:09 -0400 Subject: block: fix blk_stack_limits() regression due to lcm() change Linux 3.19 commit 69c953c ("lib/lcm.c: lcm(n,0)=lcm(0,n) is 0, not n") caused blk_stack_limits() to not properly stack queue_limits for stacked devices (e.g. DM). Fix this regression by establishing lcm_not_zero() and switching blk_stack_limits() over to using it. DM uses blk_set_stacking_limits() to establish the initial top-level queue_limits that are then built up based on underlying devices' limits using blk_stack_limits(). In the case of optimal_io_size (io_opt) blk_set_stacking_limits() establishes a default value of 0. With commit 69c953c, lcm(0, n) is no longer n, which compromises proper stacking of the underlying devices' io_opt. Test: $ modprobe scsi_debug dev_size_mb=10 num_tgts=1 opt_blks=1536 $ cat /sys/block/sde/queue/optimal_io_size 786432 $ dmsetup create node --table "0 100 linear /dev/sde 0" Before this fix: $ cat /sys/block/dm-5/queue/optimal_io_size 0 After this fix: $ cat /sys/block/dm-5/queue/optimal_io_size 786432 Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 3.19+ Acked-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 6 +++--- include/linux/lcm.h | 1 + lib/lcm.c | 11 +++++++++++ 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/block/blk-settings.c b/block/blk-settings.c index 6ed2cbe5e8c9..12600bfffca9 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -585,7 +585,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->physical_block_size); t->io_min = max(t->io_min, b->io_min); - t->io_opt = lcm(t->io_opt, b->io_opt); + t->io_opt = lcm_not_zero(t->io_opt, b->io_opt); t->cluster &= b->cluster; t->discard_zeroes_data &= b->discard_zeroes_data; @@ -616,7 +616,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->raid_partial_stripes_expensive); /* Find lowest common alignment_offset */ - t->alignment_offset = lcm(t->alignment_offset, alignment) + t->alignment_offset = lcm_not_zero(t->alignment_offset, alignment) % max(t->physical_block_size, t->io_min); /* Verify that new alignment_offset is on a logical block boundary */ @@ -643,7 +643,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, b->max_discard_sectors); t->discard_granularity = max(t->discard_granularity, b->discard_granularity); - t->discard_alignment = lcm(t->discard_alignment, alignment) % + t->discard_alignment = lcm_not_zero(t->discard_alignment, alignment) % t->discard_granularity; } diff --git a/include/linux/lcm.h b/include/linux/lcm.h index 7bf01d779b45..1ce79a7f1daa 100644 --- a/include/linux/lcm.h +++ b/include/linux/lcm.h @@ -4,5 +4,6 @@ #include unsigned long lcm(unsigned long a, unsigned long b) __attribute_const__; +unsigned long lcm_not_zero(unsigned long a, unsigned long b) __attribute_const__; #endif /* _LCM_H */ diff --git a/lib/lcm.c b/lib/lcm.c index e97dbd51e756..03d7fcb420b5 100644 --- a/lib/lcm.c +++ b/lib/lcm.c @@ -12,3 +12,14 @@ unsigned long lcm(unsigned long a, unsigned long b) return 0; } EXPORT_SYMBOL_GPL(lcm); + +unsigned long lcm_not_zero(unsigned long a, unsigned long b) +{ + unsigned long l = lcm(a, b); + + if (l) + return l; + + return (b ? : a); +} +EXPORT_SYMBOL_GPL(lcm_not_zero); -- cgit v1.2.3 From f9c72d10d6fbf949558cd088389a42213ed7b12d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 31 Mar 2015 12:03:28 -0400 Subject: sunrpc: make debugfs file creation failure non-fatal We currently have a problem that SELinux policy is being enforced when creating debugfs files. If a debugfs file is created as a side effect of doing some syscall, then that creation can fail if the SELinux policy for that process prevents it. This seems wrong. We don't do that for files under /proc, for instance, so Bruce has proposed a patch to fix that. While discussing that patch however, Greg K.H. stated: "No kernel code should care / fail if a debugfs function fails, so please fix up the sunrpc code first." This patch converts all of the sunrpc debugfs setup code to be void return functins, and the callers to not look for errors from those functions. This should allow rpc_clnt and rpc_xprt creation to work, even if the kernel fails to create debugfs files for some reason. Symptoms were failing krb5 mounts on systems using gss-proxy and selinux. Fixes: 388f0c776781 "sunrpc: add a debugfs rpc_xprt directory..." Cc: stable@vger.kernel.org Signed-off-by: Jeff Layton Acked-by: Greg Kroah-Hartman Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/debug.h | 18 +++++++-------- net/sunrpc/clnt.c | 4 +--- net/sunrpc/debugfs.c | 52 ++++++++++++++++++++++++-------------------- net/sunrpc/sunrpc_syms.c | 7 +----- net/sunrpc/xprt.c | 7 +----- 5 files changed, 41 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index c57d8ea0716c..59a7889e15db 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -60,17 +60,17 @@ struct rpc_xprt; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) void rpc_register_sysctl(void); void rpc_unregister_sysctl(void); -int sunrpc_debugfs_init(void); +void sunrpc_debugfs_init(void); void sunrpc_debugfs_exit(void); -int rpc_clnt_debugfs_register(struct rpc_clnt *); +void rpc_clnt_debugfs_register(struct rpc_clnt *); void rpc_clnt_debugfs_unregister(struct rpc_clnt *); -int rpc_xprt_debugfs_register(struct rpc_xprt *); +void rpc_xprt_debugfs_register(struct rpc_xprt *); void rpc_xprt_debugfs_unregister(struct rpc_xprt *); #else -static inline int +static inline void sunrpc_debugfs_init(void) { - return 0; + return; } static inline void @@ -79,10 +79,10 @@ sunrpc_debugfs_exit(void) return; } -static inline int +static inline void rpc_clnt_debugfs_register(struct rpc_clnt *clnt) { - return 0; + return; } static inline void @@ -91,10 +91,10 @@ rpc_clnt_debugfs_unregister(struct rpc_clnt *clnt) return; } -static inline int +static inline void rpc_xprt_debugfs_register(struct rpc_xprt *xprt) { - return 0; + return; } static inline void diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 612aa73bbc60..e6ce1517367f 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -303,9 +303,7 @@ static int rpc_client_register(struct rpc_clnt *clnt, struct super_block *pipefs_sb; int err; - err = rpc_clnt_debugfs_register(clnt); - if (err) - return err; + rpc_clnt_debugfs_register(clnt); pipefs_sb = rpc_get_sb_net(net); if (pipefs_sb) { diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index e811f390f9f6..82962f7e6e88 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -129,48 +129,52 @@ static const struct file_operations tasks_fops = { .release = tasks_release, }; -int +void rpc_clnt_debugfs_register(struct rpc_clnt *clnt) { - int len, err; + int len; char name[24]; /* enough for "../../rpc_xprt/ + 8 hex digits + NULL */ + struct rpc_xprt *xprt; /* Already registered? */ - if (clnt->cl_debugfs) - return 0; + if (clnt->cl_debugfs || !rpc_clnt_dir) + return; len = snprintf(name, sizeof(name), "%x", clnt->cl_clid); if (len >= sizeof(name)) - return -EINVAL; + return; /* make the per-client dir */ clnt->cl_debugfs = debugfs_create_dir(name, rpc_clnt_dir); if (!clnt->cl_debugfs) - return -ENOMEM; + return; /* make tasks file */ - err = -ENOMEM; if (!debugfs_create_file("tasks", S_IFREG | S_IRUSR, clnt->cl_debugfs, clnt, &tasks_fops)) goto out_err; - err = -EINVAL; rcu_read_lock(); + xprt = rcu_dereference(clnt->cl_xprt); + /* no "debugfs" dentry? Don't bother with the symlink. */ + if (!xprt->debugfs) { + rcu_read_unlock(); + return; + } len = snprintf(name, sizeof(name), "../../rpc_xprt/%s", - rcu_dereference(clnt->cl_xprt)->debugfs->d_name.name); + xprt->debugfs->d_name.name); rcu_read_unlock(); + if (len >= sizeof(name)) goto out_err; - err = -ENOMEM; if (!debugfs_create_symlink("xprt", clnt->cl_debugfs, name)) goto out_err; - return 0; + return; out_err: debugfs_remove_recursive(clnt->cl_debugfs); clnt->cl_debugfs = NULL; - return err; } void @@ -226,33 +230,33 @@ static const struct file_operations xprt_info_fops = { .release = xprt_info_release, }; -int +void rpc_xprt_debugfs_register(struct rpc_xprt *xprt) { int len, id; static atomic_t cur_id; char name[9]; /* 8 hex digits + NULL term */ + if (!rpc_xprt_dir) + return; + id = (unsigned int)atomic_inc_return(&cur_id); len = snprintf(name, sizeof(name), "%x", id); if (len >= sizeof(name)) - return -EINVAL; + return; /* make the per-client dir */ xprt->debugfs = debugfs_create_dir(name, rpc_xprt_dir); if (!xprt->debugfs) - return -ENOMEM; + return; /* make tasks file */ if (!debugfs_create_file("info", S_IFREG | S_IRUSR, xprt->debugfs, xprt, &xprt_info_fops)) { debugfs_remove_recursive(xprt->debugfs); xprt->debugfs = NULL; - return -ENOMEM; } - - return 0; } void @@ -266,14 +270,17 @@ void __exit sunrpc_debugfs_exit(void) { debugfs_remove_recursive(topdir); + topdir = NULL; + rpc_clnt_dir = NULL; + rpc_xprt_dir = NULL; } -int __init +void __init sunrpc_debugfs_init(void) { topdir = debugfs_create_dir("sunrpc", NULL); if (!topdir) - goto out; + return; rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir); if (!rpc_clnt_dir) @@ -283,10 +290,9 @@ sunrpc_debugfs_init(void) if (!rpc_xprt_dir) goto out_remove; - return 0; + return; out_remove: debugfs_remove_recursive(topdir); topdir = NULL; -out: - return -ENOMEM; + rpc_clnt_dir = NULL; } diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index e37fbed87956..ee5d3d253102 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -98,10 +98,7 @@ init_sunrpc(void) if (err) goto out4; - err = sunrpc_debugfs_init(); - if (err) - goto out5; - + sunrpc_debugfs_init(); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) rpc_register_sysctl(); #endif @@ -109,8 +106,6 @@ init_sunrpc(void) init_socket_xprt(); /* clnt sock transport */ return 0; -out5: - unregister_rpc_pipefs(); out4: unregister_pernet_subsys(&sunrpc_net_ops); out3: diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index e3015aede0d9..9949722d99ce 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1331,7 +1331,6 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) */ struct rpc_xprt *xprt_create_transport(struct xprt_create *args) { - int err; struct rpc_xprt *xprt; struct xprt_class *t; @@ -1372,11 +1371,7 @@ found: return ERR_PTR(-ENOMEM); } - err = rpc_xprt_debugfs_register(xprt); - if (err) { - xprt_destroy(xprt); - return ERR_PTR(err); - } + rpc_xprt_debugfs_register(xprt); dprintk("RPC: created transport %p with %u slots\n", xprt, xprt->max_reqs); -- cgit v1.2.3 From f60e5990d9c1424af9dbca60a23ba2a1c7c1ce90 Mon Sep 17 00:00:00 2001 From: "hannes@stressinduktion.org" Date: Wed, 1 Apr 2015 17:07:44 +0200 Subject: ipv6: protect skb->sk accesses from recursive dereference inside the stack We should not consult skb->sk for output decisions in xmit recursion levels > 0 in the stack. Otherwise local socket settings could influence the result of e.g. tunnel encapsulation process. ipv6 does not conform with this in three places: 1) ip6_fragment: we do consult ipv6_npinfo for frag_size 2) sk_mc_loop in ipv6 uses skb->sk and checks if we should loop the packet back to the local socket 3) ip6_skb_dst_mtu could query the settings from the user socket and force a wrong MTU Furthermore: In sk_mc_loop we could potentially land in WARN_ON(1) if we use a PF_PACKET socket ontop of an IPv6-backed vxlan device. Reuse xmit_recursion as we are currently only interested in protecting tunnel devices. Cc: Jiri Pirko Signed-off-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++++ include/net/ip.h | 16 ---------------- include/net/ip6_route.h | 3 ++- include/net/sock.h | 2 ++ net/core/dev.c | 4 +++- net/core/sock.c | 19 +++++++++++++++++++ net/ipv6/ip6_output.c | 3 ++- 7 files changed, 34 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dcf6ec27739b..278738873703 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2185,6 +2185,12 @@ void netdev_freemem(struct net_device *dev); void synchronize_net(void); int init_dummy_netdev(struct net_device *dev); +DECLARE_PER_CPU(int, xmit_recursion); +static inline int dev_recursion_level(void) +{ + return this_cpu_read(xmit_recursion); +} + struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); diff --git a/include/net/ip.h b/include/net/ip.h index 025c61c0dffb..6cc1eafb153a 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -453,22 +453,6 @@ static __inline__ void inet_reset_saddr(struct sock *sk) #endif -static inline int sk_mc_loop(struct sock *sk) -{ - if (!sk) - return 1; - switch (sk->sk_family) { - case AF_INET: - return inet_sk(sk)->mc_loop; -#if IS_ENABLED(CONFIG_IPV6) - case AF_INET6: - return inet6_sk(sk)->mc_loop; -#endif - } - WARN_ON(1); - return 1; -} - bool ip_call_ra_chain(struct sk_buff *skb); /* diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 1d09b46c1e48..eda131d179d9 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -174,7 +174,8 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)); static inline int ip6_skb_dst_mtu(struct sk_buff *skb) { - struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; + struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? + inet6_sk(skb->sk) : NULL; return (np && np->pmtudisc >= IPV6_PMTUDISC_PROBE) ? skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb)); diff --git a/include/net/sock.h b/include/net/sock.h index ab186b1d31ff..e4079c28e6b8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1762,6 +1762,8 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); +bool sk_mc_loop(struct sock *sk); + static inline bool sk_can_gso(const struct sock *sk) { return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type); diff --git a/net/core/dev.c b/net/core/dev.c index 962ee9d71964..45109b70664e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2848,7 +2848,9 @@ static void skb_update_prio(struct sk_buff *skb) #define skb_update_prio(skb) #endif -static DEFINE_PER_CPU(int, xmit_recursion); +DEFINE_PER_CPU(int, xmit_recursion); +EXPORT_SYMBOL(xmit_recursion); + #define RECURSION_LIMIT 10 /** diff --git a/net/core/sock.c b/net/core/sock.c index 78e89eb7eb70..71e3e5f1eaa0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -653,6 +653,25 @@ static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) sock_reset_flag(sk, bit); } +bool sk_mc_loop(struct sock *sk) +{ + if (dev_recursion_level()) + return false; + if (!sk) + return true; + switch (sk->sk_family) { + case AF_INET: + return inet_sk(sk)->mc_loop; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + return inet6_sk(sk)->mc_loop; +#endif + } + WARN_ON(1); + return true; +} +EXPORT_SYMBOL(sk_mc_loop); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 7e80b61b51ff..36cf0ab685a0 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -542,7 +542,8 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct sk_buff *frag; struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); - struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL; + struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? + inet6_sk(skb->sk) : NULL; struct ipv6hdr *tmp_hdr; struct frag_hdr *fh; unsigned int mtu, hlen, left, len; -- cgit v1.2.3