diff options
58 files changed, 2045 insertions, 1660 deletions
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 638bf17ff869..821de56d1580 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -24,7 +24,6 @@ CONTENTS: 2.1 Basic Usage 2.2 Attaching processes 2.3 Mounting hierarchies by name - 2.4 Notification API 3. Kernel API 3.1 Overview 3.2 Synchronization @@ -472,25 +471,6 @@ you give a subsystem a name. The name of the subsystem appears as part of the hierarchy description in /proc/mounts and /proc/<pid>/cgroups. -2.4 Notification API --------------------- - -There is mechanism which allows to get notifications about changing -status of a cgroup. - -To register a new notification handler you need to: - - create a file descriptor for event notification using eventfd(2); - - open a control file to be monitored (e.g. memory.usage_in_bytes); - - write "<event_fd> <control_fd> <args>" to cgroup.event_control. - Interpretation of args is defined by control file implementation; - -eventfd will be woken up by control file implementation or when the -cgroup is removed. - -To unregister a notification handler just close eventfd. - -NOTE: Support of notifications should be implemented for the control -file. See documentation for the subsystem. 3. Kernel API ============= diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index e2bc132608fd..2622115276aa 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -577,7 +577,7 @@ Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable" per-node page counts including "hierarchical_<counter>" which sums up all hierarchical children's values in addition to the memcg's own value. -The ouput format of memory.numa_stat is: +The output format of memory.numa_stat is: total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ... file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ... @@ -670,7 +670,7 @@ page tables. 8.1 Interface -This feature is disabled by default. It can be enabledi (and disabled again) by +This feature is disabled by default. It can be enabled (and disabled again) by writing to memory.move_charge_at_immigrate of the destination cgroup. If you want to enable it: diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt index c4d99ed0b418..52e1da16a309 100644 --- a/Documentation/cgroups/resource_counter.txt +++ b/Documentation/cgroups/resource_counter.txt @@ -97,8 +97,8 @@ to work with it. (struct res_counter *rc, struct res_counter *top, unsinged long val) - Almost same as res_cunter_uncharge() but propagation of uncharge - stops when rc == top. This is useful when kill a res_coutner in + Almost same as res_counter_uncharge() but propagation of uncharge + stops when rc == top. This is useful when kill a res_counter in child cgroup. 2.1 Other accounting routines diff --git a/Documentation/devicetree/bindings/ata/marvell.txt b/Documentation/devicetree/bindings/ata/marvell.txt index b5cdd20cde9c..1c8351604d38 100644 --- a/Documentation/devicetree/bindings/ata/marvell.txt +++ b/Documentation/devicetree/bindings/ata/marvell.txt @@ -1,7 +1,7 @@ * Marvell Orion SATA Required Properties: -- compatibility : "marvell,orion-sata" +- compatibility : "marvell,orion-sata" or "marvell,armada-370-sata" - reg : Address range of controller - interrupts : Interrupt controller is using - nr-ports : Number of SATA ports in use. diff --git a/Documentation/devicetree/bindings/ata/sata_rcar.txt b/Documentation/devicetree/bindings/ata/sata_rcar.txt new file mode 100644 index 000000000000..1e6111333fa8 --- /dev/null +++ b/Documentation/devicetree/bindings/ata/sata_rcar.txt @@ -0,0 +1,18 @@ +* Renesas R-Car SATA + +Required properties: +- compatible : should contain one of the following: + - "renesas,sata-r8a7779" for R-Car H1 + - "renesas,sata-r8a7790" for R-Car H2 + - "renesas,sata-r8a7791" for R-Car M2 +- reg : address and length of the SATA registers; +- interrupts : must consist of one interrupt specifier. + +Example: + +sata: sata@fc600000 { + compatible = "renesas,sata-r8a7779"; + reg = <0xfc600000 0x2000>; + interrupt-parent = <&gic>; + interrupts = <0 100 IRQ_TYPE_LEVEL_HIGH>; +}; diff --git a/arch/arm/boot/dts/armada-370-xp.dtsi b/arch/arm/boot/dts/armada-370-xp.dtsi index 7f10f627ae5b..80ffacd128f8 100644 --- a/arch/arm/boot/dts/armada-370-xp.dtsi +++ b/arch/arm/boot/dts/armada-370-xp.dtsi @@ -152,7 +152,7 @@ }; sata@a0000 { - compatible = "marvell,orion-sata"; + compatible = "marvell,armada-370-sata"; reg = <0xa0000 0x5000>; interrupts = <55>; clocks = <&gateclk 15>, <&gateclk 30>; diff --git a/arch/arm/plat-samsung/include/plat/regs-ata.h b/arch/arm/plat-samsung/include/plat/regs-ata.h deleted file mode 100644 index f5df92fdae26..000000000000 --- a/arch/arm/plat-samsung/include/plat/regs-ata.h +++ /dev/null @@ -1,56 +0,0 @@ -/* linux/arch/arm/plat-samsung/include/plat/regs-ata.h - * - * Copyright (c) 2010 Samsung Electronics Co., Ltd. - * http://www.samsung.com - * - * Samsung CF-ATA register definitions - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. -*/ - -#ifndef __ASM_PLAT_REGS_ATA_H -#define __ASM_PLAT_REGS_ATA_H __FILE__ - -#define S3C_CFATA_REG(x) (x) - -#define S3C_CFATA_MUX S3C_CFATA_REG(0x0) - -#define S3C_ATA_CTRL S3C_CFATA_REG(0x0) -#define S3C_ATA_STATUS S3C_CFATA_REG(0x4) -#define S3C_ATA_CMD S3C_CFATA_REG(0x8) -#define S3C_ATA_SWRST S3C_CFATA_REG(0xc) -#define S3C_ATA_IRQ S3C_CFATA_REG(0x10) -#define S3C_ATA_IRQ_MSK S3C_CFATA_REG(0x14) -#define S3C_ATA_CFG S3C_CFATA_REG(0x18) - -#define S3C_ATA_MDMA_TIME S3C_CFATA_REG(0x28) -#define S3C_ATA_PIO_TIME S3C_CFATA_REG(0x2c) -#define S3C_ATA_UDMA_TIME S3C_CFATA_REG(0x30) -#define S3C_ATA_XFR_NUM S3C_CFATA_REG(0x34) -#define S3C_ATA_XFR_CNT S3C_CFATA_REG(0x38) -#define S3C_ATA_TBUF_START S3C_CFATA_REG(0x3c) -#define S3C_ATA_TBUF_SIZE S3C_CFATA_REG(0x40) -#define S3C_ATA_SBUF_START S3C_CFATA_REG(0x44) -#define S3C_ATA_SBUF_SIZE S3C_CFATA_REG(0x48) -#define S3C_ATA_CADR_TBUF S3C_CFATA_REG(0x4c) -#define S3C_ATA_CADR_SBUF S3C_CFATA_REG(0x50) -#define S3C_ATA_PIO_DTR S3C_CFATA_REG(0x54) -#define S3C_ATA_PIO_FED S3C_CFATA_REG(0x58) -#define S3C_ATA_PIO_SCR S3C_CFATA_REG(0x5c) -#define S3C_ATA_PIO_LLR S3C_CFATA_REG(0x60) -#define S3C_ATA_PIO_LMR S3C_CFATA_REG(0x64) -#define S3C_ATA_PIO_LHR S3C_CFATA_REG(0x68) -#define S3C_ATA_PIO_DVR S3C_CFATA_REG(0x6c) -#define S3C_ATA_PIO_CSD S3C_CFATA_REG(0x70) -#define S3C_ATA_PIO_DAD S3C_CFATA_REG(0x74) -#define S3C_ATA_PIO_READY S3C_CFATA_REG(0x78) -#define S3C_ATA_PIO_RDATA S3C_CFATA_REG(0x7c) - -#define S3C_CFATA_MUX_TRUEIDE 0x01 - -#define S3C_ATA_CFG_SWAP 0x40 -#define S3C_ATA_CFG_IORDYEN 0x02 - -#endif /* __ASM_PLAT_REGS_ATA_H */ diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 06534049afba..a760857e6b62 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1303,13 +1303,10 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, &rwstat); } -static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int tg_print_cpu_rwstat(struct seq_file *sf, void *v) { - struct blkcg *blkcg = css_to_blkcg(css); - - blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, - cft->private, true); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat, + &blkcg_policy_throtl, seq_cft(sf)->private, true); return 0; } @@ -1335,19 +1332,17 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, return __blkg_prfill_u64(sf, pd, v); } -static int tg_print_conf_u64(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int tg_print_conf_u64(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64, - &blkcg_policy_throtl, cft->private, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64, + &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } -static int tg_print_conf_uint(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int tg_print_conf_uint(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint, - &blkcg_policy_throtl, cft->private, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint, + &blkcg_policy_throtl, seq_cft(sf)->private, false); return 0; } @@ -1428,40 +1423,40 @@ static struct cftype throtl_files[] = { { .name = "throttle.read_bps_device", .private = offsetof(struct throtl_grp, bps[READ]), - .read_seq_string = tg_print_conf_u64, + .seq_show = tg_print_conf_u64, .write_string = tg_set_conf_u64, .max_write_len = 256, }, { .name = "throttle.write_bps_device", .private = offsetof(struct throtl_grp, bps[WRITE]), - .read_seq_string = tg_print_conf_u64, + .seq_show = tg_print_conf_u64, .write_string = tg_set_conf_u64, .max_write_len = 256, }, { .name = "throttle.read_iops_device", .private = offsetof(struct throtl_grp, iops[READ]), - .read_seq_string = tg_print_conf_uint, + .seq_show = tg_print_conf_uint, .write_string = tg_set_conf_uint, .max_write_len = 256, }, { .name = "throttle.write_iops_device", .private = offsetof(struct throtl_grp, iops[WRITE]), - .read_seq_string = tg_print_conf_uint, + .seq_show = tg_print_conf_uint, .write_string = tg_set_conf_uint, .max_write_len = 256, }, { .name = "throttle.io_service_bytes", .private = offsetof(struct tg_stats_cpu, service_bytes), - .read_seq_string = tg_print_cpu_rwstat, + .seq_show = tg_print_cpu_rwstat, }, { .name = "throttle.io_serviced", .private = offsetof(struct tg_stats_cpu, serviced), - .read_seq_string = tg_print_cpu_rwstat, + .seq_show = tg_print_cpu_rwstat, }, { } /* terminate */ }; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4d5cec1ad80d..744833b630c6 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1632,11 +1632,11 @@ static u64 cfqg_prfill_weight_device(struct seq_file *sf, return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); } -static int cfqg_print_weight_device(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int cfqg_print_weight_device(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_weight_device, - &blkcg_policy_cfq, 0, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_weight_device, &blkcg_policy_cfq, + 0, false); return 0; } @@ -1650,26 +1650,23 @@ static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf, return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight); } -static int cfqg_print_leaf_weight_device(struct cgroup_subsys_state *css, - struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v) { - blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_leaf_weight_device, - &blkcg_policy_cfq, 0, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, + 0, false); return 0; } -static int cfq_print_weight(struct cgroup_subsys_state *css, struct cftype *cft, - struct seq_file *sf) +static int cfq_print_weight(struct seq_file *sf, void *v) { - seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_weight); + seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight); return 0; } -static int cfq_print_leaf_weight(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int cfq_print_leaf_weight(struct seq_file *sf, void *v) { - seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_leaf_weight); + seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight); return 0; } @@ -1762,23 +1759,17 @@ static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, return __cfq_set_weight(css, cft, val, true); } -static int cfqg_print_stat(struct cgroup_subsys_state *css, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_stat(struct seq_file *sf, void *v) { - struct blkcg *blkcg = css_to_blkcg(css); - - blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq, - cft->private, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, + &blkcg_policy_cfq, seq_cft(sf)->private, false); return 0; } -static int cfqg_print_rwstat(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int cfqg_print_rwstat(struct seq_file *sf, void *v) { - struct blkcg *blkcg = css_to_blkcg(css); - - blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq, - cft->private, true); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, + &blkcg_policy_cfq, seq_cft(sf)->private, true); return 0; } @@ -1798,23 +1789,19 @@ static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, &sum); } -static int cfqg_print_stat_recursive(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int cfqg_print_stat_recursive(struct seq_file *sf, void *v) { - struct blkcg *blkcg = css_to_blkcg(css); - - blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive, - &blkcg_policy_cfq, cft->private, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_stat_recursive, &blkcg_policy_cfq, + seq_cft(sf)->private, false); return 0; } -static int cfqg_print_rwstat_recursive(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v) { - struct blkcg *blkcg = css_to_blkcg(css); - - blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive, - &blkcg_policy_cfq, cft->private, true); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq, + seq_cft(sf)->private, true); return 0; } @@ -1835,13 +1822,11 @@ static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, } /* print avg_queue_size */ -static int cfqg_print_avg_queue_size(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *sf) +static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) { - struct blkcg *blkcg = css_to_blkcg(css); - - blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size, - &blkcg_policy_cfq, 0, false); + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_avg_queue_size, &blkcg_policy_cfq, + 0, false); return 0; } #endif /* CONFIG_DEBUG_BLK_CGROUP */ @@ -1851,14 +1836,14 @@ static struct cftype cfq_blkcg_files[] = { { .name = "weight_device", .flags = CFTYPE_ONLY_ON_ROOT, - .read_seq_string = cfqg_print_leaf_weight_device, + .seq_show = cfqg_print_leaf_weight_device, .write_string = cfqg_set_leaf_weight_device, .max_write_len = 256, }, { .name = "weight", .flags = CFTYPE_ONLY_ON_ROOT, - .read_seq_string = cfq_print_leaf_weight, + .seq_show = cfq_print_leaf_weight, .write_u64 = cfq_set_leaf_weight, }, @@ -1866,26 +1851,26 @@ static struct cftype cfq_blkcg_files[] = { { .name = "weight_device", .flags = CFTYPE_NOT_ON_ROOT, - .read_seq_string = cfqg_print_weight_device, + .seq_show = cfqg_print_weight_device, .write_string = cfqg_set_weight_device, .max_write_len = 256, }, { .name = "weight", .flags = CFTYPE_NOT_ON_ROOT, - .read_seq_string = cfq_print_weight, + .seq_show = cfq_print_weight, .write_u64 = cfq_set_weight, }, { .name = "leaf_weight_device", - .read_seq_string = cfqg_print_leaf_weight_device, + .seq_show = cfqg_print_leaf_weight_device, .write_string = cfqg_set_leaf_weight_device, .max_write_len = 256, }, { .name = "leaf_weight", - .read_seq_string = cfq_print_leaf_weight, + .seq_show = cfq_print_leaf_weight, .write_u64 = cfq_set_leaf_weight, }, @@ -1893,114 +1878,114 @@ static struct cftype cfq_blkcg_files[] = { { .name = "time", .private = offsetof(struct cfq_group, stats.time), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, { .name = "sectors", .private = offsetof(struct cfq_group, stats.sectors), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, { .name = "io_service_bytes", .private = offsetof(struct cfq_group, stats.service_bytes), - .read_seq_string = cfqg_print_rwstat, + .seq_show = cfqg_print_rwstat, }, { .name = "io_serviced", .private = offsetof(struct cfq_group, stats.serviced), - .read_seq_string = cfqg_print_rwstat, + .seq_show = cfqg_print_rwstat, }, { .name = "io_service_time", .private = offsetof(struct cfq_group, stats.service_time), - .read_seq_string = cfqg_print_rwstat, + .seq_show = cfqg_print_rwstat, }, { .name = "io_wait_time", .private = offsetof(struct cfq_group, stats.wait_time), - .read_seq_string = cfqg_print_rwstat, + .seq_show = cfqg_print_rwstat, }, { .name = "io_merged", .private = offsetof(struct cfq_group, stats.merged), - .read_seq_string = cfqg_print_rwstat, + .seq_show = cfqg_print_rwstat, }, { .name = "io_queued", .private = offsetof(struct cfq_group, stats.queued), - .read_seq_string = cfqg_print_rwstat, + .seq_show = cfqg_print_rwstat, }, /* the same statictics which cover the cfqg and its descendants */ { .name = "time_recursive", .private = offsetof(struct cfq_group, stats.time), - .read_seq_string = cfqg_print_stat_recursive, + .seq_show = cfqg_print_stat_recursive, }, { .name = "sectors_recursive", .private = offsetof(struct cfq_group, stats.sectors), - .read_seq_string = cfqg_print_stat_recursive, + .seq_show = cfqg_print_stat_recursive, }, { .name = "io_service_bytes_recursive", .private = offsetof(struct cfq_group, stats.service_bytes), - .read_seq_string = cfqg_print_rwstat_recursive, + .seq_show = cfqg_print_rwstat_recursive, }, { .name = "io_serviced_recursive", .private = offsetof(struct cfq_group, stats.serviced), - .read_seq_string = cfqg_print_rwstat_recursive, + .seq_show = cfqg_print_rwstat_recursive, }, { .name = "io_service_time_recursive", .private = offsetof(struct cfq_group, stats.service_time), - .read_seq_string = cfqg_print_rwstat_recursive, + .seq_show = cfqg_print_rwstat_recursive, }, { .name = "io_wait_time_recursive", .private = offsetof(struct cfq_group, stats.wait_time), - .read_seq_string = cfqg_print_rwstat_recursive, + .seq_show = cfqg_print_rwstat_recursive, }, { .name = "io_merged_recursive", .private = offsetof(struct cfq_group, stats.merged), - .read_seq_string = cfqg_print_rwstat_recursive, + .seq_show = cfqg_print_rwstat_recursive, }, { .name = "io_queued_recursive", .private = offsetof(struct cfq_group, stats.queued), - .read_seq_string = cfqg_print_rwstat_recursive, + .seq_show = cfqg_print_rwstat_recursive, }, #ifdef CONFIG_DEBUG_BLK_CGROUP { .name = "avg_queue_size", - .read_seq_string = cfqg_print_avg_queue_size, + .seq_show = cfqg_print_avg_queue_size, }, { .name = "group_wait_time", .private = offsetof(struct cfq_group, stats.group_wait_time), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, { .name = "idle_time", .private = offsetof(struct cfq_group, stats.idle_time), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, { .name = "empty_time", .private = offsetof(struct cfq_group, stats.empty_time), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, { .name = "dequeue", .private = offsetof(struct cfq_group, stats.dequeue), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, { .name = "unaccounted_time", .private = offsetof(struct cfq_group, stats.unaccounted_time), - .read_seq_string = cfqg_print_stat, + .seq_show = cfqg_print_stat, }, #endif /* CONFIG_DEBUG_BLK_CGROUP */ { } /* terminate */ diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index e3a92a6da39a..74911c2cb1dd 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -83,6 +83,8 @@ enum board_ids { static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent); static int ahci_vt8251_hardreset(struct ata_link *link, unsigned int *class, unsigned long deadline); +static void ahci_mcp89_apple_enable(struct pci_dev *pdev); +static bool is_mcp89_apple(struct pci_dev *pdev); static int ahci_p5wdh_hardreset(struct ata_link *link, unsigned int *class, unsigned long deadline); #ifdef CONFIG_PM @@ -664,6 +666,10 @@ static int ahci_pci_device_resume(struct pci_dev *pdev) if (rc) return rc; + /* Apple BIOS helpfully mangles the registers on resume */ + if (is_mcp89_apple(pdev)) + ahci_mcp89_apple_enable(pdev); + if (pdev->dev.power.power_state.event == PM_EVENT_SUSPEND) { rc = ahci_pci_reset_controller(host); if (rc) @@ -780,6 +786,48 @@ static void ahci_p5wdh_workaround(struct ata_host *host) } } +/* + * Macbook7,1 firmware forcibly disables MCP89 AHCI and changes PCI ID when + * booting in BIOS compatibility mode. We restore the registers but not ID. + */ +static void ahci_mcp89_apple_enable(struct pci_dev *pdev) +{ + u32 val; + + printk(KERN_INFO "ahci: enabling MCP89 AHCI mode\n"); + + pci_read_config_dword(pdev, 0xf8, &val); + val |= 1 << 0x1b; + /* the following changes the device ID, but appears not to affect function */ + /* val = (val & ~0xf0000000) | 0x80000000; */ + pci_write_config_dword(pdev, 0xf8, val); + + pci_read_config_dword(pdev, 0x54c, &val); + val |= 1 << 0xc; + pci_write_config_dword(pdev, 0x54c, val); + + pci_read_config_dword(pdev, 0x4a4, &val); + val &= 0xff; + val |= 0x01060100; + pci_write_config_dword(pdev, 0x4a4, val); + + pci_read_config_dword(pdev, 0x54c, &val); + val &= ~(1 << 0xc); + pci_write_config_dword(pdev, 0x54c, val); + + pci_read_config_dword(pdev, 0xf8, &val); + val &= ~(1 << 0x1b); + pci_write_config_dword(pdev, 0xf8, val); +} + +static bool is_mcp89_apple(struct pci_dev *pdev) +{ + return pdev->vendor == PCI_VENDOR_ID_NVIDIA && + pdev->device == PCI_DEVICE_ID_NVIDIA_NFORCE_MCP89_SATA && + pdev->subsystem_vendor == PCI_VENDOR_ID_APPLE && + pdev->subsystem_device == 0xcb89; +} + /* only some SB600 ahci controllers can do 64bit DMA */ static bool ahci_sb600_enable_64bit(struct pci_dev *pdev) { @@ -1100,7 +1148,7 @@ static inline void ahci_gtf_filter_workaround(struct ata_host *host) {} #endif -int ahci_init_interrupts(struct pci_dev *pdev, struct ahci_host_priv *hpriv) +static int ahci_init_interrupts(struct pci_dev *pdev, struct ahci_host_priv *hpriv) { int rc; unsigned int maxvec; @@ -1212,15 +1260,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (pdev->vendor == PCI_VENDOR_ID_MARVELL && !marvell_enable) return -ENODEV; - /* - * For some reason, MCP89 on MacBook 7,1 doesn't work with - * ahci, use ata_generic instead. - */ - if (pdev->vendor == PCI_VENDOR_ID_NVIDIA && - pdev->device == PCI_DEVICE_ID_NVIDIA_NFORCE_MCP89_SATA && - pdev->subsystem_vendor == PCI_VENDOR_ID_APPLE && - pdev->subsystem_device == 0xcb89) - return -ENODEV; + /* Apple BIOS on MCP89 prevents us using AHCI */ + if (is_mcp89_apple(pdev)) + ahci_mcp89_apple_enable(pdev); /* Promise's PDC42819 is a SAS/SATA controller that has an AHCI mode. * At the moment, we can only use the AHCI mode. Let the users know diff --git a/drivers/ata/ahci_imx.c b/drivers/ata/ahci_imx.c index 3e23e9941dad..dd4d6f74d7bd 100644 --- a/drivers/ata/ahci_imx.c +++ b/drivers/ata/ahci_imx.c @@ -34,10 +34,21 @@ enum { HOST_TIMER1MS = 0xe0, /* Timer 1-ms */ }; +enum ahci_imx_type { + AHCI_IMX53, + AHCI_IMX6Q, +}; + struct imx_ahci_priv { struct platform_device *ahci_pdev; + enum ahci_imx_type type; + + /* i.MX53 clock */ + struct clk *sata_gate_clk; + /* Common clock */ struct clk *sata_ref_clk; struct clk *ahb_clk; + struct regmap *gpr; bool no_device; bool first_time; @@ -47,6 +58,59 @@ static int ahci_imx_hotplug; module_param_named(hotplug, ahci_imx_hotplug, int, 0644); MODULE_PARM_DESC(hotplug, "AHCI IMX hot-plug support (0=Don't support, 1=support)"); +static int imx_sata_clock_enable(struct device *dev) +{ + struct imx_ahci_priv *imxpriv = dev_get_drvdata(dev->parent); + int ret; + + if (imxpriv->type == AHCI_IMX53) { + ret = clk_prepare_enable(imxpriv->sata_gate_clk); + if (ret < 0) { + dev_err(dev, "prepare-enable sata_gate clock err:%d\n", + ret); + return ret; + } + } + + ret = clk_prepare_enable(imxpriv->sata_ref_clk); + if (ret < 0) { + dev_err(dev, "prepare-enable sata_ref clock err:%d\n", + ret); + goto clk_err; + } + + if (imxpriv->type == AHCI_IMX6Q) { + regmap_update_bits(imxpriv->gpr, IOMUXC_GPR13, + IMX6Q_GPR13_SATA_MPLL_CLK_EN, + IMX6Q_GPR13_SATA_MPLL_CLK_EN); + } + + usleep_range(1000, 2000); + + return 0; + +clk_err: + if (imxpriv->type == AHCI_IMX53) + clk_disable_unprepare(imxpriv->sata_gate_clk); + return ret; +} + +static void imx_sata_clock_disable(struct device *dev) +{ + struct imx_ahci_priv *imxpriv = dev_get_drvdata(dev->parent); + + if (imxpriv->type == AHCI_IMX6Q) { + regmap_update_bits(imxpriv->gpr, IOMUXC_GPR13, + IMX6Q_GPR13_SATA_MPLL_CLK_EN, + !IMX6Q_GPR13_SATA_MPLL_CLK_EN); + } + + clk_disable_unprepare(imxpriv->sata_ref_clk); + + if (imxpriv->type == AHCI_IMX53) + clk_disable_unprepare(imxpriv->sata_gate_clk); +} + static void ahci_imx_error_handler(struct ata_port *ap) { u32 reg_val; @@ -72,16 +136,29 @@ static void ahci_imx_error_handler(struct ata_port *ap) */ reg_val = readl(mmio + PORT_PHY_CTL); writel(reg_val | PORT_PHY_CTL_PDDQ_LOC, mmio + PORT_PHY_CTL); - regmap_update_bits(imxpriv->gpr, IOMUXC_GPR13, - IMX6Q_GPR13_SATA_MPLL_CLK_EN, - !IMX6Q_GPR13_SATA_MPLL_CLK_EN); - clk_disable_unprepare(imxpriv->sata_ref_clk); + imx_sata_clock_disable(ap->dev); imxpriv->no_device = true; } +static int ahci_imx_softreset(struct ata_link *link, unsigned int *class, + unsigned long deadline) +{ + struct ata_port *ap = link->ap; + struct imx_ahci_priv *imxpriv = dev_get_drvdata(ap->dev->parent); + int ret = -EIO; + + if (imxpriv->type == AHCI_IMX53) + ret = ahci_pmp_retry_srst_ops.softreset(link, class, deadline); + else if (imxpriv->type == AHCI_IMX6Q) + ret = ahci_ops.softreset(link, class, deadline); + + return ret; +} + static struct ata_port_operations ahci_imx_ops = { .inherits = &ahci_platform_ops, .error_handler = ahci_imx_error_handler, + .softreset = ahci_imx_softreset, }; static const struct ata_port_info ahci_imx_port_info = { @@ -91,52 +168,15 @@ static const struct ata_port_info ahci_imx_port_info = { .port_ops = &ahci_imx_ops, }; -static int imx6q_sata_init(struct device *dev, void __iomem *mmio) +static int imx_sata_init(struct device *dev, void __iomem *mmio) { int ret = 0; unsigned int reg_val; struct imx_ahci_priv *imxpriv = dev_get_drvdata(dev->parent); - imxpriv->gpr = - syscon_regmap_lookup_by_compatible("fsl,imx6q-iomuxc-gpr"); - if (IS_ERR(imxpriv->gpr)) { - dev_err(dev, "failed to find fsl,imx6q-iomux-gpr regmap\n"); - return PTR_ERR(imxpriv->gpr); - } - - ret = clk_prepare_enable(imxpriv->sata_ref_clk); - if (ret < 0) { - dev_err(dev, "prepare-enable sata_ref clock err:%d\n", ret); + ret = imx_sata_clock_enable(dev); + if (ret < 0) return ret; - } - - /* - * set PHY Paremeters, two steps to configure the GPR13, - * one write for rest of parameters, mask of first write - * is 0x07ffffff, and the other one write for setting - * the mpll_clk_en. - */ - regmap_update_bits(imxpriv->gpr, 0x34, IMX6Q_GPR13_SATA_RX_EQ_VAL_MASK - | IMX6Q_GPR13_SATA_RX_LOS_LVL_MASK - | IMX6Q_GPR13_SATA_RX_DPLL_MODE_MASK - | IMX6Q_GPR13_SATA_SPD_MODE_MASK - | IMX6Q_GPR13_SATA_MPLL_SS_EN - | IMX6Q_GPR13_SATA_TX_ATTEN_MASK - | IMX6Q_GPR13_SATA_TX_BOOST_MASK - | IMX6Q_GPR13_SATA_TX_LVL_MASK - | IMX6Q_GPR13_SATA_MPLL_CLK_EN - | IMX6Q_GPR13_SATA_TX_EDGE_RATE - , IMX6Q_GPR13_SATA_RX_EQ_VAL_3_0_DB - | IMX6Q_GPR13_SATA_RX_LOS_LVL_SATA2M - | IMX6Q_GPR13_SATA_RX_DPLL_MODE_2P_4F - | IMX6Q_GPR13_SATA_SPD_MODE_3P0G - | IMX6Q_GPR13_SATA_MPLL_SS_EN - | IMX6Q_GPR13_SATA_TX_ATTEN_9_16 - | IMX6Q_GPR13_SATA_TX_BOOST_3_33_DB - | IMX6Q_GPR13_SATA_TX_LVL_1_025_V); - regmap_update_bits(imxpriv->gpr, 0x34, IMX6Q_GPR13_SATA_MPLL_CLK_EN, - IMX6Q_GPR13_SATA_MPLL_CLK_EN); - usleep_range(100, 200); /* * Configure the HWINIT bits of the HOST_CAP and HOST_PORTS_IMPL, @@ -162,13 +202,9 @@ static int imx6q_sata_init(struct device *dev, void __iomem *mmio) return 0; } -static void imx6q_sata_exit(struct device *dev) +static void imx_sata_exit(struct device *dev) { - struct imx_ahci_priv *imxpriv = dev_get_drvdata(dev->parent); - - regmap_update_bits(imxpriv->gpr, 0x34, IMX6Q_GPR13_SATA_MPLL_CLK_EN, - !IMX6Q_GPR13_SATA_MPLL_CLK_EN); - clk_disable_unprepare(imxpriv->sata_ref_clk); + imx_sata_clock_disable(dev); } static int imx_ahci_suspend(struct device *dev) @@ -179,12 +215,8 @@ static int imx_ahci_suspend(struct device *dev) * If no_device is set, The CLKs had been gated off in the * initialization so don't do it again here. */ - if (!imxpriv->no_device) { - regmap_update_bits(imxpriv->gpr, IOMUXC_GPR13, - IMX6Q_GPR13_SATA_MPLL_CLK_EN, - !IMX6Q_GPR13_SATA_MPLL_CLK_EN); - clk_disable_unprepare(imxpriv->sata_ref_clk); - } + if (!imxpriv->no_device) + imx_sata_clock_disable(dev); return 0; } @@ -192,34 +224,26 @@ static int imx_ahci_suspend(struct device *dev) static int imx_ahci_resume(struct device *dev) { struct imx_ahci_priv *imxpriv = dev_get_drvdata(dev->parent); - int ret; - - if (!imxpriv->no_device) { - ret = clk_prepare_enable(imxpriv->sata_ref_clk); - if (ret < 0) { - dev_err(dev, "pre-enable sata_ref clock err:%d\n", ret); - return ret; - } + int ret = 0; - regmap_update_bits(imxpriv->gpr, IOMUXC_GPR13, - IMX6Q_GPR13_SATA_MPLL_CLK_EN, - IMX6Q_GPR13_SATA_MPLL_CLK_EN); - usleep_range(1000, 2000); - } + if (!imxpriv->no_device) + ret = imx_sata_clock_enable(dev); - return 0; + return ret; } -static struct ahci_platform_data imx6q_sata_pdata = { - .init = imx6q_sata_init, - .exit = imx6q_sata_exit, - .ata_port_info = &ahci_imx_port_info, - .suspend = imx_ahci_suspend, - .resume = imx_ahci_resume, +static struct ahci_platform_data imx_sata_pdata = { + .init = imx_sata_init, + .exit = imx_sata_exit, + .ata_port_info = &ahci_imx_port_info, + .suspend = imx_ahci_suspend, + .resume = imx_ahci_resume, + }; static const struct of_device_id imx_ahci_of_match[] = { - { .compatible = "fsl,imx6q-ahci", .data = &imx6q_sata_pdata}, + { .compatible = "fsl,imx53-ahci", .data = (void *)AHCI_IMX53 }, + { .compatible = "fsl,imx6q-ahci", .data = (void *)AHCI_IMX6Q }, {}, }; MODULE_DEVICE_TABLE(of, imx_ahci_of_match); @@ -229,12 +253,20 @@ static int imx_ahci_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct resource *mem, *irq, res[2]; const struct of_device_id *of_id; + enum ahci_imx_type type; const struct ahci_platform_data *pdata = NULL; struct imx_ahci_priv *imxpriv; struct device *ahci_dev; struct platform_device *ahci_pdev; int ret; + of_id = of_match_device(imx_ahci_of_match, dev); + if (!of_id) + return -EINVAL; + + type = (enum ahci_imx_type)of_id->data; + pdata = &imx_sata_pdata; + imxpriv = devm_kzalloc(dev, sizeof(*imxpriv), GFP_KERNEL); if (!imxpriv) { dev_err(dev, "can't alloc ahci_host_priv\n"); @@ -250,6 +282,8 @@ static int imx_ahci_probe(struct platform_device *pdev) imxpriv->no_device = false; imxpriv->first_time = true; + imxpriv->type = type; + imxpriv->ahb_clk = devm_clk_get(dev, "ahb"); if (IS_ERR(imxpriv->ahb_clk)) { dev_err(dev, "can't get ahb clock.\n"); @@ -257,6 +291,15 @@ static int imx_ahci_probe(struct platform_device *pdev) goto err_out; } + if (type == AHCI_IMX53) { + imxpriv->sata_gate_clk = devm_clk_get(dev, "sata_gate"); + if (IS_ERR(imxpriv->sata_gate_clk)) { + dev_err(dev, "can't get sata_gate clock.\n"); + ret = PTR_ERR(imxpriv->sata_gate_clk); + goto err_out; + } + } + imxpriv->sata_ref_clk = devm_clk_get(dev, "sata_ref"); if (IS_ERR(imxpriv->sata_ref_clk)) { dev_err(dev, "can't get sata_ref clock.\n"); @@ -267,14 +310,6 @@ static int imx_ahci_probe(struct platform_device *pdev) imxpriv->ahci_pdev = ahci_pdev; platform_set_drvdata(pdev, imxpriv); - of_id = of_match_device(imx_ahci_of_match, dev); - if (of_id) { - pdata = of_id->data; - } else { - ret = -EINVAL; - goto err_out; - } - mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0); if (!mem || !irq) { @@ -290,6 +325,43 @@ static int imx_ahci_probe(struct platform_device *pdev) ahci_dev->dma_mask = &ahci_dev->coherent_dma_mask; ahci_dev->of_node = dev->of_node; + if (type == AHCI_IMX6Q) { + imxpriv->gpr = syscon_regmap_lookup_by_compatible( + "fsl,imx6q-iomuxc-gpr"); + if (IS_ERR(imxpriv->gpr)) { + dev_err(dev, + "failed to find fsl,imx6q-iomux-gpr regmap\n"); + ret = PTR_ERR(imxpriv->gpr); + goto err_out; + } + + /* + * Set PHY Paremeters, two steps to configure the GPR13, + * one write for rest of parameters, mask of first write + * is 0x07fffffe, and the other one write for setting + * the mpll_clk_en happens in imx_sata_clock_enable(). + */ + regmap_update_bits(imxpriv->gpr, IOMUXC_GPR13, + IMX6Q_GPR13_SATA_RX_EQ_VAL_MASK | + IMX6Q_GPR13_SATA_RX_LOS_LVL_MASK | + IMX6Q_GPR13_SATA_RX_DPLL_MODE_MASK | + IMX6Q_GPR13_SATA_SPD_MODE_MASK | + IMX6Q_GPR13_SATA_MPLL_SS_EN | + IMX6Q_GPR13_SATA_TX_ATTEN_MASK | + IMX6Q_GPR13_SATA_TX_BOOST_MASK | + IMX6Q_GPR13_SATA_TX_LVL_MASK | + IMX6Q_GPR13_SATA_MPLL_CLK_EN | + IMX6Q_GPR13_SATA_TX_EDGE_RATE, + IMX6Q_GPR13_SATA_RX_EQ_VAL_3_0_DB | + IMX6Q_GPR13_SATA_RX_LOS_LVL_SATA2M | + IMX6Q_GPR13_SATA_RX_DPLL_MODE_2P_4F | + IMX6Q_GPR13_SATA_SPD_MODE_3P0G | + IMX6Q_GPR13_SATA_MPLL_SS_EN | + IMX6Q_GPR13_SATA_TX_ATTEN_9_16 | + IMX6Q_GPR13_SATA_TX_BOOST_3_33_DB | + IMX6Q_GPR13_SATA_TX_LVL_1_025_V); + } + ret = platform_device_add_resources(ahci_pdev, res, 2); if (ret) goto err_out; diff --git a/drivers/ata/ata_generic.c b/drivers/ata/ata_generic.c index f8f38a08abc5..7d196656adb5 100644 --- a/drivers/ata/ata_generic.c +++ b/drivers/ata/ata_generic.c @@ -221,13 +221,6 @@ static struct pci_device_id ata_generic[] = { { PCI_DEVICE(PCI_VENDOR_ID_OPTI, PCI_DEVICE_ID_OPTI_82C558), }, { PCI_DEVICE(PCI_VENDOR_ID_CENATEK,PCI_DEVICE_ID_CENATEK_IDE), .driver_data = ATA_GEN_FORCE_DMA }, - /* - * For some reason, MCP89 on MacBook 7,1 doesn't work with - * ahci, use ata_generic instead. - */ - { PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP89_SATA, - PCI_VENDOR_ID_APPLE, 0xcb89, - .driver_data = ATA_GEN_FORCE_DMA }, #if !defined(CONFIG_PATA_TOSHIBA) && !defined(CONFIG_PATA_TOSHIBA_MODULE) { PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_1), }, { PCI_DEVICE(PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_2), }, diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c index c482f8cadd7a..36605abe5a67 100644 --- a/drivers/ata/libahci.c +++ b/drivers/ata/libahci.c @@ -1764,7 +1764,7 @@ static void ahci_handle_port_interrupt(struct ata_port *ap, } } -void ahci_port_intr(struct ata_port *ap) +static void ahci_port_intr(struct ata_port *ap) { void __iomem *port_mmio = ahci_port_base(ap); u32 status; @@ -1797,7 +1797,7 @@ irqreturn_t ahci_thread_fn(int irq, void *dev_instance) } EXPORT_SYMBOL_GPL(ahci_thread_fn); -void ahci_hw_port_interrupt(struct ata_port *ap) +static void ahci_hw_port_interrupt(struct ata_port *ap) { void __iomem *port_mmio = ahci_port_base(ap); struct ahci_port_priv *pp = ap->private_data; diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 1393a5890ed5..1a3dbd1b196e 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2222,6 +2222,16 @@ int ata_dev_configure(struct ata_device *dev) if (rc) return rc; + /* some WD SATA-1 drives have issues with LPM, turn on NOLPM for them */ + if ((dev->horkage & ATA_HORKAGE_WD_BROKEN_LPM) && + (id[ATA_ID_SATA_CAPABILITY] & 0xe) == 0x2) + dev->horkage |= ATA_HORKAGE_NOLPM; + + if (dev->horkage & ATA_HORKAGE_NOLPM) { + ata_dev_warn(dev, "LPM support broken, forcing max_power\n"); + dev->link->ap->target_lpm_policy = ATA_LPM_MAX_POWER; + } + /* let ACPI work its magic */ rc = ata_acpi_on_devcfg(dev); if (rc) @@ -4216,6 +4226,23 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { "Micron_M500*", NULL, ATA_HORKAGE_NO_NCQ_TRIM, }, { "Crucial_CT???M500SSD1", NULL, ATA_HORKAGE_NO_NCQ_TRIM, }, + /* + * Some WD SATA-I drives spin up and down erratically when the link + * is put into the slumber mode. We don't have full list of the + * affected devices. Disable LPM if the device matches one of the + * known prefixes and is SATA-1. As a side effect LPM partial is + * lost too. + * + * https://bugzilla.kernel.org/show_bug.cgi?id=57211 + */ + { "WDC WD800JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM }, + { "WDC WD1200JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM }, + { "WDC WD1600JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM }, + { "WDC WD2000JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM }, + { "WDC WD2500JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM }, + { "WDC WD3000JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM }, + { "WDC WD3200JD-*", NULL, ATA_HORKAGE_WD_BROKEN_LPM }, + /* End Marker */ { } }; diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index 92d7797223be..6d8757008318 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -2402,7 +2402,7 @@ static void ata_eh_link_report(struct ata_link *link) struct ata_port *ap = link->ap; struct ata_eh_context *ehc = &link->eh_context; const char *frozen, *desc; - char tries_buf[6]; + char tries_buf[6] = ""; int tag, nr_failed = 0; if (ehc->i.flags & ATA_EHI_QUIET) @@ -2433,9 +2433,8 @@ static void ata_eh_link_report(struct ata_link *link) if (ap->pflags & ATA_PFLAG_FROZEN) frozen = " frozen"; - memset(tries_buf, 0, sizeof(tries_buf)); if (ap->eh_tries < ATA_EH_MAX_TRIES) - snprintf(tries_buf, sizeof(tries_buf) - 1, " t%d", + snprintf(tries_buf, sizeof(tries_buf), " t%d", ap->eh_tries); if (ehc->i.dev) { diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 377eb889f555..ef8567de6a75 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -111,12 +111,14 @@ static const char *ata_lpm_policy_names[] = { [ATA_LPM_MIN_POWER] = "min_power", }; -static ssize_t ata_scsi_lpm_store(struct device *dev, +static ssize_t ata_scsi_lpm_store(struct device *device, struct device_attribute *attr, const char *buf, size_t count) { - struct Scsi_Host *shost = class_to_shost(dev); + struct Scsi_Host *shost = class_to_shost(device); struct ata_port *ap = ata_shost_to_port(shost); + struct ata_link *link; + struct ata_device *dev; enum ata_lpm_policy policy; unsigned long flags; @@ -132,10 +134,20 @@ static ssize_t ata_scsi_lpm_store(struct device *dev, return -EINVAL; spin_lock_irqsave(ap->lock, flags); + + ata_for_each_link(link, ap, EDGE) { + ata_for_each_dev(dev, &ap->link, ENABLED) { + if (dev->horkage & ATA_HORKAGE_NOLPM) { + count = -EOPNOTSUPP; + goto out_unlock; + } + } + } + ap->target_lpm_policy = policy; ata_port_schedule_eh(ap); +out_unlock: spin_unlock_irqrestore(ap->lock, flags); - return count; } diff --git a/drivers/ata/pata_samsung_cf.c b/drivers/ata/pata_samsung_cf.c index 898e544a7ae8..a79566d05666 100644 --- a/drivers/ata/pata_samsung_cf.c +++ b/drivers/ata/pata_samsung_cf.c @@ -24,11 +24,34 @@ #include <linux/slab.h> #include <linux/platform_data/ata-samsung_cf.h> -#include <plat/regs-ata.h> #define DRV_NAME "pata_samsung_cf" #define DRV_VERSION "0.1" +#define S3C_CFATA_REG(x) (x) +#define S3C_CFATA_MUX S3C_CFATA_REG(0x0) +#define S3C_ATA_CTRL S3C_CFATA_REG(0x0) +#define S3C_ATA_CMD S3C_CFATA_REG(0x8) +#define S3C_ATA_IRQ S3C_CFATA_REG(0x10) +#define S3C_ATA_IRQ_MSK S3C_CFATA_REG(0x14) +#define S3C_ATA_CFG S3C_CFATA_REG(0x18) + +#define S3C_ATA_PIO_TIME S3C_CFATA_REG(0x2c) +#define S3C_ATA_PIO_DTR S3C_CFATA_REG(0x54) +#define S3C_ATA_PIO_FED S3C_CFATA_REG(0x58) +#define S3C_ATA_PIO_SCR S3C_CFATA_REG(0x5c) +#define S3C_ATA_PIO_LLR S3C_CFATA_REG(0x60) +#define S3C_ATA_PIO_LMR S3C_CFATA_REG(0x64) +#define S3C_ATA_PIO_LHR S3C_CFATA_REG(0x68) +#define S3C_ATA_PIO_DVR S3C_CFATA_REG(0x6c) +#define S3C_ATA_PIO_CSD S3C_CFATA_REG(0x70) +#define S3C_ATA_PIO_DAD S3C_CFATA_REG(0x74) +#define S3C_ATA_PIO_RDATA S3C_CFATA_REG(0x7c) + +#define S3C_CFATA_MUX_TRUEIDE 0x01 +#define S3C_ATA_CFG_SWAP 0x40 +#define S3C_ATA_CFG_IORDYEN 0x02 + enum s3c_cpu_type { TYPE_S3C64XX, TYPE_S5PC100, @@ -495,22 +518,10 @@ static int __init pata_s3c_probe(struct platform_device *pdev) info->irq = platform_get_irq(pdev, 0); res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (res == NULL) { - dev_err(dev, "failed to get mem resource\n"); - return -EINVAL; - } - - if (!devm_request_mem_region(dev, res->start, - resource_size(res), DRV_NAME)) { - dev_err(dev, "error requesting register region\n"); - return -EBUSY; - } - info->ide_addr = devm_ioremap(dev, res->start, resource_size(res)); - if (!info->ide_addr) { - dev_err(dev, "failed to map IO base address\n"); - return -ENOMEM; - } + info->ide_addr = devm_ioremap_resource(dev, res); + if (IS_ERR(info->ide_addr)) + return PTR_ERR(info->ide_addr); info->clk = devm_clk_get(&pdev->dev, "cfcon"); if (IS_ERR(info->clk)) { diff --git a/drivers/ata/sata_highbank.c b/drivers/ata/sata_highbank.c index ea3b3dc10f33..870b11eadc6d 100644 --- a/drivers/ata/sata_highbank.c +++ b/drivers/ata/sata_highbank.c @@ -29,7 +29,6 @@ #include <linux/of_address.h> #include <linux/platform_device.h> #include <linux/libata.h> -#include <linux/ahci_platform.h> #include <linux/interrupt.h> #include <linux/delay.h> #include <linux/export.h> diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index 56be31819897..20a7517bd339 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -60,6 +60,7 @@ #include <linux/dma-mapping.h> #include <linux/device.h> #include <linux/clk.h> +#include <linux/phy/phy.h> #include <linux/platform_device.h> #include <linux/ata_platform.h> #include <linux/mbus.h> @@ -304,6 +305,7 @@ enum { MV5_LTMODE = 0x30, MV5_PHY_CTL = 0x0C, SATA_IFCFG = 0x050, + LP_PHY_CTL = 0x058, MV_M2_PREAMP_MASK = 0x7e0, @@ -431,6 +433,7 @@ enum { MV_HP_CUT_THROUGH = (1 << 10), /* can use EDMA cut-through */ MV_HP_FLAG_SOC = (1 << 11), /* SystemOnChip, no PCI */ MV_HP_QUIRK_LED_BLINK_EN = (1 << 12), /* is led blinking enabled? */ + MV_HP_FIX_LP_PHY_CTL = (1 << 13), /* fix speed in LP_PHY_CTL ? */ /* Port private flags (pp_flags) */ MV_PP_FLAG_EDMA_EN = (1 << 0), /* is EDMA engine enabled? */ @@ -563,6 +566,12 @@ struct mv_host_priv { struct clk *clk; struct clk **port_clks; /* + * Some devices have a SATA PHY which can be enabled/disabled + * in order to save power. These are optional: if the platform + * devices does not have any phy, they won't be used. + */ + struct phy **port_phys; + /* * These consistent DMA memory pools give us guaranteed * alignment for hardware-accessed data structures, * and less memory waste in accomplishing the alignment. @@ -1358,6 +1367,7 @@ static int mv_scr_write(struct ata_link *link, unsigned int sc_reg_in, u32 val) if (ofs != 0xffffffffU) { void __iomem *addr = mv_ap_base(link->ap) + ofs; + struct mv_host_priv *hpriv = link->ap->host->private_data; if (sc_reg_in == SCR_CONTROL) { /* * Workaround for 88SX60x1 FEr SATA#26: @@ -1374,6 +1384,18 @@ static int mv_scr_write(struct ata_link *link, unsigned int sc_reg_in, u32 val) */ if ((val & 0xf) == 1 || (readl(addr) & 0xf) == 1) val |= 0xf000; + + if (hpriv->hp_flags & MV_HP_FIX_LP_PHY_CTL) { + void __iomem *lp_phy_addr = + mv_ap_base(link->ap) + LP_PHY_CTL; + /* + * Set PHY speed according to SControl speed. + */ + if ((val & 0xf0) == 0x10) + writelfl(0x7, lp_phy_addr); + else + writelfl(0x227, lp_phy_addr); + } } writelfl(val, addr); return 0; @@ -4076,6 +4098,11 @@ static int mv_platform_probe(struct platform_device *pdev) GFP_KERNEL); if (!hpriv->port_clks) return -ENOMEM; + hpriv->port_phys = devm_kzalloc(&pdev->dev, + sizeof(struct phy *) * n_ports, + GFP_KERNEL); + if (!hpriv->port_phys) + return -ENOMEM; host->private_data = hpriv; hpriv->n_ports = n_ports; hpriv->board_idx = chip_soc; @@ -4097,6 +4124,17 @@ static int mv_platform_probe(struct platform_device *pdev) hpriv->port_clks[port] = clk_get(&pdev->dev, port_number); if (!IS_ERR(hpriv->port_clks[port])) clk_prepare_enable(hpriv->port_clks[port]); + + sprintf(port_number, "port%d", port); + hpriv->port_phys[port] = devm_phy_get(&pdev->dev, port_number); + if (IS_ERR(hpriv->port_phys[port])) { + rc = PTR_ERR(hpriv->port_phys[port]); + hpriv->port_phys[port] = NULL; + if ((rc != -EPROBE_DEFER) && (rc != -ENODEV)) + dev_warn(&pdev->dev, "error getting phy"); + goto err; + } else + phy_power_on(hpriv->port_phys[port]); } /* @@ -4110,6 +4148,15 @@ static int mv_platform_probe(struct platform_device *pdev) if (rc) goto err; + /* + * To allow disk hotplug on Armada 370/XP SoCs, the PHY speed must be + * updated in the LP_PHY_CTL register. + */ + if (pdev->dev.of_node && + of_device_is_compatible(pdev->dev.of_node, + "marvell,armada-370-sata")) + hpriv->hp_flags |= MV_HP_FIX_LP_PHY_CTL; + /* initialize adapter */ rc = mv_init_host(host); if (rc) @@ -4132,6 +4179,8 @@ err: clk_disable_unprepare(hpriv->port_clks[port]); clk_put(hpriv->port_clks[port]); } + if (hpriv->port_phys[port]) + phy_power_off(hpriv->port_phys[port]); } return rc; @@ -4161,6 +4210,8 @@ static int mv_platform_remove(struct platform_device *pdev) clk_disable_unprepare(hpriv->port_clks[port]); clk_put(hpriv->port_clks[port]); } + if (hpriv->port_phys[port]) + phy_power_off(hpriv->port_phys[port]); } return 0; } @@ -4209,6 +4260,7 @@ static int mv_platform_resume(struct platform_device *pdev) #ifdef CONFIG_OF static struct of_device_id mv_sata_dt_ids[] = { + { .compatible = "marvell,armada-370-sata", }, { .compatible = "marvell,orion-sata", }, {}, }; diff --git a/drivers/ata/sata_rcar.c b/drivers/ata/sata_rcar.c index 1dae9a9009f7..2b25bd83fc9d 100644 --- a/drivers/ata/sata_rcar.c +++ b/drivers/ata/sata_rcar.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/ata.h> #include <linux/libata.h> +#include <linux/of_device.h> #include <linux/platform_device.h> #include <linux/clk.h> #include <linux/err.h> @@ -123,12 +124,37 @@ #define SATA_RCAR_DMA_BOUNDARY 0x1FFFFFFEUL +/* Gen2 Physical Layer Control Registers */ +#define RCAR_GEN2_PHY_CTL1_REG 0x1704 +#define RCAR_GEN2_PHY_CTL1 0x34180002 +#define RCAR_GEN2_PHY_CTL1_SS 0xC180 /* Spread Spectrum */ + +#define RCAR_GEN2_PHY_CTL2_REG 0x170C +#define RCAR_GEN2_PHY_CTL2 0x00002303 + +#define RCAR_GEN2_PHY_CTL3_REG 0x171C +#define RCAR_GEN2_PHY_CTL3 0x000B0194 + +#define RCAR_GEN2_PHY_CTL4_REG 0x1724 +#define RCAR_GEN2_PHY_CTL4 0x00030994 + +#define RCAR_GEN2_PHY_CTL5_REG 0x1740 +#define RCAR_GEN2_PHY_CTL5 0x03004001 +#define RCAR_GEN2_PHY_CTL5_DC BIT(1) /* DC connection */ +#define RCAR_GEN2_PHY_CTL5_TR BIT(2) /* Termination Resistor */ + +enum sata_rcar_type { + RCAR_GEN1_SATA, + RCAR_GEN2_SATA, +}; + struct sata_rcar_priv { void __iomem *base; struct clk *clk; + enum sata_rcar_type type; }; -static void sata_rcar_phy_initialize(struct sata_rcar_priv *priv) +static void sata_rcar_gen1_phy_preinit(struct sata_rcar_priv *priv) { void __iomem *base = priv->base; @@ -141,8 +167,8 @@ static void sata_rcar_phy_initialize(struct sata_rcar_priv *priv) iowrite32(0, base + SATAPHYRESET_REG); } -static void sata_rcar_phy_write(struct sata_rcar_priv *priv, u16 reg, u32 val, - int group) +static void sata_rcar_gen1_phy_write(struct sata_rcar_priv *priv, u16 reg, + u32 val, int group) { void __iomem *base = priv->base; int timeout; @@ -170,6 +196,29 @@ static void sata_rcar_phy_write(struct sata_rcar_priv *priv, u16 reg, u32 val, iowrite32(0, base + SATAPHYADDR_REG); } +static void sata_rcar_gen1_phy_init(struct sata_rcar_priv *priv) +{ + sata_rcar_gen1_phy_preinit(priv); + sata_rcar_gen1_phy_write(priv, SATAPCTLR1_REG, 0x00200188, 0); + sata_rcar_gen1_phy_write(priv, SATAPCTLR1_REG, 0x00200188, 1); + sata_rcar_gen1_phy_write(priv, SATAPCTLR3_REG, 0x0000A061, 0); + sata_rcar_gen1_phy_write(priv, SATAPCTLR2_REG, 0x20000000, 0); + sata_rcar_gen1_phy_write(priv, SATAPCTLR2_REG, 0x20000000, 1); + sata_rcar_gen1_phy_write(priv, SATAPCTLR4_REG, 0x28E80000, 0); +} + +static void sata_rcar_gen2_phy_init(struct sata_rcar_priv *priv) +{ + void __iomem *base = priv->base; + + iowrite32(RCAR_GEN2_PHY_CTL1, base + RCAR_GEN2_PHY_CTL1_REG); + iowrite32(RCAR_GEN2_PHY_CTL2, base + RCAR_GEN2_PHY_CTL2_REG); + iowrite32(RCAR_GEN2_PHY_CTL3, base + RCAR_GEN2_PHY_CTL3_REG); + iowrite32(RCAR_GEN2_PHY_CTL4, base + RCAR_GEN2_PHY_CTL4_REG); + iowrite32(RCAR_GEN2_PHY_CTL5 | RCAR_GEN2_PHY_CTL5_DC | + RCAR_GEN2_PHY_CTL5_TR, base + RCAR_GEN2_PHY_CTL5_REG); +} + static void sata_rcar_freeze(struct ata_port *ap) { struct sata_rcar_priv *priv = ap->host->private_data; @@ -738,13 +787,17 @@ static void sata_rcar_init_controller(struct ata_host *host) u32 val; /* reset and setup phy */ - sata_rcar_phy_initialize(priv); - sata_rcar_phy_write(priv, SATAPCTLR1_REG, 0x00200188, 0); - sata_rcar_phy_write(priv, SATAPCTLR1_REG, 0x00200188, 1); - sata_rcar_phy_write(priv, SATAPCTLR3_REG, 0x0000A061, 0); - sata_rcar_phy_write(priv, SATAPCTLR2_REG, 0x20000000, 0); - sata_rcar_phy_write(priv, SATAPCTLR2_REG, 0x20000000, 1); - sata_rcar_phy_write(priv, SATAPCTLR4_REG, 0x28E80000, 0); + switch (priv->type) { + case RCAR_GEN1_SATA: + sata_rcar_gen1_phy_init(priv); + break; + case RCAR_GEN2_SATA: + sata_rcar_gen2_phy_init(priv); + break; + default: + dev_warn(host->dev, "SATA phy is not initialized\n"); + break; + } /* SATA-IP reset state */ val = ioread32(base + ATAPI_CONTROL1_REG); @@ -770,8 +823,40 @@ static void sata_rcar_init_controller(struct ata_host *host) iowrite32(ATAPI_INT_ENABLE_SATAINT, base + ATAPI_INT_ENABLE_REG); } +static struct of_device_id sata_rcar_match[] = { + { + /* Deprecated by "renesas,sata-r8a7779" */ + .compatible = "renesas,rcar-sata", + .data = (void *)RCAR_GEN1_SATA, + }, + { + .compatible = "renesas,sata-r8a7779", + .data = (void *)RCAR_GEN1_SATA, + }, + { + .compatible = "renesas,sata-r8a7790", + .data = (void *)RCAR_GEN2_SATA + }, + { + .compatible = "renesas,sata-r8a7791", + .data = (void *)RCAR_GEN2_SATA + }, + { }, +}; +MODULE_DEVICE_TABLE(of, sata_rcar_match); + +static const struct platform_device_id sata_rcar_id_table[] = { + { "sata_rcar", RCAR_GEN1_SATA }, /* Deprecated by "sata-r8a7779" */ + { "sata-r8a7779", RCAR_GEN1_SATA }, + { "sata-r8a7790", RCAR_GEN2_SATA }, + { "sata-r8a7791", RCAR_GEN2_SATA }, + { }, +}; +MODULE_DEVICE_TABLE(platform, sata_rcar_id_table); + static int sata_rcar_probe(struct platform_device *pdev) { + const struct of_device_id *of_id; struct ata_host *host; struct sata_rcar_priv *priv; struct resource *mem; @@ -787,6 +872,12 @@ static int sata_rcar_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; + of_id = of_match_device(sata_rcar_match, &pdev->dev); + if (of_id) + priv->type = (enum sata_rcar_type)of_id->data; + else + priv->type = platform_get_device_id(pdev)->driver_data; + priv->clk = devm_clk_get(&pdev->dev, NULL); if (IS_ERR(priv->clk)) { dev_err(&pdev->dev, "failed to get access to sata clock\n"); @@ -892,15 +983,10 @@ static const struct dev_pm_ops sata_rcar_pm_ops = { }; #endif -static struct of_device_id sata_rcar_match[] = { - { .compatible = "renesas,rcar-sata", }, - {}, -}; -MODULE_DEVICE_TABLE(of, sata_rcar_match); - static struct platform_driver sata_rcar_driver = { .probe = sata_rcar_probe, .remove = sata_rcar_remove, + .id_table = sata_rcar_id_table, .driver = { .name = DRV_NAME, .owner = THIS_MODULE, diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index fbcc851ed5a5..61bcfc21d2a0 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -163,7 +163,6 @@ static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup) static void bcachecg_destroy(struct cgroup *cgroup) { struct bch_cgroup *cg = cgroup_to_bcache(cgroup); - free_css_id(&bcache_subsys, &cg->css); kfree(cg); } diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d90909ec6aa6..a5e34dd6a32c 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -649,6 +649,7 @@ static void process_sctp_notification(struct connection *con, struct msghdr *msg, char *buf) { union sctp_notification *sn = (union sctp_notification *)buf; + struct linger linger; switch (sn->sn_header.sn_type) { case SCTP_SEND_FAILED: @@ -727,6 +728,13 @@ static void process_sctp_notification(struct connection *con, } add_sock(new_con->sock, new_con); + linger.l_onoff = 1; + linger.l_linger = 0; + ret = kernel_setsockopt(new_con->sock, SOL_SOCKET, SO_LINGER, + (char *)&linger, sizeof(linger)); + if (ret < 0) + log_print("set socket option SO_LINGER failed"); + log_print("connecting to %d sctp association %d", nodeid, (int)sn->sn_assoc_change.sac_assoc_id); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 73f3e4ee4037..49436fa7cd4f 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -1032,8 +1032,9 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len); rv = filemap_write_and_wait_range(mapping, lstart, end); if (rv) - return rv; - truncate_inode_pages_range(mapping, lstart, end); + goto out; + if (rw == WRITE) + truncate_inode_pages_range(mapping, lstart, end); } rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, @@ -1080,30 +1081,22 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) bh = bh->b_this_page; } while(bh != head); spin_unlock(&sdp->sd_ail_lock); - gfs2_log_unlock(sdp); head = bh = page_buffers(page); do { - gfs2_log_lock(sdp); bd = bh->b_private; if (bd) { gfs2_assert_warn(sdp, bd->bd_bh == bh); - if (!list_empty(&bd->bd_list)) { - if (!buffer_pinned(bh)) - list_del_init(&bd->bd_list); - else - bd = NULL; - } - if (bd) - bd->bd_bh = NULL; + if (!list_empty(&bd->bd_list)) + list_del_init(&bd->bd_list); + bd->bd_bh = NULL; bh->b_private = NULL; - } - gfs2_log_unlock(sdp); - if (bd) kmem_cache_free(gfs2_bufdata_cachep, bd); + } bh = bh->b_this_page; } while (bh != head); + gfs2_log_unlock(sdp); return try_to_free_buffers(page); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 2e5fc268d324..fa32655449c8 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -834,6 +834,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, struct gfs2_leaf *leaf; struct gfs2_dirent *dent; struct qstr name = { .name = "" }; + struct timespec tv = CURRENT_TIME; error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); if (error) @@ -850,7 +851,11 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, leaf->lf_entries = 0; leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE); leaf->lf_next = 0; - memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved)); + leaf->lf_inode = cpu_to_be64(ip->i_no_addr); + leaf->lf_dist = cpu_to_be32(1); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); + memset(leaf->lf_reserved2, 0, sizeof(leaf->lf_reserved2)); dent = (struct gfs2_dirent *)(leaf+1); gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent); *pbh = bh; @@ -1612,11 +1617,31 @@ out: return ret; } +/** + * dir_new_leaf - Add a new leaf onto hash chain + * @inode: The directory + * @name: The name we are adding + * + * This adds a new dir leaf onto an existing leaf when there is not + * enough space to add a new dir entry. This is a last resort after + * we've expanded the hash table to max size and also split existing + * leaf blocks, so it will only occur for very large directories. + * + * The dist parameter is set to 1 for leaf blocks directly attached + * to the hash table, 2 for one layer of indirection, 3 for two layers + * etc. We are thus able to tell the difference between an old leaf + * with dist set to zero (i.e. "don't know") and a new one where we + * set this information for debug/fsck purposes. + * + * Returns: 0 on success, or -ve on error + */ + static int dir_new_leaf(struct inode *inode, const struct qstr *name) { struct buffer_head *bh, *obh; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_leaf *leaf, *oleaf; + u32 dist = 1; int error; u32 index; u64 bn; @@ -1626,6 +1651,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) if (error) return error; do { + dist++; oleaf = (struct gfs2_leaf *)obh->b_data; bn = be64_to_cpu(oleaf->lf_next); if (!bn) @@ -1643,6 +1669,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) brelse(obh); return -ENOSPC; } + leaf->lf_dist = cpu_to_be32(dist); oleaf->lf_next = cpu_to_be64(bh->b_blocknr); brelse(bh); brelse(obh); @@ -1659,39 +1686,53 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) /** * gfs2_dir_add - Add new filename into directory - * @dip: The GFS2 inode - * @filename: The new name - * @inode: The inode number of the entry - * @type: The type of the entry + * @inode: The directory inode + * @name: The new name + * @nip: The GFS2 inode to be linked in to the directory + * @da: The directory addition info + * + * If the call to gfs2_diradd_alloc_required resulted in there being + * no need to allocate any new directory blocks, then it will contain + * a pointer to the directory entry and the bh in which it resides. We + * can use that without having to repeat the search. If there was no + * free space, then we must now create more space. * * Returns: 0 on success, error code on failure */ int gfs2_dir_add(struct inode *inode, const struct qstr *name, - const struct gfs2_inode *nip) + const struct gfs2_inode *nip, struct gfs2_diradd *da) { struct gfs2_inode *ip = GFS2_I(inode); - struct buffer_head *bh; - struct gfs2_dirent *dent; + struct buffer_head *bh = da->bh; + struct gfs2_dirent *dent = da->dent; + struct timespec tv; struct gfs2_leaf *leaf; int error; while(1) { - dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, - &bh); + if (da->bh == NULL) { + dent = gfs2_dirent_search(inode, name, + gfs2_dirent_find_space, &bh); + } if (dent) { if (IS_ERR(dent)) return PTR_ERR(dent); dent = gfs2_init_dirent(inode, dent, name, bh); gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); + tv = CURRENT_TIME; if (ip->i_diskflags & GFS2_DIF_EXHASH) { leaf = (struct gfs2_leaf *)bh->b_data; be16_add_cpu(&leaf->lf_entries, 1); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); } + da->dent = NULL; + da->bh = NULL; brelse(bh); ip->i_entries++; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + ip->i_inode.i_mtime = ip->i_inode.i_ctime = tv; if (S_ISDIR(nip->i_inode.i_mode)) inc_nlink(&ip->i_inode); mark_inode_dirty(inode); @@ -1742,6 +1783,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) const struct qstr *name = &dentry->d_name; struct gfs2_dirent *dent, *prev = NULL; struct buffer_head *bh; + struct timespec tv = CURRENT_TIME; /* Returns _either_ the entry (if its first in block) or the previous entry otherwise */ @@ -1767,13 +1809,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) if (!entries) gfs2_consist_inode(dip); leaf->lf_entries = cpu_to_be16(--entries); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); } brelse(bh); if (!dip->i_entries) gfs2_consist_inode(dip); dip->i_entries--; - dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; + dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv; if (S_ISDIR(dentry->d_inode->i_mode)) drop_nlink(&dip->i_inode); mark_inode_dirty(&dip->i_inode); @@ -2017,22 +2061,36 @@ out: * gfs2_diradd_alloc_required - find if adding entry will require an allocation * @ip: the file being written to * @filname: the filename that's going to be added + * @da: The structure to return dir alloc info * - * Returns: 1 if alloc required, 0 if not, -ve on error + * Returns: 0 if ok, -ve on error */ -int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name) +int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name, + struct gfs2_diradd *da) { + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + const unsigned int extra = sizeof(struct gfs2_dinode) - sizeof(struct gfs2_leaf); struct gfs2_dirent *dent; struct buffer_head *bh; + da->nr_blocks = 0; + da->bh = NULL; + da->dent = NULL; + dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh); if (!dent) { - return 1; + da->nr_blocks = sdp->sd_max_dirres; + if (!(ip->i_diskflags & GFS2_DIF_EXHASH) && + (GFS2_DIRENT_SIZE(name->len) < extra)) + da->nr_blocks = 1; + return 0; } if (IS_ERR(dent)) return PTR_ERR(dent); - brelse(bh); + da->bh = bh; + da->dent = dent; return 0; } diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 4f03bbd1873f..126c65dda028 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -16,6 +16,14 @@ struct inode; struct gfs2_inode; struct gfs2_inum; +struct buffer_head; +struct gfs2_dirent; + +struct gfs2_diradd { + unsigned nr_blocks; + struct gfs2_dirent *dent; + struct buffer_head *bh; +}; extern struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename, @@ -23,7 +31,13 @@ extern struct inode *gfs2_dir_search(struct inode *dir, extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, const struct gfs2_inode *ip); extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, - const struct gfs2_inode *ip); + const struct gfs2_inode *ip, struct gfs2_diradd *da); +static inline void gfs2_dir_no_add(struct gfs2_diradd *da) +{ + if (da->bh) + brelse(da->bh); + da->bh = NULL; +} extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, struct file_ra_state *f_ra); @@ -33,7 +47,8 @@ extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); extern int gfs2_diradd_alloc_required(struct inode *dir, - const struct qstr *filename); + const struct qstr *filename, + struct gfs2_diradd *da); extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, struct buffer_head **bhp); extern void gfs2_dir_hash_inval(struct gfs2_inode *ip); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 6f7a47c05259..ca0be6c69a26 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1552,13 +1552,11 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp) glock_hash_walk(thaw_glock, sdp); } -static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) +static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl) { - int ret; spin_lock(&gl->gl_spin); - ret = gfs2_dump_glock(seq, gl); + gfs2_dump_glock(seq, gl); spin_unlock(&gl->gl_spin); - return ret; } static void dump_glock_func(struct gfs2_glock *gl) @@ -1647,10 +1645,9 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) * @seq: the seq_file struct * @gh: the glock holder * - * Returns: 0 on success, -ENOBUFS when we run out of space */ -static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) +static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) { struct task_struct *gh_owner = NULL; char flags_buf[32]; @@ -1666,7 +1663,6 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) gh_owner ? gh_owner->comm : "(ended)", (void *)gh->gh_ip); rcu_read_unlock(); - return 0; } static const char *gflags2str(char *buf, const struct gfs2_glock *gl) @@ -1721,16 +1717,14 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) * example. The field's are n = number (id of the object), f = flags, * t = type, s = state, r = refcount, e = error, p = pid. * - * Returns: 0 on success, -ENOBUFS when we run out of space */ -int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) +void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) { const struct gfs2_glock_operations *glops = gl->gl_ops; unsigned long long dtime; const struct gfs2_holder *gh; char gflags_buf[32]; - int error = 0; dtime = jiffies - gl->gl_demote_time; dtime *= 1000000/HZ; /* demote time in uSec */ @@ -1747,15 +1741,11 @@ int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) atomic_read(&gl->gl_revokes), (int)gl->gl_lockref.count, gl->gl_hold_time); - list_for_each_entry(gh, &gl->gl_holders, gh_list) { - error = dump_holder(seq, gh); - if (error) - goto out; - } + list_for_each_entry(gh, &gl->gl_holders, gh_list) + dump_holder(seq, gh); + if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump) - error = glops->go_dump(seq, gl); -out: - return error; + glops->go_dump(seq, gl); } static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr) @@ -1953,7 +1943,8 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) { - return dump_glock(seq, iter_ptr); + dump_glock(seq, iter_ptr); + return 0; } static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 6647d77366ba..32572f71f027 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -199,7 +199,7 @@ extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, struct gfs2_holder *gh); extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); -extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); +extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0) extern __printf(2, 3) void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index f88dcd925010..3bf0631b5d56 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -133,7 +133,8 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) static void rgrp_go_sync(struct gfs2_glock *gl) { - struct address_space *metamapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = &sdp->sd_aspace; struct gfs2_rgrpd *rgd; int error; @@ -141,10 +142,10 @@ static void rgrp_go_sync(struct gfs2_glock *gl) return; GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); - gfs2_log_flush(gl->gl_sbd, gl); - filemap_fdatawrite(metamapping); - error = filemap_fdatawait(metamapping); - mapping_set_error(metamapping, error); + gfs2_log_flush(sdp, gl); + filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end); + error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end); + mapping_set_error(mapping, error); gfs2_ail_empty_gl(gl); spin_lock(&gl->gl_spin); @@ -166,11 +167,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl) static void rgrp_go_inval(struct gfs2_glock *gl, int flags) { - struct address_space *mapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; + struct address_space *mapping = &sdp->sd_aspace; WARN_ON_ONCE(!(flags & DIO_METADATA)); - gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); - truncate_inode_pages(mapping, 0); + gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); + truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end); if (gl->gl_object) { struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object; @@ -435,21 +437,19 @@ static int inode_go_lock(struct gfs2_holder *gh) * @seq: The iterator * @ip: the inode * - * Returns: 0 on success, -ENOBUFS when we run out of space */ -static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) +static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) { const struct gfs2_inode *ip = gl->gl_object; if (ip == NULL) - return 0; + return; gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n", (unsigned long long)ip->i_no_formal_ino, (unsigned long long)ip->i_no_addr, IF2DT(ip->i_inode.i_mode), ip->i_flags, (unsigned int)ip->i_diskflags, (unsigned long long)i_size_read(&ip->i_inode)); - return 0; } /** @@ -558,7 +558,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_unlock = gfs2_rgrp_go_unlock, .go_dump = gfs2_rgrp_dump, .go_type = LM_TYPE_RGRP, - .go_flags = GLOF_ASPACE | GLOF_LVB, + .go_flags = GLOF_LVB, }; const struct gfs2_glock_operations gfs2_trans_glops = { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index ba1ea67f4eeb..cf0e34400f71 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -93,6 +93,7 @@ struct gfs2_rgrpd { struct gfs2_rgrp_lvb *rd_rgl; u32 rd_last_alloc; u32 rd_flags; + u32 rd_extfail_pt; /* extent failure point */ #define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ #define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ #define GFS2_RDF_ERROR 0x40000000 /* error in rg */ @@ -217,7 +218,7 @@ struct gfs2_glock_operations { int (*go_demote_ok) (const struct gfs2_glock *gl); int (*go_lock) (struct gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); - int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); + void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); void (*go_callback)(struct gfs2_glock *gl, bool remote); const int go_type; const unsigned long go_flags; @@ -350,7 +351,15 @@ struct gfs2_glock { atomic_t gl_ail_count; atomic_t gl_revokes; struct delayed_work gl_work; - struct work_struct gl_delete; + union { + /* For inode and iopen glocks only */ + struct work_struct gl_delete; + /* For rgrp glocks only */ + struct { + loff_t start; + loff_t end; + } gl_vm; + }; struct rcu_head gl_rcu; }; @@ -419,10 +428,13 @@ enum { }; struct gfs2_quota_data { + struct hlist_bl_node qd_hlist; struct list_head qd_list; struct kqid qd_id; + struct gfs2_sbd *qd_sbd; struct lockref qd_lockref; struct list_head qd_lru; + unsigned qd_hash; unsigned long qd_flags; /* QDF_... */ @@ -441,6 +453,7 @@ struct gfs2_quota_data { u64 qd_sync_gen; unsigned long qd_last_warn; + struct rcu_head qd_rcu; }; struct gfs2_trans { @@ -720,13 +733,15 @@ struct gfs2_sbd { spinlock_t sd_trunc_lock; unsigned int sd_quota_slots; - unsigned int sd_quota_chunks; - unsigned char **sd_quota_bitmap; + unsigned long *sd_quota_bitmap; + spinlock_t sd_bitmap_lock; u64 sd_quota_sync_gen; /* Log stuff */ + struct address_space sd_aspace; + spinlock_t sd_log_lock; struct gfs2_trans *sd_log_tr; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 7119504159f1..890588c7fb33 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -149,7 +149,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, ip = GFS2_I(inode); if (!inode) - return ERR_PTR(-ENOBUFS); + return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -469,14 +469,36 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, brelse(dibh); } +/** + * gfs2_trans_da_blocks - Calculate number of blocks to link inode + * @dip: The directory we are linking into + * @da: The dir add information + * @nr_inodes: The number of inodes involved + * + * This calculate the number of blocks we need to reserve in a + * transaction to link @nr_inodes into a directory. In most cases + * @nr_inodes will be 2 (the directory plus the inode being linked in) + * but in case of rename, 4 may be required. + * + * Returns: Number of blocks + */ + +static unsigned gfs2_trans_da_blks(const struct gfs2_inode *dip, + const struct gfs2_diradd *da, + unsigned nr_inodes) +{ + return da->nr_blocks + gfs2_rg_blocks(dip, da->nr_blocks) + + (nr_inodes * RES_DINODE) + RES_QUOTA + RES_STATFS; +} + static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip, int arq) + struct gfs2_inode *ip, struct gfs2_diradd *da) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; + struct gfs2_alloc_parms ap = { .target = da->nr_blocks, }; int error; - if (arq) { + if (da->nr_blocks) { error = gfs2_quota_lock_check(dip); if (error) goto fail_quota_locks; @@ -485,10 +507,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, if (error) goto fail_quota_locks; - error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - dip->i_rgd->rd_length + - 2 * RES_DINODE + - RES_STATFS + RES_QUOTA, 0); + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, da, 2), 0); if (error) goto fail_ipreserv; } else { @@ -497,7 +516,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, goto fail_quota_locks; } - error = gfs2_dir_add(&dip->i_inode, name, ip); + error = gfs2_dir_add(&dip->i_inode, name, ip, da); if (error) goto fail_end_trans; @@ -560,7 +579,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct dentry *d; int error; u32 aflags = 0; - int arq; + struct gfs2_diradd da = { .bh = NULL, }; if (!name->len || name->len > GFS2_FNAMESIZE) return -ENAMETOOLONG; @@ -585,6 +604,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = PTR_ERR(inode); if (!IS_ERR(inode)) { d = d_splice_alias(inode, dentry); + error = PTR_ERR(d); + if (IS_ERR(d)) + goto fail_gunlock; error = 0; if (file) { if (S_ISREG(inode->i_mode)) { @@ -602,7 +624,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, goto fail_gunlock; } - arq = error = gfs2_diradd_alloc_required(dir, name); + error = gfs2_diradd_alloc_required(dir, name, &da); if (error < 0) goto fail_gunlock; @@ -690,7 +712,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock3; - error = link_dinode(dip, name, ip, arq); + error = link_dinode(dip, name, ip, &da); if (error) goto fail_gunlock3; @@ -719,6 +741,7 @@ fail_free_inode: free_inode_nonrcu(inode); inode = NULL; fail_gunlock: + gfs2_dir_no_add(&da); gfs2_glock_dq_uninit(ghs); if (inode && !IS_ERR(inode)) { clear_nlink(inode); @@ -779,6 +802,11 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, } d = d_splice_alias(inode, dentry); + if (IS_ERR(d)) { + iput(inode); + gfs2_glock_dq_uninit(&gh); + return d; + } if (file && S_ISREG(inode->i_mode)) error = finish_open(file, dentry, gfs2_open_common, opened); @@ -817,7 +845,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder ghs[2]; struct buffer_head *dibh; - int alloc_required; + struct gfs2_diradd da = { .bh = NULL, }; int error; if (S_ISDIR(inode->i_mode)) @@ -872,13 +900,12 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (ip->i_inode.i_nlink == (u32)-1) goto out_gunlock; - alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name); + error = gfs2_diradd_alloc_required(dir, &dentry->d_name, &da); if (error < 0) goto out_gunlock; - error = 0; - if (alloc_required) { - struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; + if (da.nr_blocks) { + struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; error = gfs2_quota_lock_check(dip); if (error) goto out_gunlock; @@ -887,10 +914,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_gunlock_q; - error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(dip, sdp->sd_max_dirres) + - 2 * RES_DINODE + RES_STATFS + - RES_QUOTA, 0); + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, &da, 2), 0); if (error) goto out_ipres; } else { @@ -903,7 +927,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (error) goto out_end_trans; - error = gfs2_dir_add(dir, &dentry->d_name, ip); + error = gfs2_dir_add(dir, &dentry->d_name, ip, &da); if (error) goto out_brelse; @@ -919,12 +943,13 @@ out_brelse: out_end_trans: gfs2_trans_end(sdp); out_ipres: - if (alloc_required) + if (da.nr_blocks) gfs2_inplace_release(dip); out_gunlock_q: - if (alloc_required) + if (da.nr_blocks) gfs2_quota_unlock(dip); out_gunlock: + gfs2_dir_no_add(&da); gfs2_glock_dq(ghs + 1); out_child: gfs2_glock_dq(ghs); @@ -1254,7 +1279,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; - int alloc_required = 0; + struct gfs2_diradd da = { .nr_blocks = 0, }; unsigned int x; int error; @@ -1388,14 +1413,14 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, goto out_gunlock; } - if (nip == NULL) - alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); - error = alloc_required; - if (error < 0) - goto out_gunlock; + if (nip == NULL) { + error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name, &da); + if (error) + goto out_gunlock; + } - if (alloc_required) { - struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, }; + if (da.nr_blocks) { + struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; error = gfs2_quota_lock_check(ndip); if (error) goto out_gunlock; @@ -1404,10 +1429,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_gunlock_q; - error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(ndip, sdp->sd_max_dirres) + - 4 * RES_DINODE + 4 * RES_LEAF + - RES_STATFS + RES_QUOTA + 4, 0); + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(ndip, &da, 4) + + 4 * RES_LEAF + 4, 0); if (error) goto out_ipreserv; } else { @@ -1441,19 +1464,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_end_trans; - error = gfs2_dir_add(ndir, &ndentry->d_name, ip); + error = gfs2_dir_add(ndir, &ndentry->d_name, ip, &da); if (error) goto out_end_trans; out_end_trans: gfs2_trans_end(sdp); out_ipreserv: - if (alloc_required) + if (da.nr_blocks) gfs2_inplace_release(ndip); out_gunlock_q: - if (alloc_required) + if (da.nr_blocks) gfs2_quota_unlock(ndip); out_gunlock: + gfs2_dir_no_add(&da); while (x--) { gfs2_glock_dq(ghs + x); gfs2_holder_uninit(ghs + x); @@ -1607,10 +1631,22 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid)) ogid = ngid = NO_GID_QUOTA_CHANGE; - error = gfs2_quota_lock(ip, nuid, ngid); + error = get_write_access(inode); if (error) return error; + error = gfs2_rs_alloc(ip); + if (error) + goto out; + + error = gfs2_rindex_update(sdp); + if (error) + goto out; + + error = gfs2_quota_lock(ip, nuid, ngid); + if (error) + goto out; + if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { error = gfs2_quota_check(ip, nuid, ngid); @@ -1637,6 +1673,8 @@ out_end_trans: gfs2_trans_end(sdp); out_gunlock_q: gfs2_quota_unlock(ip); +out: + put_write_access(inode); return error; } diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 010b9fb9fec6..58f06400b7b8 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -83,6 +83,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd) bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); clear_bit(GBF_FULL, &bi->bi_flags); rgd->rd_free_clone = rgd->rd_free; + rgd->rd_extfail_pt = rgd->rd_free; } /** @@ -588,8 +589,12 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start, static void gfs2_meta_sync(struct gfs2_glock *gl) { struct address_space *mapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_sbd; int error; + if (mapping == NULL) + mapping = &sdp->sd_aspace; + filemap_fdatawrite(mapping); error = filemap_fdatawait(mapping); diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 0650db2541ef..c272e73063de 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -76,6 +76,7 @@ static int __init init_gfs2_fs(void) gfs2_str2qstr(&gfs2_qdot, "."); gfs2_str2qstr(&gfs2_qdotdot, ".."); + gfs2_quota_hash_init(); error = gfs2_sys_init(); if (error) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 52f177be3bf8..c7f24690ed05 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -116,6 +116,9 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) unsigned long index; unsigned int bufnum; + if (mapping == NULL) + mapping = &sdp->sd_aspace; + shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift; index = blkno >> shift; /* convert block to page */ bufnum = blkno - (index << shift); /* block buf index within page */ diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 52fa88314f5c..1e712b566d76 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -36,6 +36,7 @@ #include "log.h" #include "quota.h" #include "dir.h" +#include "meta_io.h" #include "trace_gfs2.h" #define DO 0 @@ -62,6 +63,7 @@ static void gfs2_tune_init(struct gfs2_tune *gt) static struct gfs2_sbd *init_sbd(struct super_block *sb) { struct gfs2_sbd *sdp; + struct address_space *mapping; sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL); if (!sdp) @@ -97,6 +99,18 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) init_waitqueue_head(&sdp->sd_quota_wait); INIT_LIST_HEAD(&sdp->sd_trunc_list); spin_lock_init(&sdp->sd_trunc_lock); + spin_lock_init(&sdp->sd_bitmap_lock); + + mapping = &sdp->sd_aspace; + + address_space_init_once(mapping); + mapping->a_ops = &gfs2_meta_aops; + mapping->host = sb->s_bdev->bd_inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->private_data = NULL; + mapping->backing_dev_info = sb->s_bdi; + mapping->writeback_index = 0; spin_lock_init(&sdp->sd_log_lock); atomic_set(&sdp->sd_log_pinned, 0); @@ -217,7 +231,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) page = alloc_page(GFP_NOFS); if (unlikely(!page)) - return -ENOBUFS; + return -ENOMEM; ClearPageUptodate(page); ClearPageDirty(page); @@ -956,40 +970,6 @@ fail: return error; } -static int init_threads(struct gfs2_sbd *sdp, int undo) -{ - struct task_struct *p; - int error = 0; - - if (undo) - goto fail_quotad; - - p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); - if (IS_ERR(p)) { - error = PTR_ERR(p); - fs_err(sdp, "can't start logd thread: %d\n", error); - return error; - } - sdp->sd_logd_process = p; - - p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); - if (IS_ERR(p)) { - error = PTR_ERR(p); - fs_err(sdp, "can't start quotad thread: %d\n", error); - goto fail; - } - sdp->sd_quotad_process = p; - - return 0; - - -fail_quotad: - kthread_stop(sdp->sd_quotad_process); -fail: - kthread_stop(sdp->sd_logd_process); - return error; -} - static const match_table_t nolock_tokens = { { Opt_jid, "jid=%d\n", }, { Opt_err, NULL }, @@ -1254,15 +1234,11 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent goto fail_per_node; } - error = init_threads(sdp, DO); - if (error) - goto fail_per_node; - if (!(sb->s_flags & MS_RDONLY)) { error = gfs2_make_fs_rw(sdp); if (error) { fs_err(sdp, "can't make FS RW: %d\n", error); - goto fail_threads; + goto fail_per_node; } } @@ -1270,8 +1246,6 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent gfs2_online_uevent(sdp); return 0; -fail_threads: - init_threads(sdp, UNDO); fail_per_node: init_per_node(sdp, UNDO); fail_inodes: diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 98236d0df3ca..8bec0e3192dd 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -52,6 +52,11 @@ #include <linux/dqblk_xfs.h> #include <linux/lockref.h> #include <linux/list_lru.h> +#include <linux/rcupdate.h> +#include <linux/rculist_bl.h> +#include <linux/bit_spinlock.h> +#include <linux/jhash.h> +#include <linux/vmalloc.h> #include "gfs2.h" #include "incore.h" @@ -67,16 +72,44 @@ #include "inode.h" #include "util.h" -struct gfs2_quota_change_host { - u64 qc_change; - u32 qc_flags; /* GFS2_QCF_... */ - struct kqid qc_id; -}; +#define GFS2_QD_HASH_SHIFT 12 +#define GFS2_QD_HASH_SIZE (1 << GFS2_QD_HASH_SHIFT) +#define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1) -/* Lock order: qd_lock -> qd->lockref.lock -> lru lock */ +/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */ +/* -> sd_bitmap_lock */ static DEFINE_SPINLOCK(qd_lock); struct list_lru gfs2_qd_lru; +static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE]; + +static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp, + const struct kqid qid) +{ + unsigned int h; + + h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0); + h = jhash(&qid, sizeof(struct kqid), h); + + return h & GFS2_QD_HASH_MASK; +} + +static inline void spin_lock_bucket(unsigned int hash) +{ + hlist_bl_lock(&qd_hash_table[hash]); +} + +static inline void spin_unlock_bucket(unsigned int hash) +{ + hlist_bl_unlock(&qd_hash_table[hash]); +} + +static void gfs2_qd_dealloc(struct rcu_head *rcu) +{ + struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu); + kmem_cache_free(gfs2_quotad_cachep, qd); +} + static void gfs2_qd_dispose(struct list_head *list) { struct gfs2_quota_data *qd; @@ -93,6 +126,10 @@ static void gfs2_qd_dispose(struct list_head *list) list_del(&qd->qd_list); spin_unlock(&qd_lock); + spin_lock_bucket(qd->qd_hash); + hlist_bl_del_rcu(&qd->qd_hlist); + spin_unlock_bucket(qd->qd_hash); + gfs2_assert_warn(sdp, !qd->qd_change); gfs2_assert_warn(sdp, !qd->qd_slot_count); gfs2_assert_warn(sdp, !qd->qd_bh_count); @@ -101,7 +138,7 @@ static void gfs2_qd_dispose(struct list_head *list) atomic_dec(&sdp->sd_quota_count); /* Delete it from the common reclaim list */ - kmem_cache_free(gfs2_quotad_cachep, qd); + call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); } } @@ -171,83 +208,95 @@ static u64 qd2offset(struct gfs2_quota_data *qd) return offset; } -static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid, - struct gfs2_quota_data **qdp) +static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid) { struct gfs2_quota_data *qd; int error; qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS); if (!qd) - return -ENOMEM; + return NULL; + qd->qd_sbd = sdp; qd->qd_lockref.count = 1; spin_lock_init(&qd->qd_lockref.lock); qd->qd_id = qid; qd->qd_slot = -1; INIT_LIST_HEAD(&qd->qd_lru); + qd->qd_hash = hash; error = gfs2_glock_get(sdp, qd2index(qd), &gfs2_quota_glops, CREATE, &qd->qd_gl); if (error) goto fail; - *qdp = qd; - - return 0; + return qd; fail: kmem_cache_free(gfs2_quotad_cachep, qd); - return error; + return NULL; } -static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, - struct gfs2_quota_data **qdp) +static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash, + const struct gfs2_sbd *sdp, + struct kqid qid) { - struct gfs2_quota_data *qd = NULL, *new_qd = NULL; - int error, found; - - *qdp = NULL; + struct gfs2_quota_data *qd; + struct hlist_bl_node *h; - for (;;) { - found = 0; - spin_lock(&qd_lock); - list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { - if (qid_eq(qd->qd_id, qid) && - lockref_get_not_dead(&qd->qd_lockref)) { - list_lru_del(&gfs2_qd_lru, &qd->qd_lru); - found = 1; - break; - } + hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) { + if (!qid_eq(qd->qd_id, qid)) + continue; + if (qd->qd_sbd != sdp) + continue; + if (lockref_get_not_dead(&qd->qd_lockref)) { + list_lru_del(&gfs2_qd_lru, &qd->qd_lru); + return qd; } + } - if (!found) - qd = NULL; + return NULL; +} - if (!qd && new_qd) { - qd = new_qd; - list_add(&qd->qd_list, &sdp->sd_quota_list); - atomic_inc(&sdp->sd_quota_count); - new_qd = NULL; - } - spin_unlock(&qd_lock); +static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, + struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd, *new_qd; + unsigned int hash = gfs2_qd_hash(sdp, qid); - if (qd) { - if (new_qd) { - gfs2_glock_put(new_qd->qd_gl); - kmem_cache_free(gfs2_quotad_cachep, new_qd); - } - *qdp = qd; - return 0; - } + rcu_read_lock(); + *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); + rcu_read_unlock(); - error = qd_alloc(sdp, qid, &new_qd); - if (error) - return error; + if (qd) + return 0; + + new_qd = qd_alloc(hash, sdp, qid); + if (!new_qd) + return -ENOMEM; + + spin_lock(&qd_lock); + spin_lock_bucket(hash); + *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); + if (qd == NULL) { + *qdp = new_qd; + list_add(&new_qd->qd_list, &sdp->sd_quota_list); + hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]); + atomic_inc(&sdp->sd_quota_count); } + spin_unlock_bucket(hash); + spin_unlock(&qd_lock); + + if (qd) { + gfs2_glock_put(new_qd->qd_gl); + kmem_cache_free(gfs2_quotad_cachep, new_qd); + } + + return 0; } + static void qd_hold(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; @@ -268,88 +317,48 @@ static void qd_put(struct gfs2_quota_data *qd) static int slot_get(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; - unsigned int c, o = 0, b; - unsigned char byte = 0; + struct gfs2_sbd *sdp = qd->qd_sbd; + unsigned int bit; + int error = 0; - spin_lock(&qd_lock); + spin_lock(&sdp->sd_bitmap_lock); + if (qd->qd_slot_count != 0) + goto out; - if (qd->qd_slot_count++) { - spin_unlock(&qd_lock); - return 0; + error = -ENOSPC; + bit = find_first_zero_bit(sdp->sd_quota_bitmap, sdp->sd_quota_slots); + if (bit < sdp->sd_quota_slots) { + set_bit(bit, sdp->sd_quota_bitmap); + qd->qd_slot = bit; +out: + qd->qd_slot_count++; } + spin_unlock(&sdp->sd_bitmap_lock); - for (c = 0; c < sdp->sd_quota_chunks; c++) - for (o = 0; o < PAGE_SIZE; o++) { - byte = sdp->sd_quota_bitmap[c][o]; - if (byte != 0xFF) - goto found; - } - - goto fail; - -found: - for (b = 0; b < 8; b++) - if (!(byte & (1 << b))) - break; - qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b; - - if (qd->qd_slot >= sdp->sd_quota_slots) - goto fail; - - sdp->sd_quota_bitmap[c][o] |= 1 << b; - - spin_unlock(&qd_lock); - - return 0; - -fail: - qd->qd_slot_count--; - spin_unlock(&qd_lock); - return -ENOSPC; + return error; } static void slot_hold(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; - spin_lock(&qd_lock); + spin_lock(&sdp->sd_bitmap_lock); gfs2_assert(sdp, qd->qd_slot_count); qd->qd_slot_count++; - spin_unlock(&qd_lock); -} - -static void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, - unsigned int bit, int new_value) -{ - unsigned int c, o, b = bit; - int old_value; - - c = b / (8 * PAGE_SIZE); - b %= 8 * PAGE_SIZE; - o = b / 8; - b %= 8; - - old_value = (bitmap[c][o] & (1 << b)); - gfs2_assert_withdraw(sdp, !old_value != !new_value); - - if (new_value) - bitmap[c][o] |= 1 << b; - else - bitmap[c][o] &= ~(1 << b); + spin_unlock(&sdp->sd_bitmap_lock); } static void slot_put(struct gfs2_quota_data *qd) { - struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd; + struct gfs2_sbd *sdp = qd->qd_sbd; - spin_lock(&qd_lock); + spin_lock(&sdp->sd_bitmap_lock); gfs2_assert(sdp, qd->qd_slot_count); if (!--qd->qd_slot_count) { - gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0); + BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap)); qd->qd_slot = -1; } - spin_unlock(&qd_lock); + spin_unlock(&sdp->sd_bitmap_lock); } static int bh_get(struct gfs2_quota_data *qd) @@ -427,8 +436,7 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd, list_move_tail(&qd->qd_list, &sdp->sd_quota_list); set_bit(QDF_LOCKED, &qd->qd_flags); qd->qd_change_sync = qd->qd_change; - gfs2_assert_warn(sdp, qd->qd_slot_count); - qd->qd_slot_count++; + slot_hold(qd); return 1; } @@ -1214,17 +1222,6 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid) return error; } -static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf) -{ - const struct gfs2_quota_change *str = buf; - - qc->qc_change = be64_to_cpu(str->qc_change); - qc->qc_flags = be32_to_cpu(str->qc_flags); - qc->qc_id = make_kqid(&init_user_ns, - (qc->qc_flags & GFS2_QCF_USER)?USRQUOTA:GRPQUOTA, - be32_to_cpu(str->qc_id)); -} - int gfs2_quota_init(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); @@ -1232,6 +1229,8 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift; unsigned int x, slot = 0; unsigned int found = 0; + unsigned int hash; + unsigned int bm_size; u64 dblock; u32 extlen = 0; int error; @@ -1240,23 +1239,20 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) return -EIO; sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; - sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); - + bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long)); + bm_size *= sizeof(unsigned long); error = -ENOMEM; - - sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks, - sizeof(unsigned char *), GFP_NOFS); + sdp->sd_quota_bitmap = kmalloc(bm_size, GFP_NOFS|__GFP_NOWARN); + if (sdp->sd_quota_bitmap == NULL) + sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS, PAGE_KERNEL); if (!sdp->sd_quota_bitmap) return error; - for (x = 0; x < sdp->sd_quota_chunks; x++) { - sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_NOFS); - if (!sdp->sd_quota_bitmap[x]) - goto fail; - } + memset(sdp->sd_quota_bitmap, 0, bm_size); for (x = 0; x < blocks; x++) { struct buffer_head *bh; + const struct gfs2_quota_change *qc; unsigned int y; if (!extlen) { @@ -1274,34 +1270,42 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) goto fail; } + qc = (const struct gfs2_quota_change *)(bh->b_data + sizeof(struct gfs2_meta_header)); for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots; y++, slot++) { - struct gfs2_quota_change_host qc; struct gfs2_quota_data *qd; - - gfs2_quota_change_in(&qc, bh->b_data + - sizeof(struct gfs2_meta_header) + - y * sizeof(struct gfs2_quota_change)); - if (!qc.qc_change) + s64 qc_change = be64_to_cpu(qc->qc_change); + u32 qc_flags = be32_to_cpu(qc->qc_flags); + enum quota_type qtype = (qc_flags & GFS2_QCF_USER) ? + USRQUOTA : GRPQUOTA; + struct kqid qc_id = make_kqid(&init_user_ns, qtype, + be32_to_cpu(qc->qc_id)); + qc++; + if (!qc_change) continue; - error = qd_alloc(sdp, qc.qc_id, &qd); - if (error) { + hash = gfs2_qd_hash(sdp, qc_id); + qd = qd_alloc(hash, sdp, qc_id); + if (qd == NULL) { brelse(bh); goto fail; } set_bit(QDF_CHANGE, &qd->qd_flags); - qd->qd_change = qc.qc_change; + qd->qd_change = qc_change; qd->qd_slot = slot; qd->qd_slot_count = 1; spin_lock(&qd_lock); - gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1); + BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap)); list_add(&qd->qd_list, &sdp->sd_quota_list); atomic_inc(&sdp->sd_quota_count); spin_unlock(&qd_lock); + spin_lock_bucket(hash); + hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]); + spin_unlock_bucket(hash); + found++; } @@ -1324,44 +1328,28 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) { struct list_head *head = &sdp->sd_quota_list; struct gfs2_quota_data *qd; - unsigned int x; spin_lock(&qd_lock); while (!list_empty(head)) { qd = list_entry(head->prev, struct gfs2_quota_data, qd_list); - /* - * To be removed in due course... we should be able to - * ensure that all refs to the qd have done by this point - * so that this rather odd test is not required - */ - spin_lock(&qd->qd_lockref.lock); - if (qd->qd_lockref.count > 1 || - (qd->qd_lockref.count && !test_bit(QDF_CHANGE, &qd->qd_flags))) { - spin_unlock(&qd->qd_lockref.lock); - list_move(&qd->qd_list, head); - spin_unlock(&qd_lock); - schedule(); - spin_lock(&qd_lock); - continue; - } - spin_unlock(&qd->qd_lockref.lock); - list_del(&qd->qd_list); + /* Also remove if this qd exists in the reclaim list */ list_lru_del(&gfs2_qd_lru, &qd->qd_lru); atomic_dec(&sdp->sd_quota_count); spin_unlock(&qd_lock); - if (!qd->qd_lockref.count) { - gfs2_assert_warn(sdp, !qd->qd_change); - gfs2_assert_warn(sdp, !qd->qd_slot_count); - } else - gfs2_assert_warn(sdp, qd->qd_slot_count == 1); + spin_lock_bucket(qd->qd_hash); + hlist_bl_del_rcu(&qd->qd_hlist); + spin_unlock_bucket(qd->qd_hash); + + gfs2_assert_warn(sdp, !qd->qd_change); + gfs2_assert_warn(sdp, !qd->qd_slot_count); gfs2_assert_warn(sdp, !qd->qd_bh_count); gfs2_glock_put(qd->qd_gl); - kmem_cache_free(gfs2_quotad_cachep, qd); + call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); spin_lock(&qd_lock); } @@ -1370,9 +1358,11 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); if (sdp->sd_quota_bitmap) { - for (x = 0; x < sdp->sd_quota_chunks; x++) - kfree(sdp->sd_quota_bitmap[x]); - kfree(sdp->sd_quota_bitmap); + if (is_vmalloc_addr(sdp->sd_quota_bitmap)) + vfree(sdp->sd_quota_bitmap); + else + kfree(sdp->sd_quota_bitmap); + sdp->sd_quota_bitmap = NULL; } } @@ -1656,3 +1646,11 @@ const struct quotactl_ops gfs2_quotactl_ops = { .get_dqblk = gfs2_get_dqblk, .set_dqblk = gfs2_set_dqblk, }; + +void __init gfs2_quota_hash_init(void) +{ + unsigned i; + + for(i = 0; i < GFS2_QD_HASH_SIZE; i++) + INIT_HLIST_BL_HEAD(&qd_hash_table[i]); +} diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 96e4f34a03b0..55d506eb3c4a 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -57,5 +57,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) extern const struct quotactl_ops gfs2_quotactl_ops; extern struct shrinker gfs2_qd_shrinker; extern struct list_lru gfs2_qd_lru; +extern void __init gfs2_quota_hash_init(void); #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index c8d6161bd682..a1da21349235 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -57,6 +57,11 @@ * 3 = Used (metadata) */ +struct gfs2_extent { + struct gfs2_rbm rbm; + u32 len; +}; + static const char valid_change[16] = { /* current */ /* n */ 0, 1, 1, 1, @@ -65,8 +70,9 @@ static const char valid_change[16] = { 1, 0, 0, 0 }; -static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, - const struct gfs2_inode *ip, bool nowrap); +static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, + const struct gfs2_inode *ip, bool nowrap, + const struct gfs2_alloc_parms *ap); /** @@ -635,9 +641,13 @@ static void __rs_deltree(struct gfs2_blkreserv *rs) /* return reserved blocks to the rgrp */ BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free); rs->rs_rbm.rgd->rd_reserved -= rs->rs_free; + /* The rgrp extent failure point is likely not to increase; + it will only do so if the freed blocks are somehow + contiguous with a span of free blocks that follows. Still, + it will force the number to be recalculated later. */ + rgd->rd_extfail_pt += rs->rs_free; rs->rs_free = 0; clear_bit(GBF_FULL, &bi->bi_flags); - smp_mb__after_clear_bit(); } } @@ -876,6 +886,7 @@ static int rgd_insert(struct gfs2_rgrpd *rgd) static int read_rindex_entry(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + const unsigned bsize = sdp->sd_sb.sb_bsize; loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); struct gfs2_rindex buf; int error; @@ -913,6 +924,8 @@ static int read_rindex_entry(struct gfs2_inode *ip) goto fail; rgd->rd_gl->gl_object = rgd; + rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize; + rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1; rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; rgd->rd_flags &= ~GFS2_RDF_UPTODATE; if (rgd->rd_data > sdp->sd_max_rg_data) @@ -1126,6 +1139,8 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); rgd->rd_free_clone = rgd->rd_free; + /* max out the rgrp allocation failure point */ + rgd->rd_extfail_pt = rgd->rd_free; } if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd)); @@ -1184,7 +1199,7 @@ int gfs2_rgrp_go_lock(struct gfs2_holder *gh) if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb) return 0; - return gfs2_rgrp_bh_get((struct gfs2_rgrpd *)gh->gh_gl->gl_object); + return gfs2_rgrp_bh_get(rgd); } /** @@ -1455,7 +1470,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, if (WARN_ON(gfs2_rbm_from_block(&rbm, goal))) return; - ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, extlen, ip, true); + ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true, ap); if (ret == 0) { rs->rs_rbm = rbm; rs->rs_free = extlen; @@ -1520,6 +1535,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, * @rbm: The current position in the resource group * @ip: The inode for which we are searching for blocks * @minext: The minimum extent length + * @maxext: A pointer to the maximum extent structure * * This checks the current position in the rgrp to see whether there is * a reservation covering this block. If not then this function is a @@ -1532,7 +1548,8 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, const struct gfs2_inode *ip, - u32 minext) + u32 minext, + struct gfs2_extent *maxext) { u64 block = gfs2_rbm_to_block(rbm); u32 extlen = 1; @@ -1545,8 +1562,7 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, */ if (minext) { extlen = gfs2_free_extlen(rbm, minext); - nblock = block + extlen; - if (extlen < minext) + if (extlen <= maxext->len) goto fail; } @@ -1555,9 +1571,17 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, * and skip if parts of it are already reserved */ nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip); - if (nblock == block) - return 0; + if (nblock == block) { + if (!minext || extlen >= minext) + return 0; + + if (extlen > maxext->len) { + maxext->len = extlen; + maxext->rbm = *rbm; + } fail: + nblock = block + extlen; + } ret = gfs2_rbm_from_block(rbm, nblock); if (ret < 0) return ret; @@ -1568,30 +1592,38 @@ fail: * gfs2_rbm_find - Look for blocks of a particular state * @rbm: Value/result starting position and final position * @state: The state which we want to find - * @minext: The requested extent length (0 for a single block) + * @minext: Pointer to the requested extent length (NULL for a single block) + * This is updated to be the actual reservation size. * @ip: If set, check for reservations * @nowrap: Stop looking at the end of the rgrp, rather than wrapping * around until we've reached the starting point. + * @ap: the allocation parameters * * Side effects: * - If looking for free blocks, we set GBF_FULL on each bitmap which * has no free blocks in it. + * - If looking for free blocks, we set rd_extfail_pt on each rgrp which + * has come up short on a free block search. * * Returns: 0 on success, -ENOSPC if there is no block of the requested state */ -static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, - const struct gfs2_inode *ip, bool nowrap) +static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, + const struct gfs2_inode *ip, bool nowrap, + const struct gfs2_alloc_parms *ap) { struct buffer_head *bh; int initial_bii; u32 initial_offset; + int first_bii = rbm->bii; + u32 first_offset = rbm->offset; u32 offset; u8 *buffer; int n = 0; int iters = rbm->rgd->rd_length; int ret; struct gfs2_bitmap *bi; + struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, }; /* If we are not starting at the beginning of a bitmap, then we * need to add one to the bitmap count to ensure that we search @@ -1620,7 +1652,9 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext, return 0; initial_bii = rbm->bii; - ret = gfs2_reservation_check_and_update(rbm, ip, minext); + ret = gfs2_reservation_check_and_update(rbm, ip, + minext ? *minext : 0, + &maxext); if (ret == 0) return 0; if (ret > 0) { @@ -1655,6 +1689,24 @@ next_iter: break; } + if (minext == NULL || state != GFS2_BLKST_FREE) + return -ENOSPC; + + /* If the extent was too small, and it's smaller than the smallest + to have failed before, remember for future reference that it's + useless to search this rgrp again for this amount or more. */ + if ((first_offset == 0) && (first_bii == 0) && + (*minext < rbm->rgd->rd_extfail_pt)) + rbm->rgd->rd_extfail_pt = *minext; + + /* If the maximum extent we found is big enough to fulfill the + minimum requirements, use it anyway. */ + if (maxext.len) { + *rbm = maxext.rbm; + *minext = maxext.len; + return 0; + } + return -ENOSPC; } @@ -1680,7 +1732,8 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip while (1) { down_write(&sdp->sd_log_flush_lock); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, 0, NULL, true); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL, + true, NULL); up_write(&sdp->sd_log_flush_lock); if (error == -ENOSPC) break; @@ -1891,7 +1944,9 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a } /* Skip unuseable resource groups */ - if (rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) + if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | + GFS2_RDF_ERROR)) || + (ap->target > rs->rs_rbm.rgd->rd_extfail_pt)) goto skip_rgrp; if (sdp->sd_args.ar_rgrplvb) @@ -1911,15 +1966,16 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a return 0; } - /* Drop reservation, if we couldn't use reserved rgrp */ - if (gfs2_rs_active(rs)) - gfs2_rs_deltree(rs); check_rgrp: /* Check for unlinked inodes which can be reclaimed */ if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked, ip->i_no_addr); skip_rgrp: + /* Drop reservation, if we couldn't use reserved rgrp */ + if (gfs2_rs_active(rs)) + gfs2_rs_deltree(rs); + /* Unlock rgrp if required */ if (!rg_locked) gfs2_glock_dq_uninit(&rs->rs_rgd_gh); @@ -2064,25 +2120,24 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, * */ -int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) +void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) { struct gfs2_rgrpd *rgd = gl->gl_object; struct gfs2_blkreserv *trs; const struct rb_node *n; if (rgd == NULL) - return 0; - gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n", + return; + gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n", (unsigned long long)rgd->rd_addr, rgd->rd_flags, rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes, - rgd->rd_reserved); + rgd->rd_reserved, rgd->rd_extfail_pt); spin_lock(&rgd->rd_rsspin); for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) { trs = rb_entry(n, struct gfs2_blkreserv, rs_node); dump_rs(seq, trs); } spin_unlock(&rgd->rd_rsspin); - return 0; } static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) @@ -2184,18 +2239,20 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, int error; gfs2_set_alloc_start(&rbm, ip, dinode); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, ip, false); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false, NULL); if (error == -ENOSPC) { gfs2_set_alloc_start(&rbm, ip, dinode); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, NULL, false); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false, + NULL); } /* Since all blocks are reserved in advance, this shouldn't happen */ if (error) { - fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d\n", + fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n", (unsigned long long)ip->i_no_addr, error, *nblocks, - test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags)); + test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags), + rbm.rgd->rd_extfail_pt); goto rgrp_error; } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 3a10d2ffbbe7..463ab2e95d1c 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -68,7 +68,7 @@ extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); -extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); +extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, struct buffer_head *bh, const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 35da5b19c0de..60f60f6181f3 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -369,6 +369,33 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) return 0; } +static int init_threads(struct gfs2_sbd *sdp) +{ + struct task_struct *p; + int error = 0; + + p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); + if (IS_ERR(p)) { + error = PTR_ERR(p); + fs_err(sdp, "can't start logd thread: %d\n", error); + return error; + } + sdp->sd_logd_process = p; + + p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); + if (IS_ERR(p)) { + error = PTR_ERR(p); + fs_err(sdp, "can't start quotad thread: %d\n", error); + goto fail; + } + sdp->sd_quotad_process = p; + return 0; + +fail: + kthread_stop(sdp->sd_logd_process); + return error; +} + /** * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one * @sdp: the filesystem @@ -384,10 +411,14 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) struct gfs2_log_header_host head; int error; - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh); + error = init_threads(sdp); if (error) return error; + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh); + if (error) + goto fail_threads; + j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); error = gfs2_find_jhead(sdp->sd_jdesc, &head); @@ -417,7 +448,9 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) fail: t_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&t_gh); - +fail_threads: + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); return error; } @@ -800,6 +833,9 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) struct gfs2_holder t_gh; int error; + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); + flush_workqueue(gfs2_delete_workqueue); gfs2_quota_sync(sdp->sd_vfs, 0); gfs2_statfs_sync(sdp->sd_vfs, 0); @@ -857,9 +893,6 @@ restart: } spin_unlock(&sdp->sd_jindex_spin); - kthread_stop(sdp->sd_quotad_process); - kthread_stop(sdp->sd_logd_process); - if (!(sb->s_flags & MS_RDONLY)) { error = gfs2_make_fs_ro(sdp); if (error) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 39c1d9469677..5c097596104b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -21,6 +21,7 @@ #include <linux/xattr.h> #include <linux/fs.h> #include <linux/percpu-refcount.h> +#include <linux/seq_file.h> #ifdef CONFIG_CGROUPS @@ -28,8 +29,6 @@ struct cgroupfs_root; struct cgroup_subsys; struct inode; struct cgroup; -struct css_id; -struct eventfd_ctx; extern int cgroup_init_early(void); extern int cgroup_init(void); @@ -79,8 +78,6 @@ struct cgroup_subsys_state { struct cgroup_subsys_state *parent; unsigned long flags; - /* ID for this css, if possible */ - struct css_id __rcu *id; /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; @@ -239,10 +236,6 @@ struct cgroup { struct rcu_head rcu_head; struct work_struct destroy_work; - /* List of events which userspace want to receive */ - struct list_head event_list; - spinlock_t event_list_lock; - /* directory xattrs */ struct simple_xattrs xattrs; }; @@ -280,6 +273,9 @@ enum { * - "tasks" is removed. Everything should be at process * granularity. Use "cgroup.procs" instead. * + * - "cgroup.procs" is not sorted. pids will be unique unless they + * got recycled inbetween reads. + * * - "release_agent" and "notify_on_release" are removed. * Replacement notification mechanism will be implemented. * @@ -320,9 +316,6 @@ struct cgroupfs_root { /* Unique id for this hierarchy. */ int hierarchy_id; - /* A list running through the attached subsystems */ - struct list_head subsys_list; - /* The root cgroup for this hierarchy */ struct cgroup top_cgroup; @@ -389,16 +382,6 @@ struct css_set { }; /* - * cgroup_map_cb is an abstract callback API for reporting map-valued - * control files - */ - -struct cgroup_map_cb { - int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value); - void *state; -}; - -/* * struct cftype: handler definitions for cgroup control files * * When reading/writing to a file: @@ -445,10 +428,6 @@ struct cftype { */ struct cgroup_subsys *ss; - int (*open)(struct inode *inode, struct file *file); - ssize_t (*read)(struct cgroup_subsys_state *css, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, loff_t *ppos); /* * read_u64() is a shortcut for the common case of returning a * single integer. Use it in place of read() @@ -458,24 +437,14 @@ struct cftype { * read_s64() is a signed version of read_u64() */ s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); - /* - * read_map() is used for defining a map of key/value - * pairs. It should call cb->fill(cb, key, value) for each - * entry. The key/value pairs (and their ordering) should not - * change between reboots. - */ - int (*read_map)(struct cgroup_subsys_state *css, struct cftype *cft, - struct cgroup_map_cb *cb); - /* - * read_seq_string() is used for outputting a simple sequence - * using seqfile. - */ - int (*read_seq_string)(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *m); - ssize_t (*write)(struct cgroup_subsys_state *css, struct cftype *cft, - struct file *file, - const char __user *buf, size_t nbytes, loff_t *ppos); + /* generic seq_file read interface */ + int (*seq_show)(struct seq_file *sf, void *v); + + /* optional ops, implement all or none */ + void *(*seq_start)(struct seq_file *sf, loff_t *ppos); + void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); + void (*seq_stop)(struct seq_file *sf, void *v); /* * write_u64() is a shortcut for the common case of accepting @@ -504,27 +473,6 @@ struct cftype { * kick type for multiplexing. */ int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); - - int (*release)(struct inode *inode, struct file *file); - - /* - * register_event() callback will be used to add new userspace - * waiter for changes related to the cftype. Implement it if - * you want to provide this functionality. Use eventfd_signal() - * on eventfd to send notification to userspace. - */ - int (*register_event)(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, - const char *args); - /* - * unregister_event() callback will be called when userspace - * closes the eventfd or on cgroup removing. - * This callback must be implemented, if you want provide - * notification functionality. - */ - void (*unregister_event)(struct cgroup_subsys_state *css, - struct cftype *cft, - struct eventfd_ctx *eventfd); }; /* @@ -538,6 +486,26 @@ struct cftype_set { }; /* + * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. Don't + * access directly. + */ +struct cfent { + struct list_head node; + struct dentry *dentry; + struct cftype *type; + struct cgroup_subsys_state *css; + + /* file xattrs */ + struct simple_xattrs xattrs; +}; + +/* seq_file->private points to the following, only ->priv is public */ +struct cgroup_open_file { + struct cfent *cfe; + void *priv; +}; + +/* * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This * function can be called as long as @cgrp is accessible. */ @@ -552,6 +520,18 @@ static inline const char *cgroup_name(const struct cgroup *cgrp) return rcu_dereference(cgrp->name)->name; } +static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) +{ + struct cgroup_open_file *of = seq->private; + return of->cfe->css; +} + +static inline struct cftype *seq_cft(struct seq_file *seq) +{ + struct cgroup_open_file *of = seq->private; + return of->cfe->type; +} + int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); @@ -631,12 +611,8 @@ struct cgroup_subsys { #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; - /* - * Link to parent, and list entry in parent's children. - * Protected by cgroup_lock() - */ + /* link to parent, protected by cgroup_lock() */ struct cgroupfs_root *root; - struct list_head sibling; /* list of cftype_sets */ struct list_head cftsets; diff --git a/include/linux/libata.h b/include/linux/libata.h index 9b503376738f..bec6dbe939a0 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -419,6 +419,8 @@ enum { ATA_HORKAGE_MAX_SEC_LBA48 = (1 << 17), /* Set max sects to 65535 */ ATA_HORKAGE_ATAPI_DMADIR = (1 << 18), /* device requires dmadir */ ATA_HORKAGE_NO_NCQ_TRIM = (1 << 19), /* don't use queued TRIM */ + ATA_HORKAGE_NOLPM = (1 << 20), /* don't use LPM */ + ATA_HORKAGE_WD_BROKEN_LPM = (1 << 21), /* some WDs have broken LPM */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 3f3788d49362..3e4535876d37 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -7,6 +7,7 @@ #include <linux/gfp.h> #include <linux/types.h> #include <linux/cgroup.h> +#include <linux/eventfd.h> struct vmpressure { unsigned long scanned; @@ -33,13 +34,10 @@ extern void vmpressure_init(struct vmpressure *vmpr); extern void vmpressure_cleanup(struct vmpressure *vmpr); extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); -extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); -extern int vmpressure_register_event(struct cgroup_subsys_state *css, - struct cftype *cft, +extern int vmpressure_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args); -extern void vmpressure_unregister_event(struct cgroup_subsys_state *css, - struct cftype *cft, +extern void vmpressure_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd); #else static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h index b2de1f9a88d6..0f24c07aed51 100644 --- a/include/uapi/linux/gfs2_ondisk.h +++ b/include/uapi/linux/gfs2_ondisk.h @@ -319,7 +319,16 @@ struct gfs2_leaf { __be32 lf_dirent_format; /* Format of the dirents */ __be64 lf_next; /* Next leaf, if overflow */ - __u8 lf_reserved[64]; + union { + __u8 lf_reserved[64]; + struct { + __be64 lf_inode; /* Dir inode number */ + __be32 lf_dist; /* Dist from inode on chain */ + __be32 lf_nsec; /* Last ins/del usecs */ + __be64 lf_sec; /* Last ins/del in secs */ + __u8 lf_reserved2[40]; + }; + }; }; /* diff --git a/init/Kconfig b/init/Kconfig index 5236dc562a36..8d402e33b7fc 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -854,7 +854,6 @@ config NUMA_BALANCING menuconfig CGROUPS boolean "Control Group support" - depends on EVENTFD help This option adds support for grouping sets of processes together, for use with process control subsystems such as Cpusets, CFS, memory @@ -921,6 +920,7 @@ config MEMCG bool "Memory Resource Controller for Control Groups" depends on RESOURCE_COUNTERS select MM_OWNER + select EVENTFD help Provides a memory resource controller that manages both anonymous memory and page cache. (See Documentation/cgroups/memory.txt) @@ -1160,7 +1160,6 @@ config UIDGID_STRICT_TYPE_CHECKS config SCHED_AUTOGROUP bool "Automatic process group scheduling" - select EVENTFD select CGROUPS select CGROUP_SCHED select FAIR_GROUP_SCHED diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bc1dcabe9217..e2f46ba37f72 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -41,7 +41,6 @@ #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/backing-dev.h> -#include <linux/seq_file.h> #include <linux/slab.h> #include <linux/magic.h> #include <linux/spinlock.h> @@ -56,15 +55,20 @@ #include <linux/pid_namespace.h> #include <linux/idr.h> #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ -#include <linux/eventfd.h> -#include <linux/poll.h> #include <linux/flex_array.h> /* used in cgroup_attach_task */ #include <linux/kthread.h> -#include <linux/file.h> #include <linux/atomic.h> /* + * pidlists linger the following amount before being destroyed. The goal + * is avoiding frequent destruction in the middle of consecutive read calls + * Expiring in the middle is a performance problem not a correctness one. + * 1 sec should be enough. + */ +#define CGROUP_PIDLIST_DESTROY_DELAY HZ + +/* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. * @@ -89,6 +93,19 @@ static DEFINE_MUTEX(cgroup_mutex); static DEFINE_MUTEX(cgroup_root_mutex); +#define cgroup_assert_mutex_or_rcu_locked() \ + rcu_lockdep_assert(rcu_read_lock_held() || \ + lockdep_is_held(&cgroup_mutex), \ + "cgroup_mutex or RCU read lock required"); + +#ifdef CONFIG_LOCKDEP +#define cgroup_assert_mutex_or_root_locked() \ + WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \ + !lockdep_is_held(&cgroup_root_mutex))) +#else +#define cgroup_assert_mutex_or_root_locked() do { } while (0) +#endif + /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup @@ -98,6 +115,12 @@ static DEFINE_MUTEX(cgroup_root_mutex); static struct workqueue_struct *cgroup_destroy_wq; /* + * pidlist destructions need to be flushed on cgroup destruction. Use a + * separate workqueue as flush domain. + */ +static struct workqueue_struct *cgroup_pidlist_destroy_wq; + +/* * Generate an array of cgroup subsystem pointers. At boot time, this is * populated with the built in subsystems, and modular subsystems are * registered after that. The mutable section of this array is protected by @@ -119,49 +142,6 @@ static struct cgroupfs_root cgroup_dummy_root; /* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; -/* - * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. - */ -struct cfent { - struct list_head node; - struct dentry *dentry; - struct cftype *type; - struct cgroup_subsys_state *css; - - /* file xattrs */ - struct simple_xattrs xattrs; -}; - -/* - * cgroup_event represents events which userspace want to receive. - */ -struct cgroup_event { - /* - * css which the event belongs to. - */ - struct cgroup_subsys_state *css; - /* - * Control file which the event associated. - */ - struct cftype *cft; - /* - * eventfd to signal userspace about the event. - */ - struct eventfd_ctx *eventfd; - /* - * Each of these stored in a list by the cgroup. - */ - struct list_head list; - /* - * All fields below needed to unregister event when - * userspace closes eventfd. - */ - poll_table pt; - wait_queue_head_t *wqh; - wait_queue_t wait; - struct work_struct remove; -}; - /* The list of hierarchy roots */ static LIST_HEAD(cgroup_roots); @@ -200,6 +180,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); static int cgroup_file_release(struct inode *inode, struct file *file); +static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); /** * cgroup_css - obtain a cgroup's css for the specified subsystem @@ -262,16 +243,32 @@ static int notify_on_release(const struct cgroup *cgrp) } /** + * for_each_css - iterate all css's of a cgroup + * @css: the iteration cursor + * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end + * @cgrp: the target cgroup to iterate css's of + * + * Should be called under cgroup_mutex. + */ +#define for_each_css(css, ssid, cgrp) \ + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((css) = rcu_dereference_check( \ + (cgrp)->subsys[(ssid)], \ + lockdep_is_held(&cgroup_mutex)))) { } \ + else + +/** * for_each_subsys - iterate all loaded cgroup subsystems * @ss: the iteration cursor - * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end + * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end * - * Should be called under cgroup_mutex. + * Iterates through all loaded subsystems. Should be called under + * cgroup_mutex or cgroup_root_mutex. */ -#define for_each_subsys(ss, i) \ - for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++) \ - if (({ lockdep_assert_held(&cgroup_mutex); \ - !((ss) = cgroup_subsys[i]); })) { } \ +#define for_each_subsys(ss, ssid) \ + for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \ + (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ + if (!((ss) = cgroup_subsys[(ssid)])) { } \ else /** @@ -286,10 +283,6 @@ static int notify_on_release(const struct cgroup *cgrp) for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ (((ss) = cgroup_subsys[i]) || true); (i)++) -/* iterate each subsystem attached to a hierarchy */ -#define for_each_root_subsys(root, ss) \ - list_for_each_entry((ss), &(root)->subsys_list, sibling) - /* iterate across the active hierarchies */ #define for_each_active_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) @@ -863,11 +856,7 @@ static void cgroup_free_fn(struct work_struct *work) */ deactivate_super(cgrp->root->sb); - /* - * if we're getting rid of the cgroup, refcount should ensure - * that there are no pidlists left. - */ - BUG_ON(!list_empty(&cgrp->pidlists)); + cgroup_pidlist_destroy_all(cgrp); simple_xattrs_free(&cgrp->xattrs); @@ -1050,7 +1039,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, cgroup_css(cgroup_dummy_top, ss)); cgroup_css(cgrp, ss)->cgroup = cgrp; - list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) ss->bind(cgroup_css(cgrp, ss)); @@ -1069,7 +1057,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, RCU_INIT_POINTER(cgrp->subsys[i], NULL); cgroup_subsys[i]->root = &cgroup_dummy_root; - list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); /* subsystem is now free - drop reference on module */ module_put(ss->module); @@ -1096,10 +1083,12 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) { struct cgroupfs_root *root = dentry->d_sb->s_fs_info; struct cgroup_subsys *ss; + int ssid; mutex_lock(&cgroup_root_mutex); - for_each_root_subsys(root, ss) - seq_printf(seq, ",%s", ss->name); + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_printf(seq, ",%s", ss->name); if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) seq_puts(seq, ",sane_behavior"); if (root->flags & CGRP_ROOT_NOPREFIX) @@ -1362,8 +1351,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->dummy_css.cgroup = cgrp; - INIT_LIST_HEAD(&cgrp->event_list); - spin_lock_init(&cgrp->event_list_lock); simple_xattrs_init(&cgrp->xattrs); } @@ -1371,7 +1358,6 @@ static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; - INIT_LIST_HEAD(&root->subsys_list); INIT_LIST_HEAD(&root->root_list); root->number_of_cgroups = 1; cgrp->root = root; @@ -1693,7 +1679,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, return ERR_PTR(ret); } -static void cgroup_kill_sb(struct super_block *sb) { +static void cgroup_kill_sb(struct super_block *sb) +{ struct cgroupfs_root *root = sb->s_fs_info; struct cgroup *cgrp = &root->top_cgroup; struct cgrp_cset_link *link, *tmp_link; @@ -1976,8 +1963,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, bool threadgroup) { int retval, i, group_size; - struct cgroup_subsys *ss, *failed_ss = NULL; struct cgroupfs_root *root = cgrp->root; + struct cgroup_subsys_state *css, *failed_css = NULL; /* threadgroup list cursor and array */ struct task_struct *leader = tsk; struct task_and_cgroup *tc; @@ -2050,13 +2037,11 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* * step 1: check that we can legitimately attach to the cgroup. */ - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - - if (ss->can_attach) { - retval = ss->can_attach(css, &tset); + for_each_css(css, i, cgrp) { + if (css->ss->can_attach) { + retval = css->ss->can_attach(css, &tset); if (retval) { - failed_ss = ss; + failed_css = css; goto out_cancel_attach; } } @@ -2092,12 +2077,9 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* * step 4: do subsystem attach callbacks. */ - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - - if (ss->attach) - ss->attach(css, &tset); - } + for_each_css(css, i, cgrp) + if (css->ss->attach) + css->ss->attach(css, &tset); /* * step 5: success! and cleanup @@ -2114,13 +2096,11 @@ out_put_css_set_refs: } out_cancel_attach: if (retval) { - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - - if (ss == failed_ss) + for_each_css(css, i, cgrp) { + if (css == failed_css) break; - if (ss->cancel_attach) - ss->cancel_attach(css, &tset); + if (css->ss->cancel_attach) + css->ss->cancel_attach(css, &tset); } } out_free_group_list: @@ -2148,7 +2128,7 @@ retry_find_task: tsk = find_task_by_vpid(pid); if (!tsk) { rcu_read_unlock(); - ret= -ESRCH; + ret = -ESRCH; goto out_unlock_cgroup; } /* @@ -2260,10 +2240,9 @@ static int cgroup_release_agent_write(struct cgroup_subsys_state *css, return 0; } -static int cgroup_release_agent_show(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *seq) +static int cgroup_release_agent_show(struct seq_file *seq, void *v) { - struct cgroup *cgrp = css->cgroup; + struct cgroup *cgrp = seq_css(seq)->cgroup; if (!cgroup_lock_live_group(cgrp)) return -ENODEV; @@ -2273,174 +2252,129 @@ static int cgroup_release_agent_show(struct cgroup_subsys_state *css, return 0; } -static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *seq) +static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) { - seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); + struct cgroup *cgrp = seq_css(seq)->cgroup; + + seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); return 0; } /* A buffer size big enough for numbers or short strings */ #define CGROUP_LOCAL_BUFFER_SIZE 64 -static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - const char __user *userbuf, size_t nbytes, - loff_t *unused_ppos) +static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, + size_t nbytes, loff_t *ppos) { - char buffer[CGROUP_LOCAL_BUFFER_SIZE]; - int retval = 0; - char *end; + struct cfent *cfe = __d_cfe(file->f_dentry); + struct cftype *cft = __d_cft(file->f_dentry); + struct cgroup_subsys_state *css = cfe->css; + size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1; + char *buf; + int ret; - if (!nbytes) - return -EINVAL; - if (nbytes >= sizeof(buffer)) + if (nbytes >= max_bytes) return -E2BIG; - if (copy_from_user(buffer, userbuf, nbytes)) - return -EFAULT; - buffer[nbytes] = 0; /* nul-terminate */ - if (cft->write_u64) { - u64 val = simple_strtoull(strstrip(buffer), &end, 0); - if (*end) - return -EINVAL; - retval = cft->write_u64(css, cft, val); + buf = kmalloc(nbytes + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, userbuf, nbytes)) { + ret = -EFAULT; + goto out_free; + } + + buf[nbytes] = '\0'; + + if (cft->write_string) { + ret = cft->write_string(css, cft, strstrip(buf)); + } else if (cft->write_u64) { + unsigned long long v; + ret = kstrtoull(buf, 0, &v); + if (!ret) + ret = cft->write_u64(css, cft, v); + } else if (cft->write_s64) { + long long v; + ret = kstrtoll(buf, 0, &v); + if (!ret) + ret = cft->write_s64(css, cft, v); + } else if (cft->trigger) { + ret = cft->trigger(css, (unsigned int)cft->private); } else { - s64 val = simple_strtoll(strstrip(buffer), &end, 0); - if (*end) - return -EINVAL; - retval = cft->write_s64(css, cft, val); + ret = -EINVAL; } - if (!retval) - retval = nbytes; - return retval; +out_free: + kfree(buf); + return ret ?: nbytes; } -static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - const char __user *userbuf, size_t nbytes, - loff_t *unused_ppos) +/* + * seqfile ops/methods for returning structured data. Currently just + * supports string->u64 maps, but can be extended in future. + */ + +static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { - char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; - int retval = 0; - size_t max_bytes = cft->max_write_len; - char *buffer = local_buffer; + struct cftype *cft = seq_cft(seq); - if (!max_bytes) - max_bytes = sizeof(local_buffer) - 1; - if (nbytes >= max_bytes) - return -E2BIG; - /* Allocate a dynamic buffer if we need one */ - if (nbytes >= sizeof(local_buffer)) { - buffer = kmalloc(nbytes + 1, GFP_KERNEL); - if (buffer == NULL) - return -ENOMEM; - } - if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { - retval = -EFAULT; - goto out; + if (cft->seq_start) { + return cft->seq_start(seq, ppos); + } else { + /* + * The same behavior and code as single_open(). Returns + * !NULL if pos is at the beginning; otherwise, NULL. + */ + return NULL + !*ppos; } - - buffer[nbytes] = 0; /* nul-terminate */ - retval = cft->write_string(css, cft, strstrip(buffer)); - if (!retval) - retval = nbytes; -out: - if (buffer != local_buffer) - kfree(buffer); - return retval; } -static ssize_t cgroup_file_write(struct file *file, const char __user *buf, - size_t nbytes, loff_t *ppos) +static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) { - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; + struct cftype *cft = seq_cft(seq); - if (cft->write) - return cft->write(css, cft, file, buf, nbytes, ppos); - if (cft->write_u64 || cft->write_s64) - return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); - if (cft->write_string) - return cgroup_write_string(css, cft, file, buf, nbytes, ppos); - if (cft->trigger) { - int ret = cft->trigger(css, (unsigned int)cft->private); - return ret ? ret : nbytes; + if (cft->seq_next) { + return cft->seq_next(seq, v, ppos); + } else { + /* + * The same behavior and code as single_open(), always + * terminate after the initial read. + */ + ++*ppos; + return NULL; } - return -EINVAL; } -static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, loff_t *ppos) +static void cgroup_seqfile_stop(struct seq_file *seq, void *v) { - char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - u64 val = cft->read_u64(css, cft); - int len = sprintf(tmp, "%llu\n", (unsigned long long) val); + struct cftype *cft = seq_cft(seq); - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); + if (cft->seq_stop) + cft->seq_stop(seq, v); } -static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, loff_t *ppos) +static int cgroup_seqfile_show(struct seq_file *m, void *arg) { - char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - s64 val = cft->read_s64(css, cft); - int len = sprintf(tmp, "%lld\n", (long long) val); + struct cftype *cft = seq_cft(m); + struct cgroup_subsys_state *css = seq_css(m); - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); -} + if (cft->seq_show) + return cft->seq_show(m, arg); -static ssize_t cgroup_file_read(struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) -{ - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; - - if (cft->read) - return cft->read(css, cft, file, buf, nbytes, ppos); if (cft->read_u64) - return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); - if (cft->read_s64) - return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); - return -EINVAL; -} - -/* - * seqfile ops/methods for returning structured data. Currently just - * supports string->u64 maps, but can be extended in future. - */ - -static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) -{ - struct seq_file *sf = cb->state; - return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); -} - -static int cgroup_seqfile_show(struct seq_file *m, void *arg) -{ - struct cfent *cfe = m->private; - struct cftype *cft = cfe->type; - struct cgroup_subsys_state *css = cfe->css; - - if (cft->read_map) { - struct cgroup_map_cb cb = { - .fill = cgroup_map_add, - .state = m, - }; - return cft->read_map(css, cft, &cb); - } - return cft->read_seq_string(css, cft, m); + seq_printf(m, "%llu\n", cft->read_u64(css, cft)); + else if (cft->read_s64) + seq_printf(m, "%lld\n", cft->read_s64(css, cft)); + else + return -EINVAL; + return 0; } -static const struct file_operations cgroup_seqfile_operations = { - .read = seq_read, - .write = cgroup_file_write, - .llseek = seq_lseek, - .release = cgroup_file_release, +static struct seq_operations cgroup_seq_operations = { + .start = cgroup_seqfile_start, + .next = cgroup_seqfile_next, + .stop = cgroup_seqfile_stop, + .show = cgroup_seqfile_show, }; static int cgroup_file_open(struct inode *inode, struct file *file) @@ -2449,6 +2383,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file) struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); struct cgroup_subsys_state *css; + struct cgroup_open_file *of; int err; err = generic_file_open(inode, file); @@ -2478,32 +2413,26 @@ static int cgroup_file_open(struct inode *inode, struct file *file) WARN_ON_ONCE(cfe->css && cfe->css != css); cfe->css = css; - if (cft->read_map || cft->read_seq_string) { - file->f_op = &cgroup_seqfile_operations; - err = single_open(file, cgroup_seqfile_show, cfe); - } else if (cft->open) { - err = cft->open(inode, file); + of = __seq_open_private(file, &cgroup_seq_operations, + sizeof(struct cgroup_open_file)); + if (of) { + of->cfe = cfe; + return 0; } - if (css->ss && err) + if (css->ss) css_put(css); - return err; + return -ENOMEM; } static int cgroup_file_release(struct inode *inode, struct file *file) { struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); struct cgroup_subsys_state *css = cfe->css; - int ret = 0; - if (cft->release) - ret = cft->release(inode, file); if (css->ss) css_put(css); - if (file->f_op == &cgroup_seqfile_operations) - single_release(inode, file); - return ret; + return seq_release_private(inode, file); } /* @@ -2614,7 +2543,7 @@ static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size) } static const struct file_operations cgroup_file_operations = { - .read = cgroup_file_read, + .read = seq_read, .write = cgroup_file_write, .llseek = generic_file_llseek, .open = cgroup_file_open, @@ -2639,16 +2568,6 @@ static const struct inode_operations cgroup_dir_inode_operations = { .removexattr = cgroup_removexattr, }; -/* - * Check if a file is a control file - */ -static inline struct cftype *__file_cft(struct file *file) -{ - if (file_inode(file)->i_fop != &cgroup_file_operations) - return ERR_PTR(-EINVAL); - return __d_cft(file->f_dentry); -} - static int cgroup_create_file(struct dentry *dentry, umode_t mode, struct super_block *sb) { @@ -2706,12 +2625,11 @@ static umode_t cgroup_file_mode(const struct cftype *cft) if (cft->mode) return cft->mode; - if (cft->read || cft->read_u64 || cft->read_s64 || - cft->read_map || cft->read_seq_string) + if (cft->read_u64 || cft->read_s64 || cft->seq_show) mode |= S_IRUGO; - if (cft->write || cft->write_u64 || cft->write_s64 || - cft->write_string || cft->trigger) + if (cft->write_u64 || cft->write_s64 || cft->write_string || + cft->trigger) mode |= S_IWUSR; return mode; @@ -3007,9 +2925,9 @@ static void cgroup_enable_task_cg_lists(void) * @parent_css: css whose children to walk * * This function returns the next child of @parent_css and should be called - * under RCU read lock. The only requirement is that @parent_css and - * @pos_css are accessible. The next sibling is guaranteed to be returned - * regardless of their states. + * under either cgroup_mutex or RCU read lock. The only requirement is + * that @parent_css and @pos_css are accessible. The next sibling is + * guaranteed to be returned regardless of their states. */ struct cgroup_subsys_state * css_next_child(struct cgroup_subsys_state *pos_css, @@ -3019,7 +2937,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, struct cgroup *cgrp = parent_css->cgroup; struct cgroup *next; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutex_or_rcu_locked(); /* * @pos could already have been removed. Once a cgroup is removed, @@ -3066,10 +2984,10 @@ EXPORT_SYMBOL_GPL(css_next_child); * to visit for pre-order traversal of @root's descendants. @root is * included in the iteration and the first node to be visited. * - * While this function requires RCU read locking, it doesn't require the - * whole traversal to be contained in a single RCU critical section. This - * function will return the correct next descendant as long as both @pos - * and @root are accessible and @pos is a descendant of @root. + * While this function requires cgroup_mutex or RCU read locking, it + * doesn't require the whole traversal to be contained in a single critical + * section. This function will return the correct next descendant as long + * as both @pos and @root are accessible and @pos is a descendant of @root. */ struct cgroup_subsys_state * css_next_descendant_pre(struct cgroup_subsys_state *pos, @@ -3077,7 +2995,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit @root */ if (!pos) @@ -3108,17 +3026,17 @@ EXPORT_SYMBOL_GPL(css_next_descendant_pre); * is returned. This can be used during pre-order traversal to skip * subtree of @pos. * - * While this function requires RCU read locking, it doesn't require the - * whole traversal to be contained in a single RCU critical section. This - * function will return the correct rightmost descendant as long as @pos is - * accessible. + * While this function requires cgroup_mutex or RCU read locking, it + * doesn't require the whole traversal to be contained in a single critical + * section. This function will return the correct rightmost descendant as + * long as @pos is accessible. */ struct cgroup_subsys_state * css_rightmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last, *tmp; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutex_or_rcu_locked(); do { last = pos; @@ -3154,10 +3072,11 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) * to visit for post-order traversal of @root's descendants. @root is * included in the iteration and the last node to be visited. * - * While this function requires RCU read locking, it doesn't require the - * whole traversal to be contained in a single RCU critical section. This - * function will return the correct next descendant as long as both @pos - * and @cgroup are accessible and @pos is a descendant of @cgroup. + * While this function requires cgroup_mutex or RCU read locking, it + * doesn't require the whole traversal to be contained in a single critical + * section. This function will return the correct next descendant as long + * as both @pos and @cgroup are accessible and @pos is a descendant of + * @cgroup. */ struct cgroup_subsys_state * css_next_descendant_post(struct cgroup_subsys_state *pos, @@ -3165,7 +3084,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutex_or_rcu_locked(); /* if first iteration, visit leftmost descendant which may be @root */ if (!pos) @@ -3504,14 +3423,12 @@ struct cgroup_pidlist { pid_t *list; /* how many elements the above list has */ int length; - /* how many files are using the current array */ - int use_count; /* each of these stored in a list by its cgroup */ struct list_head links; /* pointer to the cgroup we belong to, for list removal purposes */ struct cgroup *owner; - /* protects the other fields */ - struct rw_semaphore rwsem; + /* for delayed destruction */ + struct delayed_work destroy_dwork; }; /* @@ -3527,6 +3444,7 @@ static void *pidlist_allocate(int count) else return kmalloc(count * sizeof(pid_t), GFP_KERNEL); } + static void pidlist_free(void *p) { if (is_vmalloc_addr(p)) @@ -3536,6 +3454,47 @@ static void pidlist_free(void *p) } /* + * Used to destroy all pidlists lingering waiting for destroy timer. None + * should be left afterwards. + */ +static void cgroup_pidlist_destroy_all(struct cgroup *cgrp) +{ + struct cgroup_pidlist *l, *tmp_l; + + mutex_lock(&cgrp->pidlist_mutex); + list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); + mutex_unlock(&cgrp->pidlist_mutex); + + flush_workqueue(cgroup_pidlist_destroy_wq); + BUG_ON(!list_empty(&cgrp->pidlists)); +} + +static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, + destroy_dwork); + struct cgroup_pidlist *tofree = NULL; + + mutex_lock(&l->owner->pidlist_mutex); + + /* + * Destroy iff we didn't get queued again. The state won't change + * as destroy_dwork can only be queued while locked. + */ + if (!delayed_work_pending(dwork)) { + list_del(&l->links); + pidlist_free(l->list); + put_pid_ns(l->key.ns); + tofree = l; + } + + mutex_unlock(&l->owner->pidlist_mutex); + kfree(tofree); +} + +/* * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries * Returns the number of unique elements. */ @@ -3565,52 +3524,92 @@ after: return dest; } +/* + * The two pid files - task and cgroup.procs - guaranteed that the result + * is sorted, which forced this whole pidlist fiasco. As pid order is + * different per namespace, each namespace needs differently sorted list, + * making it impossible to use, for example, single rbtree of member tasks + * sorted by task pointer. As pidlists can be fairly large, allocating one + * per open file is dangerous, so cgroup had to implement shared pool of + * pidlists keyed by cgroup and namespace. + * + * All this extra complexity was caused by the original implementation + * committing to an entirely unnecessary property. In the long term, we + * want to do away with it. Explicitly scramble sort order if + * sane_behavior so that no such expectation exists in the new interface. + * + * Scrambling is done by swapping every two consecutive bits, which is + * non-identity one-to-one mapping which disturbs sort order sufficiently. + */ +static pid_t pid_fry(pid_t pid) +{ + unsigned a = pid & 0x55555555; + unsigned b = pid & 0xAAAAAAAA; + + return (a << 1) | (b >> 1); +} + +static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) +{ + if (cgroup_sane_behavior(cgrp)) + return pid_fry(pid); + else + return pid; +} + static int cmppid(const void *a, const void *b) { return *(pid_t *)a - *(pid_t *)b; } +static int fried_cmppid(const void *a, const void *b) +{ + return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b); +} + +static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, + enum cgroup_filetype type) +{ + struct cgroup_pidlist *l; + /* don't need task_nsproxy() if we're looking at ourself */ + struct pid_namespace *ns = task_active_pid_ns(current); + + lockdep_assert_held(&cgrp->pidlist_mutex); + + list_for_each_entry(l, &cgrp->pidlists, links) + if (l->key.type == type && l->key.ns == ns) + return l; + return NULL; +} + /* * find the appropriate pidlist for our purpose (given procs vs tasks) * returns with the lock on that pidlist already held, and takes care * of the use count, or returns NULL with no locks held if we're out of * memory. */ -static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, - enum cgroup_filetype type) +static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, + enum cgroup_filetype type) { struct cgroup_pidlist *l; - /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = task_active_pid_ns(current); - /* - * We can't drop the pidlist_mutex before taking the l->rwsem in case - * the last ref-holder is trying to remove l from the list at the same - * time. Holding the pidlist_mutex precludes somebody taking whichever - * list we find out from under us - compare release_pid_array(). - */ - mutex_lock(&cgrp->pidlist_mutex); - list_for_each_entry(l, &cgrp->pidlists, links) { - if (l->key.type == type && l->key.ns == ns) { - /* make sure l doesn't vanish out from under us */ - down_write(&l->rwsem); - mutex_unlock(&cgrp->pidlist_mutex); - return l; - } - } + lockdep_assert_held(&cgrp->pidlist_mutex); + + l = cgroup_pidlist_find(cgrp, type); + if (l) + return l; + /* entry not found; create a new one */ l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); - if (!l) { - mutex_unlock(&cgrp->pidlist_mutex); + if (!l) return l; - } - init_rwsem(&l->rwsem); - down_write(&l->rwsem); + + INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); l->key.type = type; - l->key.ns = get_pid_ns(ns); + /* don't need task_nsproxy() if we're looking at ourself */ + l->key.ns = get_pid_ns(task_active_pid_ns(current)); l->owner = cgrp; list_add(&l->links, &cgrp->pidlists); - mutex_unlock(&cgrp->pidlist_mutex); return l; } @@ -3627,6 +3626,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, struct task_struct *tsk; struct cgroup_pidlist *l; + lockdep_assert_held(&cgrp->pidlist_mutex); + /* * If cgroup gets more users after we read count, we won't have * enough space - tough. This race is indistinguishable to the @@ -3653,20 +3654,24 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, css_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ - sort(array, length, sizeof(pid_t), cmppid, NULL); + if (cgroup_sane_behavior(cgrp)) + sort(array, length, sizeof(pid_t), fried_cmppid, NULL); + else + sort(array, length, sizeof(pid_t), cmppid, NULL); if (type == CGROUP_FILE_PROCS) length = pidlist_uniq(array, length); - l = cgroup_pidlist_find(cgrp, type); + + l = cgroup_pidlist_find_create(cgrp, type); if (!l) { + mutex_unlock(&cgrp->pidlist_mutex); pidlist_free(array); return -ENOMEM; } - /* store array, freeing old if necessary - lock already held */ + + /* store array, freeing old if necessary */ pidlist_free(l->list); l->list = array; l->length = length; - l->use_count++; - up_write(&l->rwsem); *lp = l; return 0; } @@ -3740,20 +3745,45 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ - struct cgroup_pidlist *l = s->private; + struct cgroup_open_file *of = s->private; + struct cgroup *cgrp = seq_css(s)->cgroup; + struct cgroup_pidlist *l; + enum cgroup_filetype type = seq_cft(s)->private; int index = 0, pid = *pos; - int *iter; + int *iter, ret; + + mutex_lock(&cgrp->pidlist_mutex); + + /* + * !NULL @of->priv indicates that this isn't the first start() + * after open. If the matching pidlist is around, we can use that. + * Look for it. Note that @of->priv can't be used directly. It + * could already have been destroyed. + */ + if (of->priv) + of->priv = cgroup_pidlist_find(cgrp, type); + + /* + * Either this is the first start() after open or the matching + * pidlist has been destroyed inbetween. Create a new one. + */ + if (!of->priv) { + ret = pidlist_array_load(cgrp, type, + (struct cgroup_pidlist **)&of->priv); + if (ret) + return ERR_PTR(ret); + } + l = of->priv; - down_read(&l->rwsem); if (pid) { int end = l->length; while (index < end) { int mid = (index + end) / 2; - if (l->list[mid] == pid) { + if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) { index = mid; break; - } else if (l->list[mid] <= pid) + } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid) index = mid + 1; else end = mid; @@ -3764,19 +3794,25 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) return NULL; /* Update the abstract position to be the actual pid that we found */ iter = l->list + index; - *pos = *iter; + *pos = cgroup_pid_fry(cgrp, *iter); return iter; } static void cgroup_pidlist_stop(struct seq_file *s, void *v) { - struct cgroup_pidlist *l = s->private; - up_read(&l->rwsem); + struct cgroup_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; + + if (l) + mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, + CGROUP_PIDLIST_DESTROY_DELAY); + mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup_pidlist *l = s->private; + struct cgroup_open_file *of = s->private; + struct cgroup_pidlist *l = of->priv; pid_t *p = v; pid_t *end = l->list + l->length; /* @@ -3787,7 +3823,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) if (p >= end) { return NULL; } else { - *pos = *p; + *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p); return p; } } @@ -3808,92 +3844,6 @@ static const struct seq_operations cgroup_pidlist_seq_operations = { .show = cgroup_pidlist_show, }; -static void cgroup_release_pid_array(struct cgroup_pidlist *l) -{ - /* - * the case where we're the last user of this particular pidlist will - * have us remove it from the cgroup's list, which entails taking the - * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> - * pidlist_mutex, we have to take pidlist_mutex first. - */ - mutex_lock(&l->owner->pidlist_mutex); - down_write(&l->rwsem); - BUG_ON(!l->use_count); - if (!--l->use_count) { - /* we're the last user if refcount is 0; remove and free */ - list_del(&l->links); - mutex_unlock(&l->owner->pidlist_mutex); - pidlist_free(l->list); - put_pid_ns(l->key.ns); - up_write(&l->rwsem); - kfree(l); - return; - } - mutex_unlock(&l->owner->pidlist_mutex); - up_write(&l->rwsem); -} - -static int cgroup_pidlist_release(struct inode *inode, struct file *file) -{ - struct cgroup_pidlist *l; - if (!(file->f_mode & FMODE_READ)) - return 0; - /* - * the seq_file will only be initialized if the file was opened for - * reading; hence we check if it's not null only in that case. - */ - l = ((struct seq_file *)file->private_data)->private; - cgroup_release_pid_array(l); - return seq_release(inode, file); -} - -static const struct file_operations cgroup_pidlist_operations = { - .read = seq_read, - .llseek = seq_lseek, - .write = cgroup_file_write, - .release = cgroup_pidlist_release, -}; - -/* - * The following functions handle opens on a file that displays a pidlist - * (tasks or procs). Prepare an array of the process/thread IDs of whoever's - * in the cgroup. - */ -/* helper function for the two below it */ -static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) -{ - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - struct cgroup_pidlist *l; - int retval; - - /* Nothing to do for write-only files */ - if (!(file->f_mode & FMODE_READ)) - return 0; - - /* have the array populated */ - retval = pidlist_array_load(cgrp, type, &l); - if (retval) - return retval; - /* configure file information */ - file->f_op = &cgroup_pidlist_operations; - - retval = seq_open(file, &cgroup_pidlist_seq_operations); - if (retval) { - cgroup_release_pid_array(l); - return retval; - } - ((struct seq_file *)file->private_data)->private = l; - return 0; -} -static int cgroup_tasks_open(struct inode *unused, struct file *file) -{ - return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); -} -static int cgroup_procs_open(struct inode *unused, struct file *file) -{ - return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); -} - static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -3928,202 +3878,6 @@ static void cgroup_dput(struct cgroup *cgrp) deactivate_super(sb); } -/* - * Unregister event and free resources. - * - * Gets called from workqueue. - */ -static void cgroup_event_remove(struct work_struct *work) -{ - struct cgroup_event *event = container_of(work, struct cgroup_event, - remove); - struct cgroup_subsys_state *css = event->css; - - remove_wait_queue(event->wqh, &event->wait); - - event->cft->unregister_event(css, event->cft, event->eventfd); - - /* Notify userspace the event is going away. */ - eventfd_signal(event->eventfd, 1); - - eventfd_ctx_put(event->eventfd); - kfree(event); - css_put(css); -} - -/* - * Gets called on POLLHUP on eventfd when user closes it. - * - * Called with wqh->lock held and interrupts disabled. - */ -static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, - int sync, void *key) -{ - struct cgroup_event *event = container_of(wait, - struct cgroup_event, wait); - struct cgroup *cgrp = event->css->cgroup; - unsigned long flags = (unsigned long)key; - - if (flags & POLLHUP) { - /* - * If the event has been detached at cgroup removal, we - * can simply return knowing the other side will cleanup - * for us. - * - * We can't race against event freeing since the other - * side will require wqh->lock via remove_wait_queue(), - * which we hold. - */ - spin_lock(&cgrp->event_list_lock); - if (!list_empty(&event->list)) { - list_del_init(&event->list); - /* - * We are in atomic context, but cgroup_event_remove() - * may sleep, so we have to call it in workqueue. - */ - schedule_work(&event->remove); - } - spin_unlock(&cgrp->event_list_lock); - } - - return 0; -} - -static void cgroup_event_ptable_queue_proc(struct file *file, - wait_queue_head_t *wqh, poll_table *pt) -{ - struct cgroup_event *event = container_of(pt, - struct cgroup_event, pt); - - event->wqh = wqh; - add_wait_queue(wqh, &event->wait); -} - -/* - * Parse input and register new cgroup event handler. - * - * Input must be in format '<event_fd> <control_fd> <args>'. - * Interpretation of args is defined by control file implementation. - */ -static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, - struct cftype *cft, const char *buffer) -{ - struct cgroup *cgrp = dummy_css->cgroup; - struct cgroup_event *event; - struct cgroup_subsys_state *cfile_css; - unsigned int efd, cfd; - struct fd efile; - struct fd cfile; - char *endp; - int ret; - - efd = simple_strtoul(buffer, &endp, 10); - if (*endp != ' ') - return -EINVAL; - buffer = endp + 1; - - cfd = simple_strtoul(buffer, &endp, 10); - if ((*endp != ' ') && (*endp != '\0')) - return -EINVAL; - buffer = endp + 1; - - event = kzalloc(sizeof(*event), GFP_KERNEL); - if (!event) - return -ENOMEM; - - INIT_LIST_HEAD(&event->list); - init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); - init_waitqueue_func_entry(&event->wait, cgroup_event_wake); - INIT_WORK(&event->remove, cgroup_event_remove); - - efile = fdget(efd); - if (!efile.file) { - ret = -EBADF; - goto out_kfree; - } - - event->eventfd = eventfd_ctx_fileget(efile.file); - if (IS_ERR(event->eventfd)) { - ret = PTR_ERR(event->eventfd); - goto out_put_efile; - } - - cfile = fdget(cfd); - if (!cfile.file) { - ret = -EBADF; - goto out_put_eventfd; - } - - /* the process need read permission on control file */ - /* AV: shouldn't we check that it's been opened for read instead? */ - ret = inode_permission(file_inode(cfile.file), MAY_READ); - if (ret < 0) - goto out_put_cfile; - - event->cft = __file_cft(cfile.file); - if (IS_ERR(event->cft)) { - ret = PTR_ERR(event->cft); - goto out_put_cfile; - } - - if (!event->cft->ss) { - ret = -EBADF; - goto out_put_cfile; - } - - /* - * Determine the css of @cfile, verify it belongs to the same - * cgroup as cgroup.event_control, and associate @event with it. - * Remaining events are automatically removed on cgroup destruction - * but the removal is asynchronous, so take an extra ref. - */ - rcu_read_lock(); - - ret = -EINVAL; - event->css = cgroup_css(cgrp, event->cft->ss); - cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); - if (event->css && event->css == cfile_css && css_tryget(event->css)) - ret = 0; - - rcu_read_unlock(); - if (ret) - goto out_put_cfile; - - if (!event->cft->register_event || !event->cft->unregister_event) { - ret = -EINVAL; - goto out_put_css; - } - - ret = event->cft->register_event(event->css, event->cft, - event->eventfd, buffer); - if (ret) - goto out_put_css; - - efile.file->f_op->poll(efile.file, &event->pt); - - spin_lock(&cgrp->event_list_lock); - list_add(&event->list, &cgrp->event_list); - spin_unlock(&cgrp->event_list_lock); - - fdput(cfile); - fdput(efile); - - return 0; - -out_put_css: - css_put(event->css); -out_put_cfile: - fdput(cfile); -out_put_eventfd: - eventfd_ctx_put(event->eventfd); -out_put_efile: - fdput(efile); -out_kfree: - kfree(event); - - return ret; -} - static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -4143,17 +3897,15 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css, static struct cftype cgroup_base_files[] = { { .name = "cgroup.procs", - .open = cgroup_procs_open, + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_PROCS, .write_u64 = cgroup_procs_write, - .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, { - .name = "cgroup.event_control", - .write_string = cgroup_write_event_control, - .mode = S_IWUGO, - }, - { .name = "cgroup.clone_children", .flags = CFTYPE_INSANE, .read_u64 = cgroup_clone_children_read, @@ -4162,7 +3914,7 @@ static struct cftype cgroup_base_files[] = { { .name = "cgroup.sane_behavior", .flags = CFTYPE_ONLY_ON_ROOT, - .read_seq_string = cgroup_sane_behavior_show, + .seq_show = cgroup_sane_behavior_show, }, /* @@ -4173,9 +3925,12 @@ static struct cftype cgroup_base_files[] = { { .name = "tasks", .flags = CFTYPE_INSANE, /* use "procs" instead */ - .open = cgroup_tasks_open, + .seq_start = cgroup_pidlist_start, + .seq_next = cgroup_pidlist_next, + .seq_stop = cgroup_pidlist_stop, + .seq_show = cgroup_pidlist_show, + .private = CGROUP_FILE_TASKS, .write_u64 = cgroup_tasks_write, - .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, { @@ -4187,7 +3942,7 @@ static struct cftype cgroup_base_files[] = { { .name = "release_agent", .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, - .read_seq_string = cgroup_release_agent_show, + .seq_show = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, .max_write_len = PATH_MAX, }, @@ -4333,6 +4088,62 @@ static void offline_css(struct cgroup_subsys_state *css) RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); } +/** + * create_css - create a cgroup_subsys_state + * @cgrp: the cgroup new css will be associated with + * @ss: the subsys of new css + * + * Create a new css associated with @cgrp - @ss pair. On success, the new + * css is online and installed in @cgrp with all interface files created. + * Returns 0 on success, -errno on failure. + */ +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) +{ + struct cgroup *parent = cgrp->parent; + struct cgroup_subsys_state *css; + int err; + + lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); + lockdep_assert_held(&cgroup_mutex); + + css = ss->css_alloc(cgroup_css(parent, ss)); + if (IS_ERR(css)) + return PTR_ERR(css); + + err = percpu_ref_init(&css->refcnt, css_release); + if (err) + goto err_free; + + init_css(css, ss, cgrp); + + err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); + if (err) + goto err_free; + + err = online_css(css); + if (err) + goto err_free; + + dget(cgrp->dentry); + css_get(css->parent); + + if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && + parent->parent) { + pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", + current->comm, current->pid, ss->name); + if (!strcmp(ss->name, "memory")) + pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); + ss->warned_broken_hierarchy = true; + } + + return 0; + +err_free: + percpu_ref_cancel_init(&css->refcnt); + ss->css_free(css); + return err; +} + /* * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup @@ -4344,11 +4155,10 @@ static void offline_css(struct cgroup_subsys_state *css) static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { - struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { }; struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; - int err = 0; + int ssid, err = 0; struct cgroup_subsys *ss; struct super_block *sb = root->sb; @@ -4404,23 +4214,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css; - - css = ss->css_alloc(cgroup_css(parent, ss)); - if (IS_ERR(css)) { - err = PTR_ERR(css); - goto err_free_all; - } - css_ar[ss->subsys_id] = css; - - err = percpu_ref_init(&css->refcnt, css_release); - if (err) - goto err_free_all; - - init_css(css, ss, cgrp); - } - /* * Create directory. cgroup_create_file() returns with the new * directory locked on success so that it can be populated without @@ -4428,7 +4221,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ err = cgroup_create_file(dentry, S_IFDIR | mode, sb); if (err < 0) - goto err_free_all; + goto err_unlock; lockdep_assert_held(&dentry->d_inode->i_mutex); cgrp->serial_nr = cgroup_serial_nr_next++; @@ -4440,55 +4233,31 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* hold a ref to the parent's dentry */ dget(parent->dentry); - /* creation succeeded, notify subsystems */ - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; - - err = online_css(css); - if (err) - goto err_destroy; - - /* each css holds a ref to the cgroup's dentry and parent css */ - dget(dentry); - css_get(css->parent); - - /* mark it consumed for error path */ - css_ar[ss->subsys_id] = NULL; - - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && - parent->parent) { - pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", - current->comm, current->pid, ss->name); - if (!strcmp(ss->name, "memory")) - pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); - ss->warned_broken_hierarchy = true; - } - } - + /* + * @cgrp is now fully operational. If something fails after this + * point, it'll be released via the normal destruction path. + */ idr_replace(&root->cgroup_idr, cgrp, cgrp->id); err = cgroup_addrm_files(cgrp, cgroup_base_files, true); if (err) goto err_destroy; - err = cgroup_populate_dir(cgrp, root->subsys_mask); - if (err) - goto err_destroy; + /* let's create and online css's */ + for_each_subsys(ss, ssid) { + if (root->subsys_mask & (1 << ssid)) { + err = create_css(cgrp, ss); + if (err) + goto err_destroy; + } + } mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return 0; -err_free_all: - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; - - if (css) { - percpu_ref_cancel_init(&css->refcnt); - ss->css_free(css); - } - } +err_unlock: mutex_unlock(&cgroup_mutex); /* Release the reference count that we took on the superblock */ deactivate_super(sb); @@ -4501,14 +4270,6 @@ err_free_cgrp: return err; err_destroy: - for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; - - if (css) { - percpu_ref_cancel_init(&css->refcnt); - ss->css_free(css); - } - } cgroup_destroy_locked(cgrp); mutex_unlock(&cgroup_mutex); mutex_unlock(&dentry->d_inode->i_mutex); @@ -4631,10 +4392,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct dentry *d = cgrp->dentry; - struct cgroup_event *event, *tmp; - struct cgroup_subsys *ss; + struct cgroup_subsys_state *css; struct cgroup *child; bool empty; + int ssid; lockdep_assert_held(&d->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); @@ -4670,12 +4431,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * will be invoked to perform the rest of destruction once the * percpu refs of all css's are confirmed to be killed. */ - for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); - - if (css) - kill_css(css); - } + for_each_css(css, ssid, cgrp) + kill_css(css); /* * Mark @cgrp dead. This prevents further task migration and child @@ -4710,18 +4467,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) dget(d); cgroup_d_remove_dir(d); - /* - * Unregister events and notify userspace. - * Notify userspace about cgroup removing only after rmdir of cgroup - * directory to avoid race between userspace and kernelspace. - */ - spin_lock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { - list_del_init(&event->list); - schedule_work(&event->remove); - } - spin_unlock(&cgrp->event_list_lock); - return 0; }; @@ -4792,7 +4537,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) cgroup_init_cftsets(ss); /* Create the top cgroup state for this subsystem */ - list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); /* We don't handle early failures gracefully */ @@ -4866,6 +4610,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) cgroup_init_cftsets(ss); mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); cgroup_subsys[ss->subsys_id] = ss; /* @@ -4877,11 +4622,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) if (IS_ERR(css)) { /* failure case - need to deassign the cgroup_subsys[] slot. */ cgroup_subsys[ss->subsys_id] = NULL; + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); return PTR_ERR(css); } - list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; /* our new subsystem will be attached to the dummy hierarchy. */ @@ -4911,14 +4656,18 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) write_unlock(&css_set_lock); ret = online_css(css); - if (ret) + if (ret) { + ss->css_free(css); goto err_unload; + } /* success! */ + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); return 0; err_unload: + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); /* @ss can't be mounted here as try_module_get() would fail */ cgroup_unload_subsys(ss); @@ -4937,6 +4686,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys); void cgroup_unload_subsys(struct cgroup_subsys *ss) { struct cgrp_cset_link *link; + struct cgroup_subsys_state *css; BUG_ON(ss->module == NULL); @@ -4948,15 +4698,15 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) BUG_ON(ss->root != &cgroup_dummy_root); mutex_lock(&cgroup_mutex); + mutex_lock(&cgroup_root_mutex); - offline_css(cgroup_css(cgroup_dummy_top, ss)); + css = cgroup_css(cgroup_dummy_top, ss); + if (css) + offline_css(css); /* deassign the subsys_id */ cgroup_subsys[ss->subsys_id] = NULL; - /* remove subsystem from the dummy root's list of subsystems */ - list_del_init(&ss->sibling); - /* * disentangle the css from all css_sets attached to the dummy * top. as in loading, we need to pay our respects to the hashtable @@ -4979,9 +4729,11 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * need to free before marking as null because ss->css_free needs * the cgrp->subsys pointer to find their state. */ - ss->css_free(cgroup_css(cgroup_dummy_top, ss)); + if (css) + ss->css_free(css); RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); } EXPORT_SYMBOL_GPL(cgroup_unload_subsys); @@ -5100,6 +4852,15 @@ static int __init cgroup_wq_init(void) */ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); BUG_ON(!cgroup_destroy_wq); + + /* + * Used to destroy pidlists and separate to serve as flush domain. + * Cap @max_active to 1 too. + */ + cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", + 0, 1); + BUG_ON(!cgroup_pidlist_destroy_wq); + return 0; } core_initcall(cgroup_wq_init); @@ -5143,11 +4904,12 @@ int proc_cgroup_show(struct seq_file *m, void *v) for_each_active_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; - int count = 0; + int ssid, count = 0; seq_printf(m, "%d:", root->hierarchy_id); - for_each_root_subsys(root, ss) - seq_printf(m, "%s%s", count++ ? "," : "", ss->name); + for_each_subsys(ss, ssid) + if (root->subsys_mask & (1 << ssid)) + seq_printf(m, "%s%s", count++ ? "," : "", ss->name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", root->name); @@ -5488,16 +5250,16 @@ __setup("cgroup_disable=", cgroup_disable); * @dentry: directory dentry of interest * @ss: subsystem of interest * - * Must be called under RCU read lock. The caller is responsible for - * pinning the returned css if it needs to be accessed outside the RCU - * critical section. + * Must be called under cgroup_mutex or RCU read lock. The caller is + * responsible for pinning the returned css if it needs to be accessed + * outside the critical section. */ struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, struct cgroup_subsys *ss) { struct cgroup *cgrp; - WARN_ON_ONCE(!rcu_read_lock_held()); + cgroup_assert_mutex_or_rcu_locked(); /* is @dentry a cgroup dir? */ if (!dentry->d_inode || @@ -5520,9 +5282,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { struct cgroup *cgrp; - rcu_lockdep_assert(rcu_read_lock_held() || - lockdep_is_held(&cgroup_mutex), - "css_from_id() needs proper protection"); + cgroup_assert_mutex_or_rcu_locked(); cgrp = idr_find(&ss->root->cgroup_idr, id); if (cgrp) @@ -5570,9 +5330,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, return count; } -static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, - struct cftype *cft, - struct seq_file *seq) +static int current_css_set_cg_links_read(struct seq_file *seq, void *v) { struct cgrp_cset_link *link; struct css_set *cset; @@ -5597,9 +5355,9 @@ static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, } #define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *seq) +static int cgroup_css_links_read(struct seq_file *seq, void *v) { + struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; read_lock(&css_set_lock); @@ -5645,12 +5403,12 @@ static struct cftype debug_files[] = { { .name = "current_css_set_cg_links", - .read_seq_string = current_css_set_cg_links_read, + .seq_show = current_css_set_cg_links_read, }, { .name = "cgroup_css_links", - .read_seq_string = cgroup_css_links_read, + .seq_show = cgroup_css_links_read, }, { diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index f0ff64d0ebaa..6c3154e477f6 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -301,10 +301,9 @@ out_unlock: spin_unlock_irq(&freezer->lock); } -static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, - struct seq_file *m) +static int freezer_read(struct seq_file *m, void *v) { - struct cgroup_subsys_state *pos; + struct cgroup_subsys_state *css = seq_css(m), *pos; rcu_read_lock(); @@ -458,7 +457,7 @@ static struct cftype files[] = { { .name = "state", .flags = CFTYPE_NOT_ON_ROOT, - .read_seq_string = freezer_read, + .seq_show = freezer_read, .write_string = freezer_write, }, { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4772034b4b17..4410ac6a55f1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1731,66 +1731,41 @@ out_unlock: * used, list of ranges of sequential numbers, is variable length, * and since these maps can change value dynamically, one could read * gibberish by doing partial reads while a list was changing. - * A single large read to a buffer that crosses a page boundary is - * ok, because the result being copied to user land is not recomputed - * across a page fault. */ - -static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs) +static int cpuset_common_seq_show(struct seq_file *sf, void *v) { - size_t count; - - mutex_lock(&callback_mutex); - count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); - mutex_unlock(&callback_mutex); + struct cpuset *cs = css_cs(seq_css(sf)); + cpuset_filetype_t type = seq_cft(sf)->private; + ssize_t count; + char *buf, *s; + int ret = 0; - return count; -} - -static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) -{ - size_t count; + count = seq_get_buf(sf, &buf); + s = buf; mutex_lock(&callback_mutex); - count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed); - mutex_unlock(&callback_mutex); - - return count; -} - -static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) -{ - struct cpuset *cs = css_cs(css); - cpuset_filetype_t type = cft->private; - char *page; - ssize_t retval = 0; - char *s; - - if (!(page = (char *)__get_free_page(GFP_TEMPORARY))) - return -ENOMEM; - - s = page; switch (type) { case FILE_CPULIST: - s += cpuset_sprintf_cpulist(s, cs); + s += cpulist_scnprintf(s, count, cs->cpus_allowed); break; case FILE_MEMLIST: - s += cpuset_sprintf_memlist(s, cs); + s += nodelist_scnprintf(s, count, cs->mems_allowed); break; default: - retval = -EINVAL; - goto out; + ret = -EINVAL; + goto out_unlock; } - *s++ = '\n'; - retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); -out: - free_page((unsigned long)page); - return retval; + if (s < buf + count - 1) { + *s++ = '\n'; + seq_commit(sf, s - buf); + } else { + seq_commit(sf, -1); + } +out_unlock: + mutex_unlock(&callback_mutex); + return ret; } static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) @@ -1847,7 +1822,7 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) static struct cftype files[] = { { .name = "cpus", - .read = cpuset_common_file_read, + .seq_show = cpuset_common_seq_show, .write_string = cpuset_write_resmask, .max_write_len = (100U + 6 * NR_CPUS), .private = FILE_CPULIST, @@ -1855,7 +1830,7 @@ static struct cftype files[] = { { .name = "mems", - .read = cpuset_common_file_read, + .seq_show = cpuset_common_seq_show, .write_string = cpuset_write_resmask, .max_write_len = (100U + 6 * MAX_NUMNODES), .private = FILE_MEMLIST, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5ae36cc11fe5..4d6964e49711 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7854,15 +7854,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) return ret; } -static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, - struct cgroup_map_cb *cb) +static int cpu_stats_show(struct seq_file *sf, void *v) { - struct task_group *tg = css_tg(css); + struct task_group *tg = css_tg(seq_css(sf)); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; - cb->fill(cb, "nr_periods", cfs_b->nr_periods); - cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); - cb->fill(cb, "throttled_time", cfs_b->throttled_time); + seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); + seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); + seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); return 0; } @@ -7916,7 +7915,7 @@ static struct cftype cpu_files[] = { }, { .name = "stat", - .read_map = cpu_stats_show, + .seq_show = cpu_stats_show, }, #endif #ifdef CONFIG_RT_GROUP_SCHED diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index f64722ff0299..622e0818f905 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -163,10 +163,9 @@ out: return err; } -static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *m) +static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) { - struct cpuacct *ca = css_ca(css); + struct cpuacct *ca = css_ca(seq_css(m)); u64 percpu; int i; @@ -183,10 +182,9 @@ static const char * const cpuacct_stat_desc[] = { [CPUACCT_STAT_SYSTEM] = "system", }; -static int cpuacct_stats_show(struct cgroup_subsys_state *css, - struct cftype *cft, struct cgroup_map_cb *cb) +static int cpuacct_stats_show(struct seq_file *sf, void *v) { - struct cpuacct *ca = css_ca(css); + struct cpuacct *ca = css_ca(seq_css(sf)); int cpu; s64 val = 0; @@ -196,7 +194,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css, val += kcpustat->cpustat[CPUTIME_NICE]; } val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); + seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); val = 0; for_each_online_cpu(cpu) { @@ -207,7 +205,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css, } val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); return 0; } @@ -220,11 +218,11 @@ static struct cftype files[] = { }, { .name = "usage_percpu", - .read_seq_string = cpuacct_percpu_seq_read, + .seq_show = cpuacct_percpu_seq_show, }, { .name = "stat", - .read_map = cpuacct_stats_show, + .seq_show = cpuacct_stats_show, }, { } /* terminate */ }; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b010eac595d2..82ef9f3b7473 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4789,6 +4789,7 @@ static int workqueue_cpu_down_callback(struct notifier_block *nfb, /* wait for per-cpu unbinding to finish */ flush_work(&unbind_work); + destroy_work_on_stack(&unbind_work); break; } return NOTIFY_OK; @@ -4828,6 +4829,7 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); schedule_work_on(cpu, &wfc.work); flush_work(&wfc.work); + destroy_work_on_stack(&wfc.work); return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 1a53d497a8c5..963b7034a51b 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -120,6 +120,9 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count); + WARN_ONCE(atomic_read(&ref->count) <= 0, "percpu ref <= 0 (%i)", + atomic_read(&ref->count)); + /* @ref is viewed as dead on all CPUs, send out kill confirmation */ if (ref->confirm_kill) ref->confirm_kill(ref); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index bda8e44f6fde..d747a84e09b0 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -242,22 +242,16 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, return; } -static ssize_t hugetlb_cgroup_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) { - u64 val; - char str[64]; - int idx, name, len; + int idx, name; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); idx = MEMFILE_IDX(cft->private); name = MEMFILE_ATTR(cft->private); - val = res_counter_read_u64(&h_cg->hugepage[idx], name); - len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); - return simple_read_from_buffer(buf, nbytes, ppos, str, len); + return res_counter_read_u64(&h_cg->hugepage[idx], name); } static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, @@ -337,28 +331,28 @@ static void __init __hugetlb_cgroup_file_init(int idx) cft = &h->cgroup_files[0]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); - cft->read = hugetlb_cgroup_read; + cft->read_u64 = hugetlb_cgroup_read_u64; cft->write_string = hugetlb_cgroup_write; /* Add the usage file */ cft = &h->cgroup_files[1]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); - cft->read = hugetlb_cgroup_read; + cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the MAX usage file */ cft = &h->cgroup_files[2]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); cft->trigger = hugetlb_cgroup_reset; - cft->read = hugetlb_cgroup_read; + cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the failcntfile */ cft = &h->cgroup_files[3]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); cft->trigger = hugetlb_cgroup_reset; - cft->read = hugetlb_cgroup_read; + cft->read_u64 = hugetlb_cgroup_read_u64; /* NULL terminate the last cft */ cft = &h->cgroup_files[4]; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 57b16083f046..67dd2a881433 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -45,6 +45,7 @@ #include <linux/swapops.h> #include <linux/spinlock.h> #include <linux/eventfd.h> +#include <linux/poll.h> #include <linux/sort.h> #include <linux/fs.h> #include <linux/seq_file.h> @@ -55,6 +56,7 @@ #include <linux/cpu.h> #include <linux/oom.h> #include <linux/lockdep.h> +#include <linux/file.h> #include "internal.h" #include <net/sock.h> #include <net/ip.h> @@ -227,6 +229,46 @@ struct mem_cgroup_eventfd_list { struct eventfd_ctx *eventfd; }; +/* + * cgroup_event represents events which userspace want to receive. + */ +struct mem_cgroup_event { + /* + * memcg which the event belongs to. + */ + struct mem_cgroup *memcg; + /* + * eventfd to signal userspace about the event. + */ + struct eventfd_ctx *eventfd; + /* + * Each of these stored in a list by the cgroup. + */ + struct list_head list; + /* + * register_event() callback will be used to add new userspace + * waiter for changes related to this event. Use eventfd_signal() + * on eventfd to send notification to userspace. + */ + int (*register_event)(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args); + /* + * unregister_event() callback will be called when userspace closes + * the eventfd or on cgroup removing. This callback must be set, + * if you want provide notification functionality. + */ + void (*unregister_event)(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd); + /* + * All fields below needed to unregister event when + * userspace closes eventfd. + */ + poll_table pt; + wait_queue_head_t *wqh; + wait_queue_t wait; + struct work_struct remove; +}; + static void mem_cgroup_threshold(struct mem_cgroup *memcg); static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); @@ -331,6 +373,10 @@ struct mem_cgroup { atomic_t numainfo_updating; #endif + /* List of events which userspace want to receive */ + struct list_head event_list; + spinlock_t event_list_lock; + struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ }; @@ -490,11 +536,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; } -struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) -{ - return &mem_cgroup_from_css(css)->vmpressure; -} - static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); @@ -2979,10 +3020,9 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) } #ifdef CONFIG_SLABINFO -static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *m) +static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); struct memcg_cache_params *params; if (!memcg_can_account_kmem(memcg)) @@ -5115,14 +5155,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) return val << PAGE_SHIFT; } -static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct file *file, - char __user *buf, size_t nbytes, loff_t *ppos) +static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - char str[64]; u64 val; - int name, len; + int name; enum res_type type; type = MEMFILE_TYPE(cft->private); @@ -5148,8 +5186,7 @@ static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, BUG(); } - len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); - return simple_read_from_buffer(buf, nbytes, ppos, str, len); + return val; } static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) @@ -5386,8 +5423,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, #endif #ifdef CONFIG_NUMA -static int memcg_numa_stat_show(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *m) +static int memcg_numa_stat_show(struct seq_file *m, void *v) { struct numa_stat { const char *name; @@ -5403,7 +5439,7 @@ static int memcg_numa_stat_show(struct cgroup_subsys_state *css, const struct numa_stat *stat; int nid; unsigned long nr; - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); @@ -5442,10 +5478,9 @@ static inline void mem_cgroup_lru_names_not_uptodate(void) BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); } -static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, - struct seq_file *m) +static int memcg_stat_show(struct seq_file *m, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); struct mem_cgroup *mi; unsigned int i; @@ -5654,13 +5689,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) mem_cgroup_oom_notify_cb(iter); } -static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) +static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args, enum res_type type) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - enum res_type type = MEMFILE_TYPE(cft->private); u64 threshold, usage; int i, size, ret; @@ -5737,13 +5770,23 @@ unlock: return ret; } -static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd) +static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) +{ + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); +} + +static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) +{ + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); +} + +static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, enum res_type type) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; - enum res_type type = MEMFILE_TYPE(cft->private); u64 usage; int i, j, size; @@ -5816,14 +5859,23 @@ unlock: mutex_unlock(&memcg->thresholds_lock); } -static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) +static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) +{ + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); +} + +static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) +{ + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); +} + +static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *event; - enum res_type type = MEMFILE_TYPE(cft->private); - BUG_ON(type != _OOM_TYPE); event = kmalloc(sizeof(*event), GFP_KERNEL); if (!event) return -ENOMEM; @@ -5841,14 +5893,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, return 0; } -static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd) +static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *ev, *tmp; - enum res_type type = MEMFILE_TYPE(cft->private); - - BUG_ON(type != _OOM_TYPE); spin_lock(&memcg_oom_lock); @@ -5862,17 +5910,12 @@ static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, spin_unlock(&memcg_oom_lock); } -static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct cgroup_map_cb *cb) +static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - - cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); - if (atomic_read(&memcg->under_oom)) - cb->fill(cb, "under_oom", 1); - else - cb->fill(cb, "under_oom", 0); + seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); + seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); return 0; } @@ -5965,41 +6008,261 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) } #endif +/* + * DO NOT USE IN NEW FILES. + * + * "cgroup.event_control" implementation. + * + * This is way over-engineered. It tries to support fully configurable + * events for each user. Such level of flexibility is completely + * unnecessary especially in the light of the planned unified hierarchy. + * + * Please deprecate this and replace with something simpler if at all + * possible. + */ + +/* + * Unregister event and free resources. + * + * Gets called from workqueue. + */ +static void memcg_event_remove(struct work_struct *work) +{ + struct mem_cgroup_event *event = + container_of(work, struct mem_cgroup_event, remove); + struct mem_cgroup *memcg = event->memcg; + + remove_wait_queue(event->wqh, &event->wait); + + event->unregister_event(memcg, event->eventfd); + + /* Notify userspace the event is going away. */ + eventfd_signal(event->eventfd, 1); + + eventfd_ctx_put(event->eventfd); + kfree(event); + css_put(&memcg->css); +} + +/* + * Gets called on POLLHUP on eventfd when user closes it. + * + * Called with wqh->lock held and interrupts disabled. + */ +static int memcg_event_wake(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + struct mem_cgroup_event *event = + container_of(wait, struct mem_cgroup_event, wait); + struct mem_cgroup *memcg = event->memcg; + unsigned long flags = (unsigned long)key; + + if (flags & POLLHUP) { + /* + * If the event has been detached at cgroup removal, we + * can simply return knowing the other side will cleanup + * for us. + * + * We can't race against event freeing since the other + * side will require wqh->lock via remove_wait_queue(), + * which we hold. + */ + spin_lock(&memcg->event_list_lock); + if (!list_empty(&event->list)) { + list_del_init(&event->list); + /* + * We are in atomic context, but cgroup_event_remove() + * may sleep, so we have to call it in workqueue. + */ + schedule_work(&event->remove); + } + spin_unlock(&memcg->event_list_lock); + } + + return 0; +} + +static void memcg_event_ptable_queue_proc(struct file *file, + wait_queue_head_t *wqh, poll_table *pt) +{ + struct mem_cgroup_event *event = + container_of(pt, struct mem_cgroup_event, pt); + + event->wqh = wqh; + add_wait_queue(wqh, &event->wait); +} + +/* + * DO NOT USE IN NEW FILES. + * + * Parse input and register new cgroup event handler. + * + * Input must be in format '<event_fd> <control_fd> <args>'. + * Interpretation of args is defined by control file implementation. + */ +static int memcg_write_event_control(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup_event *event; + struct cgroup_subsys_state *cfile_css; + unsigned int efd, cfd; + struct fd efile; + struct fd cfile; + const char *name; + char *endp; + int ret; + + efd = simple_strtoul(buffer, &endp, 10); + if (*endp != ' ') + return -EINVAL; + buffer = endp + 1; + + cfd = simple_strtoul(buffer, &endp, 10); + if ((*endp != ' ') && (*endp != '\0')) + return -EINVAL; + buffer = endp + 1; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + event->memcg = memcg; + INIT_LIST_HEAD(&event->list); + init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); + init_waitqueue_func_entry(&event->wait, memcg_event_wake); + INIT_WORK(&event->remove, memcg_event_remove); + + efile = fdget(efd); + if (!efile.file) { + ret = -EBADF; + goto out_kfree; + } + + event->eventfd = eventfd_ctx_fileget(efile.file); + if (IS_ERR(event->eventfd)) { + ret = PTR_ERR(event->eventfd); + goto out_put_efile; + } + + cfile = fdget(cfd); + if (!cfile.file) { + ret = -EBADF; + goto out_put_eventfd; + } + + /* the process need read permission on control file */ + /* AV: shouldn't we check that it's been opened for read instead? */ + ret = inode_permission(file_inode(cfile.file), MAY_READ); + if (ret < 0) + goto out_put_cfile; + + /* + * Determine the event callbacks and set them in @event. This used + * to be done via struct cftype but cgroup core no longer knows + * about these events. The following is crude but the whole thing + * is for compatibility anyway. + * + * DO NOT ADD NEW FILES. + */ + name = cfile.file->f_dentry->d_name.name; + + if (!strcmp(name, "memory.usage_in_bytes")) { + event->register_event = mem_cgroup_usage_register_event; + event->unregister_event = mem_cgroup_usage_unregister_event; + } else if (!strcmp(name, "memory.oom_control")) { + event->register_event = mem_cgroup_oom_register_event; + event->unregister_event = mem_cgroup_oom_unregister_event; + } else if (!strcmp(name, "memory.pressure_level")) { + event->register_event = vmpressure_register_event; + event->unregister_event = vmpressure_unregister_event; + } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { + event->register_event = memsw_cgroup_usage_register_event; + event->unregister_event = memsw_cgroup_usage_unregister_event; + } else { + ret = -EINVAL; + goto out_put_cfile; + } + + /* + * Verify @cfile should belong to @css. Also, remaining events are + * automatically removed on cgroup destruction but the removal is + * asynchronous, so take an extra ref on @css. + */ + rcu_read_lock(); + + ret = -EINVAL; + cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, + &mem_cgroup_subsys); + if (cfile_css == css && css_tryget(css)) + ret = 0; + + rcu_read_unlock(); + if (ret) + goto out_put_cfile; + + ret = event->register_event(memcg, event->eventfd, buffer); + if (ret) + goto out_put_css; + + efile.file->f_op->poll(efile.file, &event->pt); + + spin_lock(&memcg->event_list_lock); + list_add(&event->list, &memcg->event_list); + spin_unlock(&memcg->event_list_lock); + + fdput(cfile); + fdput(efile); + + return 0; + +out_put_css: + css_put(css); +out_put_cfile: + fdput(cfile); +out_put_eventfd: + eventfd_ctx_put(event->eventfd); +out_put_efile: + fdput(efile); +out_kfree: + kfree(event); + + return ret; +} + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), - .read = mem_cgroup_read, - .register_event = mem_cgroup_usage_register_event, - .unregister_event = mem_cgroup_usage_unregister_event, + .read_u64 = mem_cgroup_read_u64, }, { .name = "max_usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), .write_string = mem_cgroup_write, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "soft_limit_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), .write_string = mem_cgroup_write, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "failcnt", .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "stat", - .read_seq_string = memcg_stat_show, + .seq_show = memcg_stat_show, }, { .name = "force_empty", @@ -6012,6 +6275,12 @@ static struct cftype mem_cgroup_files[] = { .read_u64 = mem_cgroup_hierarchy_read, }, { + .name = "cgroup.event_control", /* XXX: for compat */ + .write_string = memcg_write_event_control, + .flags = CFTYPE_NO_PREFIX, + .mode = S_IWUGO, + }, + { .name = "swappiness", .read_u64 = mem_cgroup_swappiness_read, .write_u64 = mem_cgroup_swappiness_write, @@ -6023,21 +6292,17 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "oom_control", - .read_map = mem_cgroup_oom_control_read, + .seq_show = mem_cgroup_oom_control_read, .write_u64 = mem_cgroup_oom_control_write, - .register_event = mem_cgroup_oom_register_event, - .unregister_event = mem_cgroup_oom_unregister_event, .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), }, { .name = "pressure_level", - .register_event = vmpressure_register_event, - .unregister_event = vmpressure_unregister_event, }, #ifdef CONFIG_NUMA { .name = "numa_stat", - .read_seq_string = memcg_numa_stat_show, + .seq_show = memcg_numa_stat_show, }, #endif #ifdef CONFIG_MEMCG_KMEM @@ -6045,29 +6310,29 @@ static struct cftype mem_cgroup_files[] = { .name = "kmem.limit_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), .write_string = mem_cgroup_write, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "kmem.usage_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "kmem.failcnt", .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "kmem.max_usage_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, #ifdef CONFIG_SLABINFO { .name = "kmem.slabinfo", - .read_seq_string = mem_cgroup_slabinfo_read, + .seq_show = mem_cgroup_slabinfo_read, }, #endif #endif @@ -6079,27 +6344,25 @@ static struct cftype memsw_cgroup_files[] = { { .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), - .read = mem_cgroup_read, - .register_event = mem_cgroup_usage_register_event, - .unregister_event = mem_cgroup_usage_unregister_event, + .read_u64 = mem_cgroup_read_u64, }, { .name = "memsw.max_usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "memsw.limit_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), .write_string = mem_cgroup_write, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { .name = "memsw.failcnt", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), .trigger = mem_cgroup_reset, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read_u64, }, { }, /* terminate */ }; @@ -6271,6 +6534,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); vmpressure_init(&memcg->vmpressure); + INIT_LIST_HEAD(&memcg->event_list); + spin_lock_init(&memcg->event_list_lock); return &memcg->css; @@ -6346,6 +6611,19 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup_event *event, *tmp; + + /* + * Unregister events and notify userspace. + * Notify userspace about cgroup removing only after rmdir of cgroup + * directory to avoid race between userspace and kernelspace. + */ + spin_lock(&memcg->event_list_lock); + list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { + list_del_init(&event->list); + schedule_work(&event->remove); + } + spin_unlock(&memcg->event_list_lock); kmem_cgroup_css_offline(memcg); diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index d8bd2c500aa4..cfd162882c00 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -452,7 +452,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry * @ent: swap entry to be looked up. * - * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) + * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) */ unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { diff --git a/mm/percpu.c b/mm/percpu.c index 65fd8a749712..036cfe07050f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1689,10 +1689,10 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, max_distance += ai->unit_size; /* warn if maximum distance is further than 75% of vmalloc space */ - if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { + if (max_distance > VMALLOC_TOTAL * 3 / 4) { pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " "space 0x%lx\n", max_distance, - (unsigned long)(VMALLOC_END - VMALLOC_START)); + VMALLOC_TOTAL); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /* and fail if we have fallback */ rc = -EINVAL; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index e0f62837c3f4..196970a4541f 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -278,8 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) /** * vmpressure_register_event() - Bind vmpressure notifications to an eventfd - * @css: css that is interested in vmpressure notifications - * @cft: cgroup control files handle + * @memcg: memcg that is interested in vmpressure notifications * @eventfd: eventfd context to link notifications with * @args: event arguments (used to set up a pressure level threshold) * @@ -289,15 +288,12 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or * "critical"). * - * This function should not be used directly, just pass it to (struct - * cftype).register_event, and then cgroup core will handle everything by - * itself. + * To be used as memcg event method. */ -int vmpressure_register_event(struct cgroup_subsys_state *css, - struct cftype *cft, struct eventfd_ctx *eventfd, - const char *args) +int vmpressure_register_event(struct mem_cgroup *memcg, + struct eventfd_ctx *eventfd, const char *args) { - struct vmpressure *vmpr = css_to_vmpressure(css); + struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure_event *ev; int level; @@ -325,23 +321,19 @@ int vmpressure_register_event(struct cgroup_subsys_state *css, /** * vmpressure_unregister_event() - Unbind eventfd from vmpressure - * @css: css handle - * @cft: cgroup control files handle + * @memcg: memcg handle * @eventfd: eventfd context that was used to link vmpressure with the @cg * * This function does internal manipulations to detach the @eventfd from * the vmpressure notifications, and then frees internal resources * associated with the @eventfd (but the @eventfd itself is not freed). * - * This function should not be used directly, just pass it to (struct - * cftype).unregister_event, and then cgroup core will handle everything - * by itself. + * To be used as memcg event method. */ -void vmpressure_unregister_event(struct cgroup_subsys_state *css, - struct cftype *cft, +void vmpressure_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd) { - struct vmpressure *vmpr = css_to_vmpressure(css); + struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure_event *ev; mutex_lock(&vmpr->events_lock); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 9b7cf6c85f82..56cbb69ba024 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -173,14 +173,14 @@ static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) return css->cgroup->id; } -static int read_priomap(struct cgroup_subsys_state *css, struct cftype *cft, - struct cgroup_map_cb *cb) +static int read_priomap(struct seq_file *sf, void *v) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) - cb->fill(cb, dev->name, netprio_prio(css, dev)); + seq_printf(sf, "%s %u\n", dev->name, + netprio_prio(seq_css(sf), dev)); rcu_read_unlock(); return 0; } @@ -238,7 +238,7 @@ static struct cftype ss_files[] = { }, { .name = "ifpriomap", - .read_map = read_priomap, + .seq_show = read_priomap, .write_string = write_priomap, }, { } /* terminate */ diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 7c2a0a71049e..d3b6d2cd3a06 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -274,10 +274,9 @@ static void set_majmin(char *str, unsigned m) sprintf(str, "%u", m); } -static int devcgroup_seq_read(struct cgroup_subsys_state *css, - struct cftype *cft, struct seq_file *m) +static int devcgroup_seq_show(struct seq_file *m, void *v) { - struct dev_cgroup *devcgroup = css_to_devcgroup(css); + struct dev_cgroup *devcgroup = css_to_devcgroup(seq_css(m)); struct dev_exception_item *ex; char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN]; @@ -679,7 +678,7 @@ static struct cftype dev_cgroup_files[] = { }, { .name = "list", - .read_seq_string = devcgroup_seq_read, + .seq_show = devcgroup_seq_show, .private = DEVCG_LIST, }, { } /* terminate */ |