74 files changed, 2110 insertions, 2441 deletions
diff --git a/Documentation/devicetree/bindings/thermal/exynos-thermal.txt b/Documentation/devicetree/bindings/thermal/exynos-thermal.txt
index ae738f562acc..695150a4136b 100644
--- a/Documentation/devicetree/bindings/thermal/exynos-thermal.txt
+++ b/Documentation/devicetree/bindings/thermal/exynos-thermal.txt
@@ -12,6 +12,7 @@
 			"samsung,exynos5420-tmu-ext-triminfo" for TMU channels 2, 3 and 4
 			Exynos5420 (Must pass triminfo base and triminfo clock)
 			"samsung,exynos5440-tmu"
+			"samsung,exynos7-tmu"
 - interrupt-parent : The phandle for the interrupt controller
 - reg : Address range of the thermal registers. For soc's which has multiple
 	instances of TMU and some registers are shared across all TMU's like
@@ -32,13 +33,28 @@
 - clocks : The main clocks for TMU device
 	-- 1. operational clock for TMU channel
 	-- 2. optional clock to access the shared registers of TMU channel
+	-- 3. optional special clock for functional operation
 - clock-names : Thermal system clock name
 	-- "tmu_apbif" operational clock for current TMU channel
 	-- "tmu_triminfo_apbif" clock to access the shared triminfo register
 	   for current TMU channel
+	-- "tmu_sclk" clock for functional operation of the current TMU
+	   channel
 - vtmu-supply: This entry is optional and provides the regulator node supplying
 		voltage to TMU. If needed this entry can be placed inside
 		board/platform specific dts file.
+Following properties are mandatory (depending on SoC):
+- samsung,tmu_gain: Gain value for internal TMU operation.
+- samsung,tmu_reference_voltage: Value of TMU IP block's reference voltage
+- samsung,tmu_noise_cancel_mode: Mode for noise cancellation
+- samsung,tmu_efuse_value: Default level of temperature - it is needed when
+	in factory fusing produced wrong value
+- samsung,tmu_min_efuse_value: Minimum temperature fused value
+- samsung,tmu_max_efuse_value: Maximum temperature fused value
+- samsung,tmu_first_point_trim: First point trimming value
+- samsung,tmu_second_point_trim: Second point trimming value
+- samsung,tmu_default_temp_offset: Default temperature offset
+- samsung,tmu_cal_type: Callibration type

 Example 1):
@@ -51,6 +67,7 @@ Example 1):
 		clock-names = "tmu_apbif";
 		status = "disabled";
 		vtmu-supply = <&tmu_regulator_node>;
+		#include "exynos4412-tmu-sensor-conf.dtsi"
 	};

 Example 2):
@@ -61,6 +78,7 @@ Example 2):
 		interrupts = <0 58 0>;
 		clocks = <&clock 21>;
 		clock-names = "tmu_apbif";
+		#include "exynos5440-tmu-sensor-conf.dtsi"
 	};

 Example 3): (In case of Exynos5420 "with misplaced TRIMINFO register")
@@ -70,6 +88,7 @@ Example 3): (In case of Exynos5420 "with misplaced TRIMINFO register")
 		interrupts = <0 184 0>;
 		clocks = <&clock 318>, <&clock 318>;
 		clock-names = "tmu_apbif", "tmu_triminfo_apbif";
+		#include "exynos4412-tmu-sensor-conf.dtsi"
 	};

 	tmu_cpu3: tmu@1006c000 {
@@ -78,6 +97,7 @@ Example 3): (In case of Exynos5420 "with misplaced TRIMINFO register")
 		interrupts = <0 185 0>;
 		clocks = <&clock 318>, <&clock 319>;
 		clock-names = "tmu_apbif", "tmu_triminfo_apbif";
+		#include "exynos4412-tmu-sensor-conf.dtsi"
 	};

 	tmu_gpu: tmu@100a0000 {
@@ -86,6 +106,7 @@ Example 3): (In case of Exynos5420 "with misplaced TRIMINFO register")
 		interrupts = <0 215 0>;
 		clocks = <&clock 319>, <&clock 318>;
 		clock-names = "tmu_apbif", "tmu_triminfo_apbif";
+		#include "exynos4412-tmu-sensor-conf.dtsi"
 	};

 Note: For multi-instance tmu each instance should have an alias correctly
diff --git a/Documentation/devicetree/bindings/thermal/thermal.txt b/Documentation/devicetree/bindings/thermal/thermal.txt
index f5db6b72a36f..29fe0bfae38e 100644
--- a/Documentation/devicetree/bindings/thermal/thermal.txt
+++ b/Documentation/devicetree/bindings/thermal/thermal.txt
@@ -251,24 +251,24 @@ ocp {
 };

 thermal-zones {
-	cpu-thermal: cpu-thermal {
+	cpu_thermal: cpu-thermal {
 		polling-delay-passive = <250>; /* milliseconds */
 		polling-delay = <1000>; /* milliseconds */

 		thermal-sensors = <&bandgap0>;

 		trips {
-			cpu-alert0: cpu-alert {
+			cpu_alert0: cpu-alert0 {
 				temperature = <90000>; /* millicelsius */
 				hysteresis = <2000>; /* millicelsius */
 				type = "active";
 			};
-			cpu-alert1: cpu-alert {
+			cpu_alert1: cpu-alert1 {
 				temperature = <100000>; /* millicelsius */
 				hysteresis = <2000>; /* millicelsius */
 				type = "passive";
 			};
-			cpu-crit: cpu-crit {
+			cpu_crit: cpu-crit {
 				temperature = <125000>; /* millicelsius */
 				hysteresis = <2000>; /* millicelsius */
 				type = "critical";
@@ -277,17 +277,17 @@ thermal-zones {

 		cooling-maps {
 			map0 {
-				trip = <&cpu-alert0>;
-				cooling-device = <&fan0 THERMAL_NO_LIMITS 4>;
+				trip = <&cpu_alert0>;
+				cooling-device = <&fan0 THERMAL_NO_LIMIT 4>;
 			};
 			map1 {
-				trip = <&cpu-alert1>;
-				cooling-device = <&fan0 5 THERMAL_NO_LIMITS>;
+				trip = <&cpu_alert1>;
+				cooling-device = <&fan0 5 THERMAL_NO_LIMIT>;
 			};
 			map2 {
-				trip = <&cpu-alert1>;
+				trip = <&cpu_alert1>;
 				cooling-device =
-					<&cpu0 THERMAL_NO_LIMITS THERMAL_NO_LIMITS>;
+					<&cpu0 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
 			};
 		};
 	};
@@ -298,13 +298,13 @@ used to monitor the zone 'cpu-thermal' using its sole sensor. A fan
 device (fan0) is controlled via I2C bus 1, at address 0x48,
 and has ten different cooling states 0-9. It is used to remove the heat out of
 the thermal zone 'cpu-thermal' using its cooling states
-from its minimum to 4, when it reaches trip point 'cpu-alert0'
+from its minimum to 4, when it reaches trip point 'cpu_alert0'
 at 90C, as an example of active cooling. The same cooling device is used at
-'cpu-alert1', but from 5 to its maximum state. The cpu@0 device is also
+'cpu_alert1', but from 5 to its maximum state. The cpu@0 device is also
 linked to the same thermal zone, 'cpu-thermal', as a passive cooling device,
-using all its cooling states at trip point 'cpu-alert1',
+using all its cooling states at trip point 'cpu_alert1',
 which is a trip point at 100C. On the thermal zone 'cpu-thermal', at the
-temperature of 125C, represented by the trip point 'cpu-crit', the silicon
+temperature of 125C, represented by the trip point 'cpu_crit', the silicon
 is not reliable anymore.
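Before moving on to the multi-sensor example in (b): the zones above are pure device tree data, and a driver feeds them through of-thermal, whose registration path is touched later in this diff. The following is not part of the patch set, only a minimal sketch against the 3.19-era of-thermal API; every `my_sensor` identifier is hypothetical.

```c
/*
 * Minimal sketch (not from this patch set): a sensor driver backing a
 * thermal zone like the ones above via of-thermal.  All "my_sensor"
 * names are illustrative; only the API calls are real (3.19-era
 * signatures, where get_temp() reports millicelsius through a long *).
 */
#include <linux/err.h>
#include <linux/platform_device.h>
#include <linux/slab.h>
#include <linux/thermal.h>

struct my_sensor {
	struct thermal_zone_device *tzd;
};

/* Called by of-thermal whenever the zone polls this sensor. */
static int my_sensor_get_temp(void *data, long *temp)
{
	/* A real driver would read its hardware here. */
	*temp = 55000;	/* millicelsius, like the trip values above */
	return 0;
}

static const struct thermal_zone_of_device_ops my_sensor_ops = {
	.get_temp = my_sensor_get_temp,
};

static int my_sensor_probe(struct platform_device *pdev)
{
	struct my_sensor *s;

	s = devm_kzalloc(&pdev->dev, sizeof(*s), GFP_KERNEL);
	if (!s)
		return -ENOMEM;

	/* sensor_id 0 pairs with "thermal-sensors = <&my_sensor 0>" */
	s->tzd = thermal_zone_of_sensor_register(&pdev->dev, 0, s,
						 &my_sensor_ops);
	if (IS_ERR(s->tzd))
		return PTR_ERR(s->tzd);

	platform_set_drvdata(pdev, s);
	return 0;
}
```

The trips and cooling maps then come entirely from DT; the driver never enumerates them itself.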
 (b) - IC with several internal sensors

@@ -329,7 +329,7 @@ ocp {
 };

 thermal-zones {
-	cpu-thermal: cpu-thermal {
+	cpu_thermal: cpu-thermal {
 		polling-delay-passive = <250>; /* milliseconds */
 		polling-delay = <1000>; /* milliseconds */

@@ -338,12 +338,12 @@ thermal-zones {

 		trips {
 			/* each zone within the SoC may have its own trips */
-			cpu-alert: cpu-alert {
+			cpu_alert: cpu-alert {
 				temperature = <100000>; /* millicelsius */
 				hysteresis = <2000>; /* millicelsius */
 				type = "passive";
 			};
-			cpu-crit: cpu-crit {
+			cpu_crit: cpu-crit {
 				temperature = <125000>; /* millicelsius */
 				hysteresis = <2000>; /* millicelsius */
 				type = "critical";
@@ -356,7 +356,7 @@ thermal-zones {
 		};
 	};

-	gpu-thermal: gpu-thermal {
+	gpu_thermal: gpu-thermal {
 		polling-delay-passive = <120>; /* milliseconds */
 		polling-delay = <1000>; /* milliseconds */

@@ -365,12 +365,12 @@ thermal-zones {

 		trips {
 			/* each zone within the SoC may have its own trips */
-			gpu-alert: gpu-alert {
+			gpu_alert: gpu-alert {
 				temperature = <90000>; /* millicelsius */
 				hysteresis = <2000>; /* millicelsius */
 				type = "passive";
 			};
-			gpu-crit: gpu-crit {
+			gpu_crit: gpu-crit {
 				temperature = <105000>; /* millicelsius */
 				hysteresis = <2000>; /* millicelsius */
 				type = "critical";
@@ -383,7 +383,7 @@ thermal-zones {
 		};
 	};

-	dsp-thermal: dsp-thermal {
+	dsp_thermal: dsp-thermal {
 		polling-delay-passive = <50>; /* milliseconds */
 		polling-delay = <1000>; /* milliseconds */

@@ -392,12 +392,12 @@ thermal-zones {

 		trips {
 			/* each zone within the SoC may have its own trips */
-			dsp-alert: gpu-alert {
+			dsp_alert: dsp-alert {
 				temperature = <90000>; /* millicelsius */
 				hysteresis = <2000>; /* millicelsius */
 				type = "passive";
 			};
-			dsp-crit: gpu-crit {
+			dsp_crit: gpu-crit {
 				temperature = <135000>; /* millicelsius */
 				hysteresis = <2000>; /* millicelsius */
 				type = "critical";
@@ -457,7 +457,7 @@ ocp {
 };

 thermal-zones {
-	cpu-thermal: cpu-thermal {
+	cpu_thermal: cpu-thermal {
 		polling-delay-passive = <250>; /* milliseconds */
 		polling-delay = <1000>; /* milliseconds */

@@ -508,7 +508,7 @@ with many sensors and many cooling devices.
 	/*
 	 * An IC with several temperature sensor.
 	 */
-	adc-dummy: sensor@0x50 {
+	adc_dummy: sensor@0x50 {
 		...
 		#thermal-sensor-cells = <1>; /* sensor internal ID */
 	};
@@ -520,7 +520,7 @@ thermal-zones {
 		polling-delay = <2500>; /* milliseconds */

 		/* sensor ID */
-		thermal-sensors = <&adc-dummy 4>;
+		thermal-sensors = <&adc_dummy 4>;

 		trips {
 			...
@@ -531,14 +531,14 @@ thermal-zones {
 		};
 	};

-	board-thermal: board-thermal {
+	board_thermal: board-thermal {
 		polling-delay-passive = <1000>; /* milliseconds */
 		polling-delay = <2500>; /* milliseconds */

 		/* sensor ID */
-		thermal-sensors = <&adc-dummy 0>, /* pcb top edge */
-				  <&adc-dummy 1>, /* lcd */
-				  <&adc-dymmy 2>; /* back cover */
+		thermal-sensors = <&adc_dummy 0>, /* pcb top edge */
+				  <&adc_dummy 1>, /* lcd */
+				  <&adc_dummy 2>; /* back cover */
 		/*
 		 * An array of coefficients describing the sensor
 		 * linear relation. E.g.:
@@ -548,22 +548,22 @@ thermal-zones {

 		trips {
 			/* Trips are based on resulting linear equation */
-			cpu-trip: cpu-trip {
+			cpu_trip: cpu-trip {
 				temperature = <60000>; /* millicelsius */
 				hysteresis = <2000>; /* millicelsius */
 				type = "passive";
 			};
-			gpu-trip: gpu-trip {
+			gpu_trip: gpu-trip {
 				temperature = <55000>; /* millicelsius */
 				hysteresis = <2000>; /* millicelsius */
 				type = "passive";
 			}
-			lcd-trip: lcp-trip {
+			lcd_trip: lcp-trip {
 				temperature = <53000>; /* millicelsius */
 				hysteresis = <2000>; /* millicelsius */
 				type = "passive";
 			};
-			crit-trip: crit-trip {
+			crit_trip: crit-trip {
 				temperature = <68000>; /* millicelsius */
 				hysteresis = <2000>; /* millicelsius */
 				type = "critical";
@@ -572,17 +572,17 @@ thermal-zones {

 		cooling-maps {
 			map0 {
-				trip = <&cpu-trip>;
+				trip = <&cpu_trip>;
 				cooling-device = <&cpu0 0 2>;
 				contribution = <55>;
 			};
 			map1 {
-				trip = <&gpu-trip>;
+				trip = <&gpu_trip>;
 				cooling-device = <&gpu0 0 2>;
 				contribution = <20>;
 			};
 			map2 {
-				trip = <&lcd-trip>;
+				trip = <&lcd_trip>;
 				cooling-device = <&lcd0 5 10>;
 				contribution = <15>;
 			};
diff --git a/MAINTAINERS b/MAINTAINERS
index 1921ed58d1a0..7cfcee4e2bea 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2433,7 +2433,8 @@ F:	arch/powerpc/oprofile/*cell*
 F:	arch/powerpc/platforms/cell/

 CEPH DISTRIBUTED FILE SYSTEM CLIENT
-M:	Sage Weil <sage@inktank.com>
+M:	Yan, Zheng <zyan@redhat.com>
+M:	Sage Weil <sage@redhat.com>
 L:	ceph-devel@vger.kernel.org
 W:	http://ceph.com/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
@@ -7998,8 +7999,8 @@ S:	Supported
 F:	drivers/net/wireless/ath/wcn36xx/

 RADOS BLOCK DEVICE (RBD)
-M:	Yehuda Sadeh <yehuda@inktank.com>
-M:	Sage Weil <sage@inktank.com>
+M:	Ilya Dryomov <idryomov@gmail.com>
+M:	Sage Weil <sage@redhat.com>
 M:	Alex Elder <elder@kernel.org>
 M:	ceph-devel@vger.kernel.org
 W:	http://ceph.com/
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 67fc3d2b0aab..a0c35bf6cb92 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -476,12 +476,14 @@ static inline int pmd_present(pmd_t pmd)
  */
 static inline int pte_protnone(pte_t pte)
 {
-	return pte_flags(pte) & _PAGE_PROTNONE;
+	return (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT))
+		== _PAGE_PROTNONE;
 }

 static inline int pmd_protnone(pmd_t pmd)
 {
-	return pmd_flags(pmd) & _PAGE_PROTNONE;
+	return (pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT))
+		== _PAGE_PROTNONE;
 }
 #endif /* CONFIG_NUMA_BALANCING */
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 9273d0969ebd..5b9c6d5c3636 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1292,6 +1292,9 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
 	struct blkg_rwstat rwstat = { }, tmp;
 	int i, cpu;

+	if (tg->stats_cpu == NULL)
+		return 0;
+
 	for_each_possible_cpu(cpu) {
 		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 8a86b62466f7..b40af3203089 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -38,6 +38,7 @@
 #include <linux/kernel.h>
 #include <linux/device.h>
 #include <linux/module.h>
+#include <linux/blk-mq.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
@@ -340,9 +341,7 @@ struct rbd_device {
 	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

-	struct list_head	rq_queue;	/* incoming rq queue */
 	spinlock_t		lock;		/* queue, flags, open_count */
-	struct work_struct	rq_work;

 	struct rbd_image_header	header;
 	unsigned long		flags;		/* possibly lock protected */
@@ -360,6 +359,9 @@ struct rbd_device {
 	atomic_t		parent_ref;
 	struct rbd_device	*parent;

+	/* Block layer tags. */
+	struct blk_mq_tag_set	tag_set;
+
 	/* protects updating the header */
 	struct rw_semaphore     header_rwsem;
@@ -1817,7 +1819,8 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,

 	/*
 	 * We support a 64-bit length, but ultimately it has to be
-	 * passed to blk_end_request(), which takes an unsigned int.
+	 * passed to the block layer, which just supports a 32-bit
+	 * length field.
 	 */
 	obj_request->xferred = osd_req->r_reply_op_len[0];
 	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
@@ -2275,7 +2278,10 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
 		more = obj_request->which < img_request->obj_request_count - 1;
 	} else {
 		rbd_assert(img_request->rq != NULL);
-		more = blk_end_request(img_request->rq, result, xferred);
+
+		more = blk_update_request(img_request->rq, result, xferred);
+		if (!more)
+			__blk_mq_end_request(img_request->rq, result);
 	}

 	return more;
@@ -3304,8 +3310,10 @@ out:
 	return ret;
 }

-static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
+static void rbd_queue_workfn(struct work_struct *work)
 {
+	struct request *rq = blk_mq_rq_from_pdu(work);
+	struct rbd_device *rbd_dev = rq->q->queuedata;
 	struct rbd_img_request *img_request;
 	struct ceph_snap_context *snapc = NULL;
 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
@@ -3314,6 +3322,13 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
 	u64 mapping_size;
 	int result;

+	if (rq->cmd_type != REQ_TYPE_FS) {
+		dout("%s: non-fs request type %d\n", __func__,
+			(int) rq->cmd_type);
+		result = -EIO;
+		goto err;
+	}
+
 	if (rq->cmd_flags & REQ_DISCARD)
 		op_type = OBJ_OP_DISCARD;
 	else if (rq->cmd_flags & REQ_WRITE)
@@ -3359,6 +3374,8 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
 		goto err_rq;	/* Shouldn't happen */
 	}

+	blk_mq_start_request(rq);
+
 	down_read(&rbd_dev->header_rwsem);
 	mapping_size = rbd_dev->mapping.size;
 	if (op_type != OBJ_OP_READ) {
@@ -3404,53 +3421,18 @@ err_rq:
 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
 			 obj_op_name(op_type), length, offset, result);
 	ceph_put_snap_context(snapc);
-	blk_end_request_all(rq, result);
+err:
+	blk_mq_end_request(rq, result);
 }

-static void rbd_request_workfn(struct work_struct *work)
+static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
+		const struct blk_mq_queue_data *bd)
 {
-	struct rbd_device *rbd_dev =
-	    container_of(work, struct rbd_device, rq_work);
-	struct request *rq, *next;
-	LIST_HEAD(requests);
-
-	spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */
-	list_splice_init(&rbd_dev->rq_queue, &requests);
-	spin_unlock_irq(&rbd_dev->lock);
+	struct request *rq = bd->rq;
+	struct work_struct *work = blk_mq_rq_to_pdu(rq);

-	list_for_each_entry_safe(rq, next, &requests, queuelist) {
-		list_del_init(&rq->queuelist);
-		rbd_handle_request(rbd_dev, rq);
-	}
-}
-
-/*
- * Called with q->queue_lock held and interrupts disabled, possibly on
- * the way to schedule().  Do not sleep here!
- */
-static void rbd_request_fn(struct request_queue *q)
-{
-	struct rbd_device *rbd_dev = q->queuedata;
-	struct request *rq;
-	int queued = 0;
-
-	rbd_assert(rbd_dev);
-
-	while ((rq = blk_fetch_request(q))) {
-		/* Ignore any non-FS requests that filter through. */
-		if (rq->cmd_type != REQ_TYPE_FS) {
-			dout("%s: non-fs request type %d\n", __func__,
-				(int) rq->cmd_type);
-			__blk_end_request_all(rq, 0);
-			continue;
-		}
-
-		list_add_tail(&rq->queuelist, &rbd_dev->rq_queue);
-		queued++;
-	}
-
-	if (queued)
-		queue_work(rbd_wq, &rbd_dev->rq_work);
+	queue_work(rbd_wq, work);
+	return BLK_MQ_RQ_QUEUE_OK;
 }

 /*
@@ -3511,6 +3493,7 @@ static void rbd_free_disk(struct rbd_device *rbd_dev)
 		del_gendisk(disk);
 		if (disk->queue)
 			blk_cleanup_queue(disk->queue);
+		blk_mq_free_tag_set(&rbd_dev->tag_set);
 	}
 	put_disk(disk);
 }
@@ -3694,7 +3677,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)

 	ret = rbd_dev_header_info(rbd_dev);
 	if (ret)
-		return ret;
+		goto out;

 	/*
 	 * If there is a parent, see if it has disappeared due to the
@@ -3703,30 +3686,46 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
 	if (rbd_dev->parent) {
 		ret = rbd_dev_v2_parent_info(rbd_dev);
 		if (ret)
-			return ret;
+			goto out;
 	}

 	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
-		if (rbd_dev->mapping.size != rbd_dev->header.image_size)
-			rbd_dev->mapping.size = rbd_dev->header.image_size;
+		rbd_dev->mapping.size = rbd_dev->header.image_size;
 	} else {
 		/* validate mapped snapshot's EXISTS flag */
 		rbd_exists_validate(rbd_dev);
 	}

+out:
 	up_write(&rbd_dev->header_rwsem);
-
-	if (mapping_size != rbd_dev->mapping.size)
+	if (!ret && mapping_size != rbd_dev->mapping.size)
 		rbd_dev_update_size(rbd_dev);

+	return ret;
+}
+
+static int rbd_init_request(void *data, struct request *rq,
+		unsigned int hctx_idx, unsigned int request_idx,
+		unsigned int numa_node)
+{
+	struct work_struct *work = blk_mq_rq_to_pdu(rq);
+
+	INIT_WORK(work, rbd_queue_workfn);
 	return 0;
 }

+static struct blk_mq_ops rbd_mq_ops = {
+	.queue_rq	= rbd_queue_rq,
+	.map_queue	= blk_mq_map_queue,
+	.init_request	= rbd_init_request,
+};
+
 static int rbd_init_disk(struct rbd_device *rbd_dev)
 {
 	struct gendisk *disk;
 	struct request_queue *q;
 	u64 segment_size;
+	int err;

 	/* create gendisk info */
 	disk = alloc_disk(single_major ?
@@ -3744,10 +3743,25 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 	disk->fops = &rbd_bd_ops;
 	disk->private_data = rbd_dev;

-	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
-	if (!q)
+	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
+	rbd_dev->tag_set.ops = &rbd_mq_ops;
+	rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ;
+	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
+	rbd_dev->tag_set.flags =
+		BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	rbd_dev->tag_set.nr_hw_queues = 1;
+	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
+
+	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
+	if (err)
 		goto out_disk;

+	q = blk_mq_init_queue(&rbd_dev->tag_set);
+	if (IS_ERR(q)) {
+		err = PTR_ERR(q);
+		goto out_tag_set;
+	}
+
 	/* We use the default size, but let's be explicit about it. */
 	blk_queue_physical_block_size(q, SECTOR_SIZE);
@@ -3773,10 +3787,11 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)

 	rbd_dev->disk = disk;

 	return 0;
+out_tag_set:
+	blk_mq_free_tag_set(&rbd_dev->tag_set);
 out_disk:
 	put_disk(disk);
-
-	return -ENOMEM;
+	return err;
 }

 /*
@@ -4033,8 +4048,6 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 		return NULL;

 	spin_lock_init(&rbd_dev->lock);
-	INIT_LIST_HEAD(&rbd_dev->rq_queue);
-	INIT_WORK(&rbd_dev->rq_work, rbd_request_workfn);
 	rbd_dev->flags = 0;
 	atomic_set(&rbd_dev->parent_ref, 0);
 	INIT_LIST_HEAD(&rbd_dev->node);
@@ -4274,32 +4287,22 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 	}

 	/*
-	 * We always update the parent overlap.  If it's zero we
-	 * treat it specially.
+	 * We always update the parent overlap.  If it's zero we issue
+	 * a warning, as we will proceed as if there was no parent.
 	 */
-	rbd_dev->parent_overlap = overlap;
 	if (!overlap) {
-
-		/* A null parent_spec indicates it's the initial probe */
-
 		if (parent_spec) {
-			/*
-			 * The overlap has become zero, so the clone
-			 * must have been resized down to 0 at some
-			 * point.  Treat this the same as a flatten.
-			 */
-			rbd_dev_parent_put(rbd_dev);
-			pr_info("%s: clone image now standalone\n",
-				rbd_dev->disk->disk_name);
+			/* refresh, careful to warn just once */
+			if (rbd_dev->parent_overlap)
+				rbd_warn(rbd_dev,
+				    "clone now standalone (overlap became 0)");
 		} else {
-			/*
-			 * For the initial probe, if we find the
-			 * overlap is zero we just pretend there was
-			 * no parent image.
-			 */
-			rbd_warn(rbd_dev, "ignoring parent with overlap 0");
+			/* initial probe */
+			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
 		}
 	}
+	rbd_dev->parent_overlap = overlap;
+
 out:
 	ret = 0;
 out_err:
@@ -4771,36 +4774,6 @@ static inline size_t next_token(const char **buf)
 }

 /*
- * Finds the next token in *buf, and if the provided token buffer is
- * big enough, copies the found token into it.  The result, if
- * copied, is guaranteed to be terminated with '\0'.  Note that *buf
- * must be terminated with '\0' on entry.
- *
- * Returns the length of the token found (not including the '\0').
- * Return value will be 0 if no token is found, and it will be >=
- * token_size if the token would not fit.
- *
- * The *buf pointer will be updated to point beyond the end of the
- * found token.  Note that this occurs even if the token buffer is
- * too small to hold it.
- */
-static inline size_t copy_token(const char **buf,
-				char *token,
-				size_t token_size)
-{
-	size_t len;
-
-	len = next_token(buf);
-	if (len < token_size) {
-		memcpy(token, *buf, len);
-		*(token + len) = '\0';
-	}
-	*buf += len;
-
-	return len;
-}
-
-/*
  * Finds the next token in *buf, dynamically allocates a buffer big
  * enough to hold a copy of it, and copies the token into the new
  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index 0f9a2c3c0e0d..1b06fc4640e2 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -26,13 +26,21 @@ config ARM_VEXPRESS_SPC_CPUFREQ

 config ARM_EXYNOS_CPUFREQ
-	bool
+	tristate "SAMSUNG EXYNOS CPUfreq Driver"
+	depends on CPU_EXYNOS4210 || SOC_EXYNOS4212 || SOC_EXYNOS4412 || SOC_EXYNOS5250
+	depends on THERMAL
+	help
+	  This adds the CPUFreq driver for Samsung EXYNOS platforms.
+	  Supported SoC versions are:
+	     Exynos4210, Exynos4212, Exynos4412, and Exynos5250.
+
+	  If in doubt, say N.
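Stepping out of the diff for a moment: the rbd.c hunks above are a request_fn-to-blk-mq conversion, and its shape is easier to see in isolation. The sketch below is hypothetical (all `my_*` names invented), assuming the 3.19-era blk-mq API the patch itself uses; `queue_rq()` merely defers to a work item embedded in the per-request PDU so the real I/O path is free to sleep.

```c
/*
 * Hedged sketch (not from this diff) of the blk-mq pattern rbd adopts
 * above: the per-request PDU holds a work_struct, queue_rq() only
 * queues it, and the work function does the (possibly sleeping) I/O.
 */
#include <linux/blk-mq.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;	/* created at module init */

static void my_queue_workfn(struct work_struct *work)
{
	/* Recover the request that owns this PDU-embedded work item. */
	struct request *rq = blk_mq_rq_from_pdu(work);

	blk_mq_start_request(rq);
	/* ... issue the request here; sleeping is allowed ... */
	blk_mq_end_request(rq, 0);
}

static int my_queue_rq(struct blk_mq_hw_ctx *hctx,
		       const struct blk_mq_queue_data *bd)
{
	struct work_struct *work = blk_mq_rq_to_pdu(bd->rq);

	queue_work(my_wq, work);
	return BLK_MQ_RQ_QUEUE_OK;
}

static int my_init_request(void *data, struct request *rq,
			   unsigned int hctx_idx, unsigned int request_idx,
			   unsigned int numa_node)
{
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	/* Called once per preallocated request when the tag set is made. */
	INIT_WORK(work, my_queue_workfn);
	return 0;
}

static struct blk_mq_ops my_mq_ops = {
	.queue_rq	= my_queue_rq,
	.map_queue	= blk_mq_map_queue,	/* mandatory in this era */
	.init_request	= my_init_request,
};
```

The tag set is then filled exactly as rbd_init_disk() does above; setting `cmd_size = sizeof(struct work_struct)` is what makes blk-mq allocate the work item alongside each request.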
config ARM_EXYNOS4210_CPUFREQ bool "SAMSUNG EXYNOS4210" depends on CPU_EXYNOS4210 + depends on ARM_EXYNOS_CPUFREQ default y - select ARM_EXYNOS_CPUFREQ help This adds the CPUFreq driver for Samsung EXYNOS4210 SoC (S5PV310 or S5PC210). @@ -42,8 +50,8 @@ config ARM_EXYNOS4210_CPUFREQ config ARM_EXYNOS4X12_CPUFREQ bool "SAMSUNG EXYNOS4x12" depends on SOC_EXYNOS4212 || SOC_EXYNOS4412 + depends on ARM_EXYNOS_CPUFREQ default y - select ARM_EXYNOS_CPUFREQ help This adds the CPUFreq driver for Samsung EXYNOS4X12 SoC (EXYNOS4212 or EXYNOS4412). @@ -53,28 +61,14 @@ config ARM_EXYNOS4X12_CPUFREQ config ARM_EXYNOS5250_CPUFREQ bool "SAMSUNG EXYNOS5250" depends on SOC_EXYNOS5250 + depends on ARM_EXYNOS_CPUFREQ default y - select ARM_EXYNOS_CPUFREQ help This adds the CPUFreq driver for Samsung EXYNOS5250 SoC. If in doubt, say N. -config ARM_EXYNOS5440_CPUFREQ - bool "SAMSUNG EXYNOS5440" - depends on SOC_EXYNOS5440 - depends on HAVE_CLK && OF - select PM_OPP - default y - help - This adds the CPUFreq driver for Samsung EXYNOS5440 - SoC. The nature of exynos5440 clock controller is - different than previous exynos controllers so not using - the common exynos framework. - - If in doubt, say N. - config ARM_EXYNOS_CPU_FREQ_BOOST_SW bool "EXYNOS Frequency Overclocking - Software" depends on ARM_EXYNOS_CPUFREQ && THERMAL @@ -90,6 +84,20 @@ config ARM_EXYNOS_CPU_FREQ_BOOST_SW If in doubt, say N. +config ARM_EXYNOS5440_CPUFREQ + tristate "SAMSUNG EXYNOS5440" + depends on SOC_EXYNOS5440 + depends on HAVE_CLK && OF + select PM_OPP + default y + help + This adds the CPUFreq driver for Samsung EXYNOS5440 + SoC. The nature of exynos5440 clock controller is + different than previous exynos controllers so not using + the common exynos framework. + + If in doubt, say N. + config ARM_HIGHBANK_CPUFREQ tristate "Calxeda Highbank-based" depends on ARCH_HIGHBANK && CPUFREQ_DT && REGULATOR diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index 8b4220ac888b..82a1821471fd 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -52,10 +52,11 @@ obj-$(CONFIG_ARM_DT_BL_CPUFREQ) += arm_big_little_dt.o obj-$(CONFIG_ARCH_DAVINCI) += davinci-cpufreq.o obj-$(CONFIG_UX500_SOC_DB8500) += dbx500-cpufreq.o -obj-$(CONFIG_ARM_EXYNOS_CPUFREQ) += exynos-cpufreq.o -obj-$(CONFIG_ARM_EXYNOS4210_CPUFREQ) += exynos4210-cpufreq.o -obj-$(CONFIG_ARM_EXYNOS4X12_CPUFREQ) += exynos4x12-cpufreq.o -obj-$(CONFIG_ARM_EXYNOS5250_CPUFREQ) += exynos5250-cpufreq.o +obj-$(CONFIG_ARM_EXYNOS_CPUFREQ) += arm-exynos-cpufreq.o +arm-exynos-cpufreq-y := exynos-cpufreq.o +arm-exynos-cpufreq-$(CONFIG_ARM_EXYNOS4210_CPUFREQ) += exynos4210-cpufreq.o +arm-exynos-cpufreq-$(CONFIG_ARM_EXYNOS4X12_CPUFREQ) += exynos4x12-cpufreq.o +arm-exynos-cpufreq-$(CONFIG_ARM_EXYNOS5250_CPUFREQ) += exynos5250-cpufreq.o obj-$(CONFIG_ARM_EXYNOS5440_CPUFREQ) += exynos5440-cpufreq.o obj-$(CONFIG_ARM_HIGHBANK_CPUFREQ) += highbank-cpufreq.o obj-$(CONFIG_ARM_IMX6Q_CPUFREQ) += imx6q-cpufreq.o diff --git a/drivers/cpufreq/exynos-cpufreq.c b/drivers/cpufreq/exynos-cpufreq.c index f99a0b0b7c06..5e98c6b1f284 100644 --- a/drivers/cpufreq/exynos-cpufreq.c +++ b/drivers/cpufreq/exynos-cpufreq.c @@ -18,10 +18,13 @@ #include <linux/cpufreq.h> #include <linux/platform_device.h> #include <linux/of.h> +#include <linux/cpu_cooling.h> +#include <linux/cpu.h> #include "exynos-cpufreq.h" static struct exynos_dvfs_info *exynos_info; +static struct thermal_cooling_device *cdev; static struct regulator *arm_regulator; static unsigned int locking_frequency; @@ -156,6 +159,7 @@ static 
struct cpufreq_driver exynos_driver = { static int exynos_cpufreq_probe(struct platform_device *pdev) { + struct device_node *cpus, *np; int ret = -EINVAL; exynos_info = kzalloc(sizeof(*exynos_info), GFP_KERNEL); @@ -198,9 +202,36 @@ static int exynos_cpufreq_probe(struct platform_device *pdev) /* Done here as we want to capture boot frequency */ locking_frequency = clk_get_rate(exynos_info->cpu_clk) / 1000; - if (!cpufreq_register_driver(&exynos_driver)) + ret = cpufreq_register_driver(&exynos_driver); + if (ret) + goto err_cpufreq_reg; + + cpus = of_find_node_by_path("/cpus"); + if (!cpus) { + pr_err("failed to find cpus node\n"); + return 0; + } + + np = of_get_next_child(cpus, NULL); + if (!np) { + pr_err("failed to find cpus child node\n"); + of_node_put(cpus); return 0; + } + + if (of_find_property(np, "#cooling-cells", NULL)) { + cdev = of_cpufreq_cooling_register(np, + cpu_present_mask); + if (IS_ERR(cdev)) + pr_err("running cpufreq without cooling device: %ld\n", + PTR_ERR(cdev)); + } + of_node_put(np); + of_node_put(cpus); + + return 0; +err_cpufreq_reg: dev_err(&pdev->dev, "failed to register cpufreq driver\n"); regulator_put(arm_regulator); err_vdd_arm: diff --git a/drivers/thermal/of-thermal.c b/drivers/thermal/of-thermal.c index d717f3dab6f1..668fb1bdea9e 100644 --- a/drivers/thermal/of-thermal.c +++ b/drivers/thermal/of-thermal.c @@ -497,6 +497,9 @@ thermal_zone_of_sensor_register(struct device *dev, int sensor_id, void *data, if (sensor_specs.np == sensor_np && id == sensor_id) { tzd = thermal_zone_of_add_sensor(child, sensor_np, data, ops); + if (!IS_ERR(tzd)) + tzd->ops->set_mode(tzd, THERMAL_DEVICE_ENABLED); + of_node_put(sensor_specs.np); of_node_put(child); goto exit; diff --git a/drivers/thermal/rockchip_thermal.c b/drivers/thermal/rockchip_thermal.c index 9c6ce548e363..3aa46ac7cdbc 100644 --- a/drivers/thermal/rockchip_thermal.c +++ b/drivers/thermal/rockchip_thermal.c @@ -193,19 +193,20 @@ static u32 rk_tsadcv2_temp_to_code(long temp) static long rk_tsadcv2_code_to_temp(u32 code) { - int high, low, mid; - - low = 0; - high = ARRAY_SIZE(v2_code_table) - 1; - mid = (high + low) / 2; - - if (code > v2_code_table[low].code || code < v2_code_table[high].code) - return 125000; /* No code available, return max temperature */ + unsigned int low = 0; + unsigned int high = ARRAY_SIZE(v2_code_table) - 1; + unsigned int mid = (low + high) / 2; + unsigned int num; + unsigned long denom; + + /* Invalid code, return -EAGAIN */ + if (code > TSADCV2_DATA_MASK) + return -EAGAIN; - while (low <= high) { - if (code >= v2_code_table[mid].code && code < - v2_code_table[mid - 1].code) - return v2_code_table[mid].temp; + while (low <= high && mid) { + if (code >= v2_code_table[mid].code && + code < v2_code_table[mid - 1].code) + break; else if (code < v2_code_table[mid].code) low = mid + 1; else @@ -213,7 +214,16 @@ static long rk_tsadcv2_code_to_temp(u32 code) mid = (low + high) / 2; } - return 125000; + /* + * The 5C granularity provided by the table is too much. Let's + * assume that the relationship between sensor readings and + * temperature between 2 table entries is linear and interpolate + * to produce less granular result. 
+ */ + num = v2_code_table[mid].temp - v2_code_table[mid - 1].temp; + num *= v2_code_table[mid - 1].code - code; + denom = v2_code_table[mid - 1].code - v2_code_table[mid].code; + return v2_code_table[mid - 1].temp + (num / denom); } /** diff --git a/drivers/thermal/samsung/Kconfig b/drivers/thermal/samsung/Kconfig index c43306ecc0ab..c8e35c1a43dc 100644 --- a/drivers/thermal/samsung/Kconfig +++ b/drivers/thermal/samsung/Kconfig @@ -7,12 +7,3 @@ config EXYNOS_THERMAL the TMU, reports temperature and handles cooling action if defined. This driver uses the Exynos core thermal APIs and TMU configuration data from the supported SoCs. - -config EXYNOS_THERMAL_CORE - bool "Core thermal framework support for EXYNOS SOCs" - depends on EXYNOS_THERMAL - help - If you say yes here you get support for EXYNOS TMU - (Thermal Management Unit) common registration/unregistration - functions to the core thermal layer and also to use the generic - CPU cooling APIs. diff --git a/drivers/thermal/samsung/Makefile b/drivers/thermal/samsung/Makefile index c09d83095dc2..1e47d0d89ce0 100644 --- a/drivers/thermal/samsung/Makefile +++ b/drivers/thermal/samsung/Makefile @@ -3,5 +3,3 @@ # obj-$(CONFIG_EXYNOS_THERMAL) += exynos_thermal.o exynos_thermal-y := exynos_tmu.o -exynos_thermal-y += exynos_tmu_data.o -exynos_thermal-$(CONFIG_EXYNOS_THERMAL_CORE) += exynos_thermal_common.o diff --git a/drivers/thermal/samsung/exynos_thermal_common.c b/drivers/thermal/samsung/exynos_thermal_common.c deleted file mode 100644 index 6dc3815cc73f..000000000000 --- a/drivers/thermal/samsung/exynos_thermal_common.c +++ /dev/null @@ -1,427 +0,0 @@ -/* - * exynos_thermal_common.c - Samsung EXYNOS common thermal file - * - * Copyright (C) 2013 Samsung Electronics - * Amit Daniel Kachhap <amit.daniel@samsung.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - */ - -#include <linux/cpu_cooling.h> -#include <linux/err.h> -#include <linux/slab.h> -#include <linux/thermal.h> - -#include "exynos_thermal_common.h" - -struct exynos_thermal_zone { - enum thermal_device_mode mode; - struct thermal_zone_device *therm_dev; - struct thermal_cooling_device *cool_dev[MAX_COOLING_DEVICE]; - unsigned int cool_dev_size; - struct platform_device *exynos4_dev; - struct thermal_sensor_conf *sensor_conf; - bool bind; -}; - -/* Get mode callback functions for thermal zone */ -static int exynos_get_mode(struct thermal_zone_device *thermal, - enum thermal_device_mode *mode) -{ - struct exynos_thermal_zone *th_zone = thermal->devdata; - if (th_zone) - *mode = th_zone->mode; - return 0; -} - -/* Set mode callback functions for thermal zone */ -static int exynos_set_mode(struct thermal_zone_device *thermal, - enum thermal_device_mode mode) -{ - struct exynos_thermal_zone *th_zone = thermal->devdata; - if (!th_zone) { - dev_err(&thermal->device, - "thermal zone not registered\n"); - return 0; - } - - mutex_lock(&thermal->lock); - - if (mode == THERMAL_DEVICE_ENABLED && - !th_zone->sensor_conf->trip_data.trigger_falling) - thermal->polling_delay = IDLE_INTERVAL; - else - thermal->polling_delay = 0; - - mutex_unlock(&thermal->lock); - - th_zone->mode = mode; - thermal_zone_device_update(thermal); - dev_dbg(th_zone->sensor_conf->dev, - "thermal polling set for duration=%d msec\n", - thermal->polling_delay); - return 0; -} - - -/* Get trip type callback functions for thermal zone */ -static int exynos_get_trip_type(struct thermal_zone_device *thermal, int trip, - enum thermal_trip_type *type) -{ - struct exynos_thermal_zone *th_zone = thermal->devdata; - int max_trip = th_zone->sensor_conf->trip_data.trip_count; - int trip_type; - - if (trip < 0 || trip >= max_trip) - return -EINVAL; - - trip_type = th_zone->sensor_conf->trip_data.trip_type[trip]; - - if (trip_type == SW_TRIP) - *type = THERMAL_TRIP_CRITICAL; - else if (trip_type == THROTTLE_ACTIVE) - *type = THERMAL_TRIP_ACTIVE; - else if (trip_type == THROTTLE_PASSIVE) - *type = THERMAL_TRIP_PASSIVE; - else - return -EINVAL; - - return 0; -} - -/* Get trip temperature callback functions for thermal zone */ -static int exynos_get_trip_temp(struct thermal_zone_device *thermal, int trip, - unsigned long *temp) -{ - struct exynos_thermal_zone *th_zone = thermal->devdata; - int max_trip = th_zone->sensor_conf->trip_data.trip_count; - - if (trip < 0 || trip >= max_trip) - return -EINVAL; - - *temp = th_zone->sensor_conf->trip_data.trip_val[trip]; - /* convert the temperature into millicelsius */ - *temp = *temp * MCELSIUS; - - return 0; -} - -/* Get critical temperature callback functions for thermal zone */ -static int exynos_get_crit_temp(struct thermal_zone_device *thermal, - unsigned long *temp) -{ - struct exynos_thermal_zone *th_zone = thermal->devdata; - int max_trip = th_zone->sensor_conf->trip_data.trip_count; - /* Get the temp of highest trip*/ - return exynos_get_trip_temp(thermal, max_trip - 1, temp); -} - -/* Bind callback functions for thermal zone */ -static int exynos_bind(struct thermal_zone_device *thermal, - struct thermal_cooling_device *cdev) -{ - int ret = 0, i, tab_size, level; - struct freq_clip_table *tab_ptr, *clip_data; - struct exynos_thermal_zone *th_zone = thermal->devdata; - struct 
thermal_sensor_conf *data = th_zone->sensor_conf; - - tab_ptr = (struct freq_clip_table *)data->cooling_data.freq_data; - tab_size = data->cooling_data.freq_clip_count; - - if (tab_ptr == NULL || tab_size == 0) - return 0; - - /* find the cooling device registered*/ - for (i = 0; i < th_zone->cool_dev_size; i++) - if (cdev == th_zone->cool_dev[i]) - break; - - /* No matching cooling device */ - if (i == th_zone->cool_dev_size) - return 0; - - /* Bind the thermal zone to the cpufreq cooling device */ - for (i = 0; i < tab_size; i++) { - clip_data = (struct freq_clip_table *)&(tab_ptr[i]); - level = cpufreq_cooling_get_level(0, clip_data->freq_clip_max); - if (level == THERMAL_CSTATE_INVALID) - return 0; - switch (GET_ZONE(i)) { - case MONITOR_ZONE: - case WARN_ZONE: - if (thermal_zone_bind_cooling_device(thermal, i, cdev, - level, 0)) { - dev_err(data->dev, - "error unbinding cdev inst=%d\n", i); - ret = -EINVAL; - } - th_zone->bind = true; - break; - default: - ret = -EINVAL; - } - } - - return ret; -} - -/* Unbind callback functions for thermal zone */ -static int exynos_unbind(struct thermal_zone_device *thermal, - struct thermal_cooling_device *cdev) -{ - int ret = 0, i, tab_size; - struct exynos_thermal_zone *th_zone = thermal->devdata; - struct thermal_sensor_conf *data = th_zone->sensor_conf; - - if (th_zone->bind == false) - return 0; - - tab_size = data->cooling_data.freq_clip_count; - - if (tab_size == 0) - return 0; - - /* find the cooling device registered*/ - for (i = 0; i < th_zone->cool_dev_size; i++) - if (cdev == th_zone->cool_dev[i]) - break; - - /* No matching cooling device */ - if (i == th_zone->cool_dev_size) - return 0; - - /* Bind the thermal zone to the cpufreq cooling device */ - for (i = 0; i < tab_size; i++) { - switch (GET_ZONE(i)) { - case MONITOR_ZONE: - case WARN_ZONE: - if (thermal_zone_unbind_cooling_device(thermal, i, - cdev)) { - dev_err(data->dev, - "error unbinding cdev inst=%d\n", i); - ret = -EINVAL; - } - th_zone->bind = false; - break; - default: - ret = -EINVAL; - } - } - return ret; -} - -/* Get temperature callback functions for thermal zone */ -static int exynos_get_temp(struct thermal_zone_device *thermal, - unsigned long *temp) -{ - struct exynos_thermal_zone *th_zone = thermal->devdata; - void *data; - - if (!th_zone->sensor_conf) { - dev_err(&thermal->device, - "Temperature sensor not initialised\n"); - return -EINVAL; - } - data = th_zone->sensor_conf->driver_data; - *temp = th_zone->sensor_conf->read_temperature(data); - /* convert the temperature into millicelsius */ - *temp = *temp * MCELSIUS; - return 0; -} - -/* Get temperature callback functions for thermal zone */ -static int exynos_set_emul_temp(struct thermal_zone_device *thermal, - unsigned long temp) -{ - void *data; - int ret = -EINVAL; - struct exynos_thermal_zone *th_zone = thermal->devdata; - - if (!th_zone->sensor_conf) { - dev_err(&thermal->device, - "Temperature sensor not initialised\n"); - return -EINVAL; - } - data = th_zone->sensor_conf->driver_data; - if (th_zone->sensor_conf->write_emul_temp) - ret = th_zone->sensor_conf->write_emul_temp(data, temp); - return ret; -} - -/* Get the temperature trend */ -static int exynos_get_trend(struct thermal_zone_device *thermal, - int trip, enum thermal_trend *trend) -{ - int ret; - unsigned long trip_temp; - - ret = exynos_get_trip_temp(thermal, trip, &trip_temp); - if (ret < 0) - return ret; - - if (thermal->temperature >= trip_temp) - *trend = THERMAL_TREND_RAISE_FULL; - else - *trend = THERMAL_TREND_DROP_FULL; - - return 0; 
-} -/* Operation callback functions for thermal zone */ -static struct thermal_zone_device_ops exynos_dev_ops = { - .bind = exynos_bind, - .unbind = exynos_unbind, - .get_temp = exynos_get_temp, - .set_emul_temp = exynos_set_emul_temp, - .get_trend = exynos_get_trend, - .get_mode = exynos_get_mode, - .set_mode = exynos_set_mode, - .get_trip_type = exynos_get_trip_type, - .get_trip_temp = exynos_get_trip_temp, - .get_crit_temp = exynos_get_crit_temp, -}; - -/* - * This function may be called from interrupt based temperature sensor - * when threshold is changed. - */ -void exynos_report_trigger(struct thermal_sensor_conf *conf) -{ - unsigned int i; - char data[10]; - char *envp[] = { data, NULL }; - struct exynos_thermal_zone *th_zone; - - if (!conf || !conf->pzone_data) { - pr_err("Invalid temperature sensor configuration data\n"); - return; - } - - th_zone = conf->pzone_data; - - if (th_zone->bind == false) { - for (i = 0; i < th_zone->cool_dev_size; i++) { - if (!th_zone->cool_dev[i]) - continue; - exynos_bind(th_zone->therm_dev, - th_zone->cool_dev[i]); - } - } - - thermal_zone_device_update(th_zone->therm_dev); - - mutex_lock(&th_zone->therm_dev->lock); - /* Find the level for which trip happened */ - for (i = 0; i < th_zone->sensor_conf->trip_data.trip_count; i++) { - if (th_zone->therm_dev->last_temperature < - th_zone->sensor_conf->trip_data.trip_val[i] * MCELSIUS) - break; - } - - if (th_zone->mode == THERMAL_DEVICE_ENABLED && - !th_zone->sensor_conf->trip_data.trigger_falling) { - if (i > 0) - th_zone->therm_dev->polling_delay = ACTIVE_INTERVAL; - else - th_zone->therm_dev->polling_delay = IDLE_INTERVAL; - } - - snprintf(data, sizeof(data), "%u", i); - kobject_uevent_env(&th_zone->therm_dev->device.kobj, KOBJ_CHANGE, envp); - mutex_unlock(&th_zone->therm_dev->lock); -} - -/* Register with the in-kernel thermal management */ -int exynos_register_thermal(struct thermal_sensor_conf *sensor_conf) -{ - int ret; - struct exynos_thermal_zone *th_zone; - - if (!sensor_conf || !sensor_conf->read_temperature) { - pr_err("Temperature sensor not initialised\n"); - return -EINVAL; - } - - th_zone = devm_kzalloc(sensor_conf->dev, - sizeof(struct exynos_thermal_zone), GFP_KERNEL); - if (!th_zone) - return -ENOMEM; - - th_zone->sensor_conf = sensor_conf; - /* - * TODO: 1) Handle multiple cooling devices in a thermal zone - * 2) Add a flag/name in cooling info to map to specific - * sensor - */ - if (sensor_conf->cooling_data.freq_clip_count > 0) { - th_zone->cool_dev[th_zone->cool_dev_size] = - cpufreq_cooling_register(cpu_present_mask); - if (IS_ERR(th_zone->cool_dev[th_zone->cool_dev_size])) { - ret = PTR_ERR(th_zone->cool_dev[th_zone->cool_dev_size]); - if (ret != -EPROBE_DEFER) - dev_err(sensor_conf->dev, - "Failed to register cpufreq cooling device: %d\n", - ret); - goto err_unregister; - } - th_zone->cool_dev_size++; - } - - th_zone->therm_dev = thermal_zone_device_register( - sensor_conf->name, sensor_conf->trip_data.trip_count, - 0, th_zone, &exynos_dev_ops, NULL, 0, - sensor_conf->trip_data.trigger_falling ? 
0 : - IDLE_INTERVAL); - - if (IS_ERR(th_zone->therm_dev)) { - dev_err(sensor_conf->dev, - "Failed to register thermal zone device\n"); - ret = PTR_ERR(th_zone->therm_dev); - goto err_unregister; - } - th_zone->mode = THERMAL_DEVICE_ENABLED; - sensor_conf->pzone_data = th_zone; - - dev_info(sensor_conf->dev, - "Exynos: Thermal zone(%s) registered\n", sensor_conf->name); - - return 0; - -err_unregister: - exynos_unregister_thermal(sensor_conf); - return ret; -} - -/* Un-Register with the in-kernel thermal management */ -void exynos_unregister_thermal(struct thermal_sensor_conf *sensor_conf) -{ - int i; - struct exynos_thermal_zone *th_zone; - - if (!sensor_conf || !sensor_conf->pzone_data) { - pr_err("Invalid temperature sensor configuration data\n"); - return; - } - - th_zone = sensor_conf->pzone_data; - - thermal_zone_device_unregister(th_zone->therm_dev); - - for (i = 0; i < th_zone->cool_dev_size; ++i) - cpufreq_cooling_unregister(th_zone->cool_dev[i]); - - dev_info(sensor_conf->dev, - "Exynos: Kernel Thermal management unregistered\n"); -} diff --git a/drivers/thermal/samsung/exynos_thermal_common.h b/drivers/thermal/samsung/exynos_thermal_common.h deleted file mode 100644 index cd4471925cdd..000000000000 --- a/drivers/thermal/samsung/exynos_thermal_common.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * exynos_thermal_common.h - Samsung EXYNOS common header file - * - * Copyright (C) 2013 Samsung Electronics - * Amit Daniel Kachhap <amit.daniel@samsung.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - */ - -#ifndef _EXYNOS_THERMAL_COMMON_H -#define _EXYNOS_THERMAL_COMMON_H - -/* In-kernel thermal framework related macros & definations */ -#define SENSOR_NAME_LEN 16 -#define MAX_TRIP_COUNT 8 -#define MAX_COOLING_DEVICE 4 - -#define ACTIVE_INTERVAL 500 -#define IDLE_INTERVAL 10000 -#define MCELSIUS 1000 - -/* CPU Zone information */ -#define PANIC_ZONE 4 -#define WARN_ZONE 3 -#define MONITOR_ZONE 2 -#define SAFE_ZONE 1 - -#define GET_ZONE(trip) (trip + 2) -#define GET_TRIP(zone) (zone - 2) - -enum trigger_type { - THROTTLE_ACTIVE = 1, - THROTTLE_PASSIVE, - SW_TRIP, - HW_TRIP, -}; - -/** - * struct freq_clip_table - * @freq_clip_max: maximum frequency allowed for this cooling state. - * @temp_level: Temperature level at which the temperature clipping will - * happen. - * @mask_val: cpumask of the allowed cpu's where the clipping will take place. - * - * This structure is required to be filled and passed to the - * cpufreq_cooling_unregister function. 
- */ -struct freq_clip_table { - unsigned int freq_clip_max; - unsigned int temp_level; - const struct cpumask *mask_val; -}; - -struct thermal_trip_point_conf { - int trip_val[MAX_TRIP_COUNT]; - int trip_type[MAX_TRIP_COUNT]; - int trip_count; - unsigned char trigger_falling; -}; - -struct thermal_cooling_conf { - struct freq_clip_table freq_data[MAX_TRIP_COUNT]; - int freq_clip_count; -}; - -struct thermal_sensor_conf { - char name[SENSOR_NAME_LEN]; - int (*read_temperature)(void *data); - int (*write_emul_temp)(void *drv_data, unsigned long temp); - struct thermal_trip_point_conf trip_data; - struct thermal_cooling_conf cooling_data; - void *driver_data; - void *pzone_data; - struct device *dev; -}; - -/*Functions used exynos based thermal sensor driver*/ -#ifdef CONFIG_EXYNOS_THERMAL_CORE -void exynos_unregister_thermal(struct thermal_sensor_conf *sensor_conf); -int exynos_register_thermal(struct thermal_sensor_conf *sensor_conf); -void exynos_report_trigger(struct thermal_sensor_conf *sensor_conf); -#else -static inline void -exynos_unregister_thermal(struct thermal_sensor_conf *sensor_conf) { return; } - -static inline int -exynos_register_thermal(struct thermal_sensor_conf *sensor_conf) { return 0; } - -static inline void -exynos_report_trigger(struct thermal_sensor_conf *sensor_conf) { return; } - -#endif /* CONFIG_EXYNOS_THERMAL_CORE */ -#endif /* _EXYNOS_THERMAL_COMMON_H */ diff --git a/drivers/thermal/samsung/exynos_tmu.c b/drivers/thermal/samsung/exynos_tmu.c index d2f1e62a4232..fbeedc072cc2 100644 --- a/drivers/thermal/samsung/exynos_tmu.c +++ b/drivers/thermal/samsung/exynos_tmu.c @@ -1,6 +1,10 @@ /* * exynos_tmu.c - Samsung EXYNOS TMU (Thermal Management Unit) * + * Copyright (C) 2014 Samsung Electronics + * Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com> + * Lukasz Majewski <l.majewski@samsung.com> + * * Copyright (C) 2011 Samsung Electronics * Donggeun Kim <dg77.kim@samsung.com> * Amit Daniel Kachhap <amit.kachhap@linaro.org> @@ -31,8 +35,8 @@ #include <linux/platform_device.h> #include <linux/regulator/consumer.h> -#include "exynos_thermal_common.h" #include "exynos_tmu.h" +#include "../thermal_core.h" /* Exynos generic registers */ #define EXYNOS_TMU_REG_TRIMINFO 0x0 @@ -115,6 +119,27 @@ #define EXYNOS5440_TMU_TH_RISE4_SHIFT 24 #define EXYNOS5440_EFUSE_SWAP_OFFSET 8 +/* Exynos7 specific registers */ +#define EXYNOS7_THD_TEMP_RISE7_6 0x50 +#define EXYNOS7_THD_TEMP_FALL7_6 0x60 +#define EXYNOS7_TMU_REG_INTEN 0x110 +#define EXYNOS7_TMU_REG_INTPEND 0x118 +#define EXYNOS7_TMU_REG_EMUL_CON 0x160 + +#define EXYNOS7_TMU_TEMP_MASK 0x1ff +#define EXYNOS7_PD_DET_EN_SHIFT 23 +#define EXYNOS7_TMU_INTEN_RISE0_SHIFT 0 +#define EXYNOS7_TMU_INTEN_RISE1_SHIFT 1 +#define EXYNOS7_TMU_INTEN_RISE2_SHIFT 2 +#define EXYNOS7_TMU_INTEN_RISE3_SHIFT 3 +#define EXYNOS7_TMU_INTEN_RISE4_SHIFT 4 +#define EXYNOS7_TMU_INTEN_RISE5_SHIFT 5 +#define EXYNOS7_TMU_INTEN_RISE6_SHIFT 6 +#define EXYNOS7_TMU_INTEN_RISE7_SHIFT 7 +#define EXYNOS7_EMUL_DATA_SHIFT 7 +#define EXYNOS7_EMUL_DATA_MASK 0x1ff + +#define MCELSIUS 1000 /** * struct exynos_tmu_data : A structure to hold the private data of the TMU driver @@ -128,6 +153,7 @@ * @lock: lock to implement synchronization. * @clk: pointer to the clock structure. * @clk_sec: pointer to the clock structure for accessing the base_second. + * @sclk: pointer to the clock structure for accessing the tmu special clk. * @temp_error1: fused value of the first point trim. * @temp_error2: fused value of the second point trim. 
* @regulator: pointer to the TMU regulator structure. @@ -147,10 +173,11 @@ struct exynos_tmu_data { enum soc_type soc; struct work_struct irq_work; struct mutex lock; - struct clk *clk, *clk_sec; - u8 temp_error1, temp_error2; + struct clk *clk, *clk_sec, *sclk; + u16 temp_error1, temp_error2; struct regulator *regulator; - struct thermal_sensor_conf *reg_conf; + struct thermal_zone_device *tzd; + int (*tmu_initialize)(struct platform_device *pdev); void (*tmu_control)(struct platform_device *pdev, bool on); int (*tmu_read)(struct exynos_tmu_data *data); @@ -159,6 +186,33 @@ struct exynos_tmu_data { void (*tmu_clear_irqs)(struct exynos_tmu_data *data); }; +static void exynos_report_trigger(struct exynos_tmu_data *p) +{ + char data[10], *envp[] = { data, NULL }; + struct thermal_zone_device *tz = p->tzd; + unsigned long temp; + unsigned int i; + + if (!tz) { + pr_err("No thermal zone device defined\n"); + return; + } + + thermal_zone_device_update(tz); + + mutex_lock(&tz->lock); + /* Find the level for which trip happened */ + for (i = 0; i < of_thermal_get_ntrips(tz); i++) { + tz->ops->get_trip_temp(tz, i, &temp); + if (tz->last_temperature < temp) + break; + } + + snprintf(data, sizeof(data), "%u", i); + kobject_uevent_env(&tz->device.kobj, KOBJ_CHANGE, envp); + mutex_unlock(&tz->lock); +} + /* * TMU treats temperature as a mapped temperature code. * The temperature is converted differently depending on the calibration type. @@ -190,7 +244,7 @@ static int temp_to_code(struct exynos_tmu_data *data, u8 temp) * Calculate a temperature value from a temperature code. * The unit of the temperature is degree Celsius. */ -static int code_to_temp(struct exynos_tmu_data *data, u8 temp_code) +static int code_to_temp(struct exynos_tmu_data *data, u16 temp_code) { struct exynos_tmu_platform_data *pdata = data->pdata; int temp; @@ -234,14 +288,25 @@ static void sanitize_temp_error(struct exynos_tmu_data *data, u32 trim_info) static u32 get_th_reg(struct exynos_tmu_data *data, u32 threshold, bool falling) { - struct exynos_tmu_platform_data *pdata = data->pdata; + struct thermal_zone_device *tz = data->tzd; + const struct thermal_trip * const trips = + of_thermal_get_trip_points(tz); + unsigned long temp; int i; - for (i = 0; i < pdata->non_hw_trigger_levels; i++) { - u8 temp = pdata->trigger_levels[i]; + if (!trips) { + pr_err("%s: Cannot get trip points from of-thermal.c!\n", + __func__); + return 0; + } + + for (i = 0; i < of_thermal_get_ntrips(tz); i++) { + if (trips[i].type == THERMAL_TRIP_CRITICAL) + continue; + temp = trips[i].temperature / MCELSIUS; if (falling) - temp -= pdata->threshold_falling; + temp -= (trips[i].hysteresis / MCELSIUS); else threshold &= ~(0xff << 8 * i); @@ -305,9 +370,19 @@ static void exynos_tmu_control(struct platform_device *pdev, bool on) static int exynos4210_tmu_initialize(struct platform_device *pdev) { struct exynos_tmu_data *data = platform_get_drvdata(pdev); - struct exynos_tmu_platform_data *pdata = data->pdata; - unsigned int status; + struct thermal_zone_device *tz = data->tzd; + const struct thermal_trip * const trips = + of_thermal_get_trip_points(tz); int ret = 0, threshold_code, i; + unsigned long reference, temp; + unsigned int status; + + if (!trips) { + pr_err("%s: Cannot get trip points from of-thermal.c!\n", + __func__); + ret = -ENODEV; + goto out; + } status = readb(data->base + EXYNOS_TMU_REG_STATUS); if (!status) { @@ -318,12 +393,19 @@ static int exynos4210_tmu_initialize(struct platform_device *pdev) sanitize_temp_error(data, readl(data->base + 
EXYNOS_TMU_REG_TRIMINFO)); /* Write temperature code for threshold */ - threshold_code = temp_to_code(data, pdata->threshold); + reference = trips[0].temperature / MCELSIUS; + threshold_code = temp_to_code(data, reference); + if (threshold_code < 0) { + ret = threshold_code; + goto out; + } writeb(threshold_code, data->base + EXYNOS4210_TMU_REG_THRESHOLD_TEMP); - for (i = 0; i < pdata->non_hw_trigger_levels; i++) - writeb(pdata->trigger_levels[i], data->base + + for (i = 0; i < of_thermal_get_ntrips(tz); i++) { + temp = trips[i].temperature / MCELSIUS; + writeb(temp - reference, data->base + EXYNOS4210_TMU_REG_TRIG_LEVEL0 + i * 4); + } data->tmu_clear_irqs(data); out: @@ -333,9 +415,11 @@ out: static int exynos4412_tmu_initialize(struct platform_device *pdev) { struct exynos_tmu_data *data = platform_get_drvdata(pdev); - struct exynos_tmu_platform_data *pdata = data->pdata; + const struct thermal_trip * const trips = + of_thermal_get_trip_points(data->tzd); unsigned int status, trim_info, con, ctrl, rising_threshold; int ret = 0, threshold_code, i; + unsigned long crit_temp = 0; status = readb(data->base + EXYNOS_TMU_REG_STATUS); if (!status) { @@ -373,17 +457,29 @@ static int exynos4412_tmu_initialize(struct platform_device *pdev) data->tmu_clear_irqs(data); /* if last threshold limit is also present */ - i = pdata->max_trigger_level - 1; - if (pdata->trigger_levels[i] && pdata->trigger_type[i] == HW_TRIP) { - threshold_code = temp_to_code(data, pdata->trigger_levels[i]); - /* 1-4 level to be assigned in th0 reg */ - rising_threshold &= ~(0xff << 8 * i); - rising_threshold |= threshold_code << 8 * i; - writel(rising_threshold, data->base + EXYNOS_THD_TEMP_RISE); - con = readl(data->base + EXYNOS_TMU_REG_CONTROL); - con |= (1 << EXYNOS_TMU_THERM_TRIP_EN_SHIFT); - writel(con, data->base + EXYNOS_TMU_REG_CONTROL); + for (i = 0; i < of_thermal_get_ntrips(data->tzd); i++) { + if (trips[i].type == THERMAL_TRIP_CRITICAL) { + crit_temp = trips[i].temperature; + break; + } } + + if (i == of_thermal_get_ntrips(data->tzd)) { + pr_err("%s: No CRITICAL trip point defined at of-thermal.c!\n", + __func__); + ret = -EINVAL; + goto out; + } + + threshold_code = temp_to_code(data, crit_temp / MCELSIUS); + /* 1-4 level to be assigned in th0 reg */ + rising_threshold &= ~(0xff << 8 * i); + rising_threshold |= threshold_code << 8 * i; + writel(rising_threshold, data->base + EXYNOS_THD_TEMP_RISE); + con = readl(data->base + EXYNOS_TMU_REG_CONTROL); + con |= (1 << EXYNOS_TMU_THERM_TRIP_EN_SHIFT); + writel(con, data->base + EXYNOS_TMU_REG_CONTROL); + out: return ret; } @@ -391,9 +487,9 @@ out: static int exynos5440_tmu_initialize(struct platform_device *pdev) { struct exynos_tmu_data *data = platform_get_drvdata(pdev); - struct exynos_tmu_platform_data *pdata = data->pdata; unsigned int trim_info = 0, con, rising_threshold; - int ret = 0, threshold_code, i; + int ret = 0, threshold_code; + unsigned long crit_temp = 0; /* * For exynos5440 soc triminfo value is swapped between TMU0 and @@ -422,9 +518,8 @@ static int exynos5440_tmu_initialize(struct platform_device *pdev) data->tmu_clear_irqs(data); /* if last threshold limit is also present */ - i = pdata->max_trigger_level - 1; - if (pdata->trigger_levels[i] && pdata->trigger_type[i] == HW_TRIP) { - threshold_code = temp_to_code(data, pdata->trigger_levels[i]); + if (!data->tzd->ops->get_crit_temp(data->tzd, &crit_temp)) { + threshold_code = temp_to_code(data, crit_temp / MCELSIUS); /* 5th level to be assigned in th2 reg */ rising_threshold = threshold_code << 
EXYNOS5440_TMU_TH_RISE4_SHIFT; @@ -439,10 +534,88 @@ static int exynos5440_tmu_initialize(struct platform_device *pdev) return ret; } -static void exynos4210_tmu_control(struct platform_device *pdev, bool on) +static int exynos7_tmu_initialize(struct platform_device *pdev) { struct exynos_tmu_data *data = platform_get_drvdata(pdev); + struct thermal_zone_device *tz = data->tzd; struct exynos_tmu_platform_data *pdata = data->pdata; + unsigned int status, trim_info; + unsigned int rising_threshold = 0, falling_threshold = 0; + int ret = 0, threshold_code, i; + unsigned long temp, temp_hist; + unsigned int reg_off, bit_off; + + status = readb(data->base + EXYNOS_TMU_REG_STATUS); + if (!status) { + ret = -EBUSY; + goto out; + } + + trim_info = readl(data->base + EXYNOS_TMU_REG_TRIMINFO); + + data->temp_error1 = trim_info & EXYNOS7_TMU_TEMP_MASK; + if (!data->temp_error1 || + (pdata->min_efuse_value > data->temp_error1) || + (data->temp_error1 > pdata->max_efuse_value)) + data->temp_error1 = pdata->efuse_value & EXYNOS_TMU_TEMP_MASK; + + /* Write temperature code for rising and falling threshold */ + for (i = (of_thermal_get_ntrips(tz) - 1); i >= 0; i--) { + /* + * On exynos7 there are 4 rising and 4 falling threshold + * registers (0x50-0x5c and 0x60-0x6c respectively). Each + * register holds the value of two threshold levels (at bit + * offsets 0 and 16). Based on the fact that there are atmost + * eight possible trigger levels, calculate the register and + * bit offsets where the threshold levels are to be written. + * + * e.g. EXYNOS7_THD_TEMP_RISE7_6 (0x50) + * [24:16] - Threshold level 7 + * [8:0] - Threshold level 6 + * e.g. EXYNOS7_THD_TEMP_RISE5_4 (0x54) + * [24:16] - Threshold level 5 + * [8:0] - Threshold level 4 + * + * and similarly for falling thresholds. + * + * Based on the above, calculate the register and bit offsets + * for rising/falling threshold levels and populate them. 
+ */ + reg_off = ((7 - i) / 2) * 4; + bit_off = ((8 - i) % 2); + + tz->ops->get_trip_temp(tz, i, &temp); + temp /= MCELSIUS; + + tz->ops->get_trip_hyst(tz, i, &temp_hist); + temp_hist = temp - (temp_hist / MCELSIUS); + + /* Set 9-bit temperature code for rising threshold levels */ + threshold_code = temp_to_code(data, temp); + rising_threshold = readl(data->base + + EXYNOS7_THD_TEMP_RISE7_6 + reg_off); + rising_threshold &= ~(EXYNOS7_TMU_TEMP_MASK << (16 * bit_off)); + rising_threshold |= threshold_code << (16 * bit_off); + writel(rising_threshold, + data->base + EXYNOS7_THD_TEMP_RISE7_6 + reg_off); + + /* Set 9-bit temperature code for falling threshold levels */ + threshold_code = temp_to_code(data, temp_hist); + falling_threshold &= ~(EXYNOS7_TMU_TEMP_MASK << (16 * bit_off)); + falling_threshold |= threshold_code << (16 * bit_off); + writel(falling_threshold, + data->base + EXYNOS7_THD_TEMP_FALL7_6 + reg_off); + } + + data->tmu_clear_irqs(data); +out: + return ret; +} + +static void exynos4210_tmu_control(struct platform_device *pdev, bool on) +{ + struct exynos_tmu_data *data = platform_get_drvdata(pdev); + struct thermal_zone_device *tz = data->tzd; unsigned int con, interrupt_en; con = get_con_reg(data, readl(data->base + EXYNOS_TMU_REG_CONTROL)); @@ -450,10 +623,15 @@ static void exynos4210_tmu_control(struct platform_device *pdev, bool on) if (on) { con |= (1 << EXYNOS_TMU_CORE_EN_SHIFT); interrupt_en = - pdata->trigger_enable[3] << EXYNOS_TMU_INTEN_RISE3_SHIFT | - pdata->trigger_enable[2] << EXYNOS_TMU_INTEN_RISE2_SHIFT | - pdata->trigger_enable[1] << EXYNOS_TMU_INTEN_RISE1_SHIFT | - pdata->trigger_enable[0] << EXYNOS_TMU_INTEN_RISE0_SHIFT; + (of_thermal_is_trip_valid(tz, 3) + << EXYNOS_TMU_INTEN_RISE3_SHIFT) | + (of_thermal_is_trip_valid(tz, 2) + << EXYNOS_TMU_INTEN_RISE2_SHIFT) | + (of_thermal_is_trip_valid(tz, 1) + << EXYNOS_TMU_INTEN_RISE1_SHIFT) | + (of_thermal_is_trip_valid(tz, 0) + << EXYNOS_TMU_INTEN_RISE0_SHIFT); + if (data->soc != SOC_ARCH_EXYNOS4210) interrupt_en |= interrupt_en << EXYNOS_TMU_INTEN_FALL0_SHIFT; @@ -468,7 +646,7 @@ static void exynos4210_tmu_control(struct platform_device *pdev, bool on) static void exynos5440_tmu_control(struct platform_device *pdev, bool on) { struct exynos_tmu_data *data = platform_get_drvdata(pdev); - struct exynos_tmu_platform_data *pdata = data->pdata; + struct thermal_zone_device *tz = data->tzd; unsigned int con, interrupt_en; con = get_con_reg(data, readl(data->base + EXYNOS5440_TMU_S0_7_CTRL)); @@ -476,11 +654,16 @@ static void exynos5440_tmu_control(struct platform_device *pdev, bool on) if (on) { con |= (1 << EXYNOS_TMU_CORE_EN_SHIFT); interrupt_en = - pdata->trigger_enable[3] << EXYNOS5440_TMU_INTEN_RISE3_SHIFT | - pdata->trigger_enable[2] << EXYNOS5440_TMU_INTEN_RISE2_SHIFT | - pdata->trigger_enable[1] << EXYNOS5440_TMU_INTEN_RISE1_SHIFT | - pdata->trigger_enable[0] << EXYNOS5440_TMU_INTEN_RISE0_SHIFT; - interrupt_en |= interrupt_en << EXYNOS5440_TMU_INTEN_FALL0_SHIFT; + (of_thermal_is_trip_valid(tz, 3) + << EXYNOS5440_TMU_INTEN_RISE3_SHIFT) | + (of_thermal_is_trip_valid(tz, 2) + << EXYNOS5440_TMU_INTEN_RISE2_SHIFT) | + (of_thermal_is_trip_valid(tz, 1) + << EXYNOS5440_TMU_INTEN_RISE1_SHIFT) | + (of_thermal_is_trip_valid(tz, 0) + << EXYNOS5440_TMU_INTEN_RISE0_SHIFT); + interrupt_en |= + interrupt_en << EXYNOS5440_TMU_INTEN_FALL0_SHIFT; } else { con &= ~(1 << EXYNOS_TMU_CORE_EN_SHIFT); interrupt_en = 0; /* Disable all interrupts */ @@ -489,19 +672,62 @@ static void exynos5440_tmu_control(struct platform_device *pdev, bool on) 
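
The reg_off/bit_off arithmetic above is easy to misread; a throwaway userspace check (register base taken from the comment, 9-bit field width per EXYNOS7_TMU_TEMP_MASK) prints where each of the eight levels lands:

#include <stdio.h>

#define THD_TEMP_RISE7_6 0x50   /* base of the rising-threshold bank */

int main(void)
{
    /* the exact offset math from exynos7_tmu_initialize() */
    for (int i = 7; i >= 0; i--) {
        unsigned int reg_off = ((7 - i) / 2) * 4;
        unsigned int bit_off = (8 - i) % 2;

        printf("level %d -> reg 0x%02x, bits [%u:%u]\n", i,
               THD_TEMP_RISE7_6 + reg_off,
               16 * bit_off + 8, 16 * bit_off);
    }
    return 0;
}

Levels 7 and 6 share 0x50, 5 and 4 share 0x54, down to 1 and 0 at 0x5c, with the odd level of each pair in bits [24:16]; the falling bank at 0x60 follows the same layout.
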
writel(con, data->base + EXYNOS5440_TMU_S0_7_CTRL); } -static int exynos_tmu_read(struct exynos_tmu_data *data) +static void exynos7_tmu_control(struct platform_device *pdev, bool on) { - int ret; + struct exynos_tmu_data *data = platform_get_drvdata(pdev); + struct thermal_zone_device *tz = data->tzd; + unsigned int con, interrupt_en; + + con = get_con_reg(data, readl(data->base + EXYNOS_TMU_REG_CONTROL)); + + if (on) { + con |= (1 << EXYNOS_TMU_CORE_EN_SHIFT); + interrupt_en = + (of_thermal_is_trip_valid(tz, 7) + << EXYNOS7_TMU_INTEN_RISE7_SHIFT) | + (of_thermal_is_trip_valid(tz, 6) + << EXYNOS7_TMU_INTEN_RISE6_SHIFT) | + (of_thermal_is_trip_valid(tz, 5) + << EXYNOS7_TMU_INTEN_RISE5_SHIFT) | + (of_thermal_is_trip_valid(tz, 4) + << EXYNOS7_TMU_INTEN_RISE4_SHIFT) | + (of_thermal_is_trip_valid(tz, 3) + << EXYNOS7_TMU_INTEN_RISE3_SHIFT) | + (of_thermal_is_trip_valid(tz, 2) + << EXYNOS7_TMU_INTEN_RISE2_SHIFT) | + (of_thermal_is_trip_valid(tz, 1) + << EXYNOS7_TMU_INTEN_RISE1_SHIFT) | + (of_thermal_is_trip_valid(tz, 0) + << EXYNOS7_TMU_INTEN_RISE0_SHIFT); + + interrupt_en |= + interrupt_en << EXYNOS_TMU_INTEN_FALL0_SHIFT; + } else { + con &= ~(1 << EXYNOS_TMU_CORE_EN_SHIFT); + interrupt_en = 0; /* Disable all interrupts */ + } + con |= 1 << EXYNOS7_PD_DET_EN_SHIFT; + + writel(interrupt_en, data->base + EXYNOS7_TMU_REG_INTEN); + writel(con, data->base + EXYNOS_TMU_REG_CONTROL); +} + +static int exynos_get_temp(void *p, long *temp) +{ + struct exynos_tmu_data *data = p; + + if (!data) + return -EINVAL; mutex_lock(&data->lock); clk_enable(data->clk); - ret = data->tmu_read(data); - if (ret >= 0) - ret = code_to_temp(data, ret); + + *temp = code_to_temp(data, data->tmu_read(data)) * MCELSIUS; + clk_disable(data->clk); mutex_unlock(&data->lock); - return ret; + return 0; } #ifdef CONFIG_THERMAL_EMULATION @@ -515,9 +741,19 @@ static u32 get_emul_con_reg(struct exynos_tmu_data *data, unsigned int val, val &= ~(EXYNOS_EMUL_TIME_MASK << EXYNOS_EMUL_TIME_SHIFT); val |= (EXYNOS_EMUL_TIME << EXYNOS_EMUL_TIME_SHIFT); } - val &= ~(EXYNOS_EMUL_DATA_MASK << EXYNOS_EMUL_DATA_SHIFT); - val |= (temp_to_code(data, temp) << EXYNOS_EMUL_DATA_SHIFT) | - EXYNOS_EMUL_ENABLE; + if (data->soc == SOC_ARCH_EXYNOS7) { + val &= ~(EXYNOS7_EMUL_DATA_MASK << + EXYNOS7_EMUL_DATA_SHIFT); + val |= (temp_to_code(data, temp) << + EXYNOS7_EMUL_DATA_SHIFT) | + EXYNOS_EMUL_ENABLE; + } else { + val &= ~(EXYNOS_EMUL_DATA_MASK << + EXYNOS_EMUL_DATA_SHIFT); + val |= (temp_to_code(data, temp) << + EXYNOS_EMUL_DATA_SHIFT) | + EXYNOS_EMUL_ENABLE; + } } else { val &= ~EXYNOS_EMUL_ENABLE; } @@ -533,6 +769,8 @@ static void exynos4412_tmu_set_emulation(struct exynos_tmu_data *data, if (data->soc == SOC_ARCH_EXYNOS5260) emul_con = EXYNOS5260_EMUL_CON; + else if (data->soc == SOC_ARCH_EXYNOS7) + emul_con = EXYNOS7_TMU_REG_EMUL_CON; else emul_con = EXYNOS_EMUL_CON; @@ -576,7 +814,7 @@ out: #define exynos5440_tmu_set_emulation NULL static int exynos_tmu_set_emulation(void *drv_data, unsigned long temp) { return -EINVAL; } -#endif/*CONFIG_THERMAL_EMULATION*/ +#endif /* CONFIG_THERMAL_EMULATION */ static int exynos4210_tmu_read(struct exynos_tmu_data *data) { @@ -596,6 +834,12 @@ static int exynos5440_tmu_read(struct exynos_tmu_data *data) return readb(data->base + EXYNOS5440_TMU_S0_7_TEMP); } +static int exynos7_tmu_read(struct exynos_tmu_data *data) +{ + return readw(data->base + EXYNOS_TMU_REG_CURRENT_TEMP) & + EXYNOS7_TMU_TEMP_MASK; +} + static void exynos_tmu_work(struct work_struct *work) { struct exynos_tmu_data *data = container_of(work, @@ 
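
The new exynos_get_temp() above hands the thermal core millicelsius: it converts the raw register code via code_to_temp() and scales by MCELSIUS. The driver's one-point-trimming conversion, reduced to a simplified userspace model (the constants are the binding's usual example values, not authoritative):

#include <stdio.h>

#define MCELSIUS 1000

/* one-point trimming, as a simplified model of the driver's code_to_temp() */
static int code_to_temp(int temp_code, int temp_error1, int first_point_trim)
{
    return temp_code - temp_error1 + first_point_trim;
}

int main(void)
{
    int temp_error1 = 55;       /* assume: fused calibration code */
    int first_point_trim = 25;  /* assume: samsung,tmu_first_point_trim */
    int raw = 85;               /* assume: current TEMP register code */

    /* exynos_get_temp() reports millicelsius to the thermal core */
    long temp = (long)code_to_temp(raw, temp_error1, first_point_trim) * MCELSIUS;

    printf("%ld mC\n", temp);   /* prints 55000 */
    return 0;
}
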
-613,7 +857,7 @@ static void exynos_tmu_work(struct work_struct *work) if (!IS_ERR(data->clk_sec)) clk_disable(data->clk_sec); - exynos_report_trigger(data->reg_conf); + exynos_report_trigger(data); mutex_lock(&data->lock); clk_enable(data->clk); @@ -634,6 +878,9 @@ static void exynos4210_tmu_clear_irqs(struct exynos_tmu_data *data) if (data->soc == SOC_ARCH_EXYNOS5260) { tmu_intstat = EXYNOS5260_TMU_REG_INTSTAT; tmu_intclear = EXYNOS5260_TMU_REG_INTCLEAR; + } else if (data->soc == SOC_ARCH_EXYNOS7) { + tmu_intstat = EXYNOS7_TMU_REG_INTPEND; + tmu_intclear = EXYNOS7_TMU_REG_INTPEND; } else { tmu_intstat = EXYNOS_TMU_REG_INTSTAT; tmu_intclear = EXYNOS_TMU_REG_INTCLEAR; @@ -673,55 +920,94 @@ static irqreturn_t exynos_tmu_irq(int irq, void *id) static const struct of_device_id exynos_tmu_match[] = { { .compatible = "samsung,exynos3250-tmu", - .data = &exynos3250_default_tmu_data, }, { .compatible = "samsung,exynos4210-tmu", - .data = &exynos4210_default_tmu_data, }, { .compatible = "samsung,exynos4412-tmu", - .data = &exynos4412_default_tmu_data, }, { .compatible = "samsung,exynos5250-tmu", - .data = &exynos5250_default_tmu_data, }, { .compatible = "samsung,exynos5260-tmu", - .data = &exynos5260_default_tmu_data, }, { .compatible = "samsung,exynos5420-tmu", - .data = &exynos5420_default_tmu_data, }, { .compatible = "samsung,exynos5420-tmu-ext-triminfo", - .data = &exynos5420_default_tmu_data, }, { .compatible = "samsung,exynos5440-tmu", - .data = &exynos5440_default_tmu_data, + }, + { + .compatible = "samsung,exynos7-tmu", }, {}, }; MODULE_DEVICE_TABLE(of, exynos_tmu_match); -static inline struct exynos_tmu_platform_data *exynos_get_driver_data( - struct platform_device *pdev, int id) +static int exynos_of_get_soc_type(struct device_node *np) +{ + if (of_device_is_compatible(np, "samsung,exynos3250-tmu")) + return SOC_ARCH_EXYNOS3250; + else if (of_device_is_compatible(np, "samsung,exynos4210-tmu")) + return SOC_ARCH_EXYNOS4210; + else if (of_device_is_compatible(np, "samsung,exynos4412-tmu")) + return SOC_ARCH_EXYNOS4412; + else if (of_device_is_compatible(np, "samsung,exynos5250-tmu")) + return SOC_ARCH_EXYNOS5250; + else if (of_device_is_compatible(np, "samsung,exynos5260-tmu")) + return SOC_ARCH_EXYNOS5260; + else if (of_device_is_compatible(np, "samsung,exynos5420-tmu")) + return SOC_ARCH_EXYNOS5420; + else if (of_device_is_compatible(np, + "samsung,exynos5420-tmu-ext-triminfo")) + return SOC_ARCH_EXYNOS5420_TRIMINFO; + else if (of_device_is_compatible(np, "samsung,exynos5440-tmu")) + return SOC_ARCH_EXYNOS5440; + else if (of_device_is_compatible(np, "samsung,exynos7-tmu")) + return SOC_ARCH_EXYNOS7; + + return -EINVAL; +} + +static int exynos_of_sensor_conf(struct device_node *np, + struct exynos_tmu_platform_data *pdata) { - struct exynos_tmu_init_data *data_table; - struct exynos_tmu_platform_data *tmu_data; - const struct of_device_id *match; + u32 value; + int ret; - match = of_match_node(exynos_tmu_match, pdev->dev.of_node); - if (!match) - return NULL; - data_table = (struct exynos_tmu_init_data *) match->data; - if (!data_table || id >= data_table->tmu_count) - return NULL; - tmu_data = data_table->tmu_data; - return (struct exynos_tmu_platform_data *) (tmu_data + id); + of_node_get(np); + + ret = of_property_read_u32(np, "samsung,tmu_gain", &value); + pdata->gain = (u8)value; + of_property_read_u32(np, "samsung,tmu_reference_voltage", &value); + pdata->reference_voltage = (u8)value; + of_property_read_u32(np, "samsung,tmu_noise_cancel_mode", &value); + pdata->noise_cancel_mode = 
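
exynos_of_sensor_conf() above reads each samsung,tmu_* property into one scratch u32 and truncates to the destination field's width. The same pattern in plain C, with a small table standing in for the device tree (the property names mirror the binding; the lookup helper is invented for the sketch):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct prop { const char *name; uint32_t val; };

/* stand-in for of_property_read_u32(): 0 on success, -1 if absent */
static int read_u32(const struct prop *p, int n, const char *name, uint32_t *out)
{
    for (int i = 0; i < n; i++)
        if (!strcmp(p[i].name, name)) { *out = p[i].val; return 0; }
    return -1;
}

int main(void)
{
    const struct prop dt[] = {
        { "samsung,tmu_gain", 8 },
        { "samsung,tmu_reference_voltage", 16 },
    };
    uint32_t value = 0;
    uint8_t gain = 0, reference_voltage = 0;

    /* same truncating pattern as exynos_of_sensor_conf() */
    read_u32(dt, 2, "samsung,tmu_gain", &value);
    gain = (uint8_t)value;
    read_u32(dt, 2, "samsung,tmu_reference_voltage", &value);
    reference_voltage = (uint8_t)value;

    /* note: a missing property leaves the previous scratch 'value' in place */
    printf("gain=%u vref=%u\n", gain, reference_voltage);
    return 0;
}
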
(u8)value; + + of_property_read_u32(np, "samsung,tmu_efuse_value", + &pdata->efuse_value); + of_property_read_u32(np, "samsung,tmu_min_efuse_value", + &pdata->min_efuse_value); + of_property_read_u32(np, "samsung,tmu_max_efuse_value", + &pdata->max_efuse_value); + + of_property_read_u32(np, "samsung,tmu_first_point_trim", &value); + pdata->first_point_trim = (u8)value; + of_property_read_u32(np, "samsung,tmu_second_point_trim", &value); + pdata->second_point_trim = (u8)value; + of_property_read_u32(np, "samsung,tmu_default_temp_offset", &value); + pdata->default_temp_offset = (u8)value; + + of_property_read_u32(np, "samsung,tmu_cal_type", &pdata->cal_type); + of_property_read_u32(np, "samsung,tmu_cal_mode", &pdata->cal_mode); + + of_node_put(np); + return 0; } static int exynos_map_dt_data(struct platform_device *pdev) @@ -771,14 +1057,15 @@ static int exynos_map_dt_data(struct platform_device *pdev) return -EADDRNOTAVAIL; } - pdata = exynos_get_driver_data(pdev, data->id); - if (!pdata) { - dev_err(&pdev->dev, "No platform init data supplied.\n"); - return -ENODEV; - } + pdata = devm_kzalloc(&pdev->dev, + sizeof(struct exynos_tmu_platform_data), + GFP_KERNEL); + if (!pdata) + return -ENOMEM; + exynos_of_sensor_conf(pdev->dev.of_node, pdata); data->pdata = pdata; - data->soc = pdata->type; + data->soc = exynos_of_get_soc_type(pdev->dev.of_node); switch (data->soc) { case SOC_ARCH_EXYNOS4210: @@ -806,6 +1093,13 @@ static int exynos_map_dt_data(struct platform_device *pdev) data->tmu_set_emulation = exynos5440_tmu_set_emulation; data->tmu_clear_irqs = exynos5440_tmu_clear_irqs; break; + case SOC_ARCH_EXYNOS7: + data->tmu_initialize = exynos7_tmu_initialize; + data->tmu_control = exynos7_tmu_control; + data->tmu_read = exynos7_tmu_read; + data->tmu_set_emulation = exynos4412_tmu_set_emulation; + data->tmu_clear_irqs = exynos4210_tmu_clear_irqs; + break; default: dev_err(&pdev->dev, "Platform not supported\n"); return -EINVAL; @@ -834,12 +1128,16 @@ static int exynos_map_dt_data(struct platform_device *pdev) return 0; } +static struct thermal_zone_of_device_ops exynos_sensor_ops = { + .get_temp = exynos_get_temp, + .set_emul_temp = exynos_tmu_set_emulation, +}; + static int exynos_tmu_probe(struct platform_device *pdev) { - struct exynos_tmu_data *data; struct exynos_tmu_platform_data *pdata; - struct thermal_sensor_conf *sensor_conf; - int ret, i; + struct exynos_tmu_data *data; + int ret; data = devm_kzalloc(&pdev->dev, sizeof(struct exynos_tmu_data), GFP_KERNEL); @@ -849,9 +1147,15 @@ static int exynos_tmu_probe(struct platform_device *pdev) platform_set_drvdata(pdev, data); mutex_init(&data->lock); + data->tzd = thermal_zone_of_sensor_register(&pdev->dev, 0, data, + &exynos_sensor_ops); + if (IS_ERR(data->tzd)) { + pr_err("thermal: tz: %p ERROR\n", data->tzd); + return PTR_ERR(data->tzd); + } ret = exynos_map_dt_data(pdev); if (ret) - return ret; + goto err_sensor; pdata = data->pdata; @@ -860,20 +1164,22 @@ static int exynos_tmu_probe(struct platform_device *pdev) data->clk = devm_clk_get(&pdev->dev, "tmu_apbif"); if (IS_ERR(data->clk)) { dev_err(&pdev->dev, "Failed to get clock\n"); - return PTR_ERR(data->clk); + ret = PTR_ERR(data->clk); + goto err_sensor; } data->clk_sec = devm_clk_get(&pdev->dev, "tmu_triminfo_apbif"); if (IS_ERR(data->clk_sec)) { if (data->soc == SOC_ARCH_EXYNOS5420_TRIMINFO) { dev_err(&pdev->dev, "Failed to get triminfo clock\n"); - return PTR_ERR(data->clk_sec); + ret = PTR_ERR(data->clk_sec); + goto err_sensor; } } else { ret = clk_prepare(data->clk_sec); if (ret) 
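
The reworked probe above registers the of-thermal sensor first, then takes the clocks, and every later failure unwinds through err_* labels back to thermal_zone_of_sensor_unregister(). The control-flow skeleton, with stub acquisitions in place of the real calls:

#include <stdio.h>

/* stub: 0 on success, negative on failure */
static int acquire(const char *what, int fail)
{
    if (fail) {
        fprintf(stderr, "failed to get %s\n", what);
        return -1;
    }
    return 0;
}

static int probe(void)
{
    int ret;

    ret = acquire("sensor", 0);     /* thermal_zone_of_sensor_register */
    if (ret)
        return ret;
    ret = acquire("tmu_apbif", 0);  /* devm_clk_get + clk_prepare */
    if (ret)
        goto err_sensor;
    ret = acquire("tmu_sclk", 1);   /* exynos7 only */
    if (ret)
        goto err_clk;
    return 0;

err_clk:
    puts("clk_unprepare");
err_sensor:
    puts("thermal_zone_of_sensor_unregister");
    return ret;
}

int main(void)
{
    return probe() ? 1 : 0;
}
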
{ dev_err(&pdev->dev, "Failed to get clock\n"); - return ret; + goto err_sensor; } } @@ -883,82 +1189,57 @@ static int exynos_tmu_probe(struct platform_device *pdev) goto err_clk_sec; } - ret = exynos_tmu_initialize(pdev); - if (ret) { - dev_err(&pdev->dev, "Failed to initialize TMU\n"); - goto err_clk; + if (data->soc == SOC_ARCH_EXYNOS7) { + data->sclk = devm_clk_get(&pdev->dev, "tmu_sclk"); + if (IS_ERR(data->sclk)) { + dev_err(&pdev->dev, "Failed to get sclk\n"); + goto err_clk; + } else { + ret = clk_prepare_enable(data->sclk); + if (ret) { + dev_err(&pdev->dev, "Failed to enable sclk\n"); + goto err_clk; + } + } } - exynos_tmu_control(pdev, true); - - /* Allocate a structure to register with the exynos core thermal */ - sensor_conf = devm_kzalloc(&pdev->dev, - sizeof(struct thermal_sensor_conf), GFP_KERNEL); - if (!sensor_conf) { - ret = -ENOMEM; - goto err_clk; - } - sprintf(sensor_conf->name, "therm_zone%d", data->id); - sensor_conf->read_temperature = (int (*)(void *))exynos_tmu_read; - sensor_conf->write_emul_temp = - (int (*)(void *, unsigned long))exynos_tmu_set_emulation; - sensor_conf->driver_data = data; - sensor_conf->trip_data.trip_count = pdata->trigger_enable[0] + - pdata->trigger_enable[1] + pdata->trigger_enable[2]+ - pdata->trigger_enable[3]; - - for (i = 0; i < sensor_conf->trip_data.trip_count; i++) { - sensor_conf->trip_data.trip_val[i] = - pdata->threshold + pdata->trigger_levels[i]; - sensor_conf->trip_data.trip_type[i] = - pdata->trigger_type[i]; - } - - sensor_conf->trip_data.trigger_falling = pdata->threshold_falling; - - sensor_conf->cooling_data.freq_clip_count = pdata->freq_tab_count; - for (i = 0; i < pdata->freq_tab_count; i++) { - sensor_conf->cooling_data.freq_data[i].freq_clip_max = - pdata->freq_tab[i].freq_clip_max; - sensor_conf->cooling_data.freq_data[i].temp_level = - pdata->freq_tab[i].temp_level; - } - sensor_conf->dev = &pdev->dev; - /* Register the sensor with thermal management interface */ - ret = exynos_register_thermal(sensor_conf); + ret = exynos_tmu_initialize(pdev); if (ret) { - if (ret != -EPROBE_DEFER) - dev_err(&pdev->dev, - "Failed to register thermal interface: %d\n", - ret); - goto err_clk; + dev_err(&pdev->dev, "Failed to initialize TMU\n"); + goto err_sclk; } - data->reg_conf = sensor_conf; ret = devm_request_irq(&pdev->dev, data->irq, exynos_tmu_irq, IRQF_TRIGGER_RISING | IRQF_SHARED, dev_name(&pdev->dev), data); if (ret) { dev_err(&pdev->dev, "Failed to request irq: %d\n", data->irq); - goto err_clk; + goto err_sclk; } + exynos_tmu_control(pdev, true); return 0; +err_sclk: + clk_disable_unprepare(data->sclk); err_clk: clk_unprepare(data->clk); err_clk_sec: if (!IS_ERR(data->clk_sec)) clk_unprepare(data->clk_sec); +err_sensor: + thermal_zone_of_sensor_unregister(&pdev->dev, data->tzd); + return ret; } static int exynos_tmu_remove(struct platform_device *pdev) { struct exynos_tmu_data *data = platform_get_drvdata(pdev); + struct thermal_zone_device *tzd = data->tzd; - exynos_unregister_thermal(data->reg_conf); - + thermal_zone_of_sensor_unregister(&pdev->dev, tzd); exynos_tmu_control(pdev, false); + clk_disable_unprepare(data->sclk); clk_unprepare(data->clk); if (!IS_ERR(data->clk_sec)) clk_unprepare(data->clk_sec); diff --git a/drivers/thermal/samsung/exynos_tmu.h b/drivers/thermal/samsung/exynos_tmu.h index da3009bff6c4..4d71ec6c9aa0 100644 --- a/drivers/thermal/samsung/exynos_tmu.h +++ b/drivers/thermal/samsung/exynos_tmu.h @@ -23,16 +23,7 @@ #ifndef _EXYNOS_TMU_H #define _EXYNOS_TMU_H #include <linux/cpu_cooling.h> - 
-#include "exynos_thermal_common.h" - -enum calibration_type { - TYPE_ONE_POINT_TRIMMING, - TYPE_ONE_POINT_TRIMMING_25, - TYPE_ONE_POINT_TRIMMING_85, - TYPE_TWO_POINT_TRIMMING, - TYPE_NONE, -}; +#include <dt-bindings/thermal/thermal_exynos.h> enum soc_type { SOC_ARCH_EXYNOS3250 = 1, @@ -43,38 +34,11 @@ enum soc_type { SOC_ARCH_EXYNOS5420, SOC_ARCH_EXYNOS5420_TRIMINFO, SOC_ARCH_EXYNOS5440, + SOC_ARCH_EXYNOS7, }; /** * struct exynos_tmu_platform_data - * @threshold: basic temperature for generating interrupt - * 25 <= threshold <= 125 [unit: degree Celsius] - * @threshold_falling: differntial value for setting threshold - * of temperature falling interrupt. - * @trigger_levels: array for each interrupt levels - * [unit: degree Celsius] - * 0: temperature for trigger_level0 interrupt - * condition for trigger_level0 interrupt: - * current temperature > threshold + trigger_levels[0] - * 1: temperature for trigger_level1 interrupt - * condition for trigger_level1 interrupt: - * current temperature > threshold + trigger_levels[1] - * 2: temperature for trigger_level2 interrupt - * condition for trigger_level2 interrupt: - * current temperature > threshold + trigger_levels[2] - * 3: temperature for trigger_level3 interrupt - * condition for trigger_level3 interrupt: - * current temperature > threshold + trigger_levels[3] - * @trigger_type: defines the type of trigger. Possible values are, - * THROTTLE_ACTIVE trigger type - * THROTTLE_PASSIVE trigger type - * SW_TRIP trigger type - * HW_TRIP - * @trigger_enable[]: array to denote which trigger levels are enabled. - * 1 = enable trigger_level[] interrupt, - * 0 = disable trigger_level[] interrupt - * @max_trigger_level: max trigger level supported by the TMU - * @non_hw_trigger_levels: number of defined non-hardware trigger levels * @gain: gain of amplifier in the positive-TC generator block * 0 < gain <= 15 * @reference_voltage: reference voltage of amplifier @@ -86,24 +50,12 @@ enum soc_type { * @efuse_value: platform defined fuse value * @min_efuse_value: minimum valid trimming data * @max_efuse_value: maximum valid trimming data - * @first_point_trim: temp value of the first point trimming - * @second_point_trim: temp value of the second point trimming * @default_temp_offset: default temperature offset in case of no trimming * @cal_type: calibration type for temperature - * @freq_clip_table: Table representing frequency reduction percentage. - * @freq_tab_count: Count of the above table as frequency reduction may - * applicable to only some of the trigger levels. * * This structure is required for configuration of exynos_tmu driver. */ struct exynos_tmu_platform_data { - u8 threshold; - u8 threshold_falling; - u8 trigger_levels[MAX_TRIP_COUNT]; - enum trigger_type trigger_type[MAX_TRIP_COUNT]; - bool trigger_enable[MAX_TRIP_COUNT]; - u8 max_trigger_level; - u8 non_hw_trigger_levels; u8 gain; u8 reference_voltage; u8 noise_cancel_mode; @@ -115,30 +67,9 @@ struct exynos_tmu_platform_data { u8 second_point_trim; u8 default_temp_offset; - enum calibration_type cal_type; enum soc_type type; - struct freq_clip_table freq_tab[4]; - unsigned int freq_tab_count; -}; - -/** - * struct exynos_tmu_init_data - * @tmu_count: number of TMU instances. - * @tmu_data: platform data of all TMU instances. - * This structure is required to store data for multi-instance exynos tmu - * driver. 
- */ -struct exynos_tmu_init_data { - int tmu_count; - struct exynos_tmu_platform_data tmu_data[]; + u32 cal_type; + u32 cal_mode; }; -extern struct exynos_tmu_init_data const exynos3250_default_tmu_data; -extern struct exynos_tmu_init_data const exynos4210_default_tmu_data; -extern struct exynos_tmu_init_data const exynos4412_default_tmu_data; -extern struct exynos_tmu_init_data const exynos5250_default_tmu_data; -extern struct exynos_tmu_init_data const exynos5260_default_tmu_data; -extern struct exynos_tmu_init_data const exynos5420_default_tmu_data; -extern struct exynos_tmu_init_data const exynos5440_default_tmu_data; - #endif /* _EXYNOS_TMU_H */ diff --git a/drivers/thermal/samsung/exynos_tmu_data.c b/drivers/thermal/samsung/exynos_tmu_data.c deleted file mode 100644 index b23910069f68..000000000000 --- a/drivers/thermal/samsung/exynos_tmu_data.c +++ /dev/null @@ -1,264 +0,0 @@ -/* - * exynos_tmu_data.c - Samsung EXYNOS tmu data file - * - * Copyright (C) 2013 Samsung Electronics - * Amit Daniel Kachhap <amit.daniel@samsung.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - */ - -#include "exynos_thermal_common.h" -#include "exynos_tmu.h" - -struct exynos_tmu_init_data const exynos4210_default_tmu_data = { - .tmu_data = { - { - .threshold = 80, - .trigger_levels[0] = 5, - .trigger_levels[1] = 20, - .trigger_levels[2] = 30, - .trigger_enable[0] = true, - .trigger_enable[1] = true, - .trigger_enable[2] = true, - .trigger_enable[3] = false, - .trigger_type[0] = THROTTLE_ACTIVE, - .trigger_type[1] = THROTTLE_ACTIVE, - .trigger_type[2] = SW_TRIP, - .max_trigger_level = 4, - .non_hw_trigger_levels = 3, - .gain = 15, - .reference_voltage = 7, - .cal_type = TYPE_ONE_POINT_TRIMMING, - .min_efuse_value = 40, - .max_efuse_value = 100, - .first_point_trim = 25, - .second_point_trim = 85, - .default_temp_offset = 50, - .freq_tab[0] = { - .freq_clip_max = 800 * 1000, - .temp_level = 85, - }, - .freq_tab[1] = { - .freq_clip_max = 200 * 1000, - .temp_level = 100, - }, - .freq_tab_count = 2, - .type = SOC_ARCH_EXYNOS4210, - }, - }, - .tmu_count = 1, -}; - -#define EXYNOS3250_TMU_DATA \ - .threshold_falling = 10, \ - .trigger_levels[0] = 70, \ - .trigger_levels[1] = 95, \ - .trigger_levels[2] = 110, \ - .trigger_levels[3] = 120, \ - .trigger_enable[0] = true, \ - .trigger_enable[1] = true, \ - .trigger_enable[2] = true, \ - .trigger_enable[3] = false, \ - .trigger_type[0] = THROTTLE_ACTIVE, \ - .trigger_type[1] = THROTTLE_ACTIVE, \ - .trigger_type[2] = SW_TRIP, \ - .trigger_type[3] = HW_TRIP, \ - .max_trigger_level = 4, \ - .non_hw_trigger_levels = 3, \ - .gain = 8, \ - .reference_voltage = 16, \ - .noise_cancel_mode = 4, \ - .cal_type = TYPE_TWO_POINT_TRIMMING, \ - .efuse_value = 55, \ - .min_efuse_value = 40, \ - .max_efuse_value = 100, \ - .first_point_trim = 25, \ - .second_point_trim = 85, \ - .default_temp_offset = 50, \ 
- .freq_tab[0] = { \ - .freq_clip_max = 800 * 1000, \ - .temp_level = 70, \ - }, \ - .freq_tab[1] = { \ - .freq_clip_max = 400 * 1000, \ - .temp_level = 95, \ - }, \ - .freq_tab_count = 2 - -struct exynos_tmu_init_data const exynos3250_default_tmu_data = { - .tmu_data = { - { - EXYNOS3250_TMU_DATA, - .type = SOC_ARCH_EXYNOS3250, - }, - }, - .tmu_count = 1, -}; - -#define EXYNOS4412_TMU_DATA \ - .threshold_falling = 10, \ - .trigger_levels[0] = 70, \ - .trigger_levels[1] = 95, \ - .trigger_levels[2] = 110, \ - .trigger_levels[3] = 120, \ - .trigger_enable[0] = true, \ - .trigger_enable[1] = true, \ - .trigger_enable[2] = true, \ - .trigger_enable[3] = false, \ - .trigger_type[0] = THROTTLE_ACTIVE, \ - .trigger_type[1] = THROTTLE_ACTIVE, \ - .trigger_type[2] = SW_TRIP, \ - .trigger_type[3] = HW_TRIP, \ - .max_trigger_level = 4, \ - .non_hw_trigger_levels = 3, \ - .gain = 8, \ - .reference_voltage = 16, \ - .noise_cancel_mode = 4, \ - .cal_type = TYPE_ONE_POINT_TRIMMING, \ - .efuse_value = 55, \ - .min_efuse_value = 40, \ - .max_efuse_value = 100, \ - .first_point_trim = 25, \ - .second_point_trim = 85, \ - .default_temp_offset = 50, \ - .freq_tab[0] = { \ - .freq_clip_max = 1400 * 1000, \ - .temp_level = 70, \ - }, \ - .freq_tab[1] = { \ - .freq_clip_max = 400 * 1000, \ - .temp_level = 95, \ - }, \ - .freq_tab_count = 2 - -struct exynos_tmu_init_data const exynos4412_default_tmu_data = { - .tmu_data = { - { - EXYNOS4412_TMU_DATA, - .type = SOC_ARCH_EXYNOS4412, - }, - }, - .tmu_count = 1, -}; - -struct exynos_tmu_init_data const exynos5250_default_tmu_data = { - .tmu_data = { - { - EXYNOS4412_TMU_DATA, - .type = SOC_ARCH_EXYNOS5250, - }, - }, - .tmu_count = 1, -}; - -#define __EXYNOS5260_TMU_DATA \ - .threshold_falling = 10, \ - .trigger_levels[0] = 85, \ - .trigger_levels[1] = 103, \ - .trigger_levels[2] = 110, \ - .trigger_levels[3] = 120, \ - .trigger_enable[0] = true, \ - .trigger_enable[1] = true, \ - .trigger_enable[2] = true, \ - .trigger_enable[3] = false, \ - .trigger_type[0] = THROTTLE_ACTIVE, \ - .trigger_type[1] = THROTTLE_ACTIVE, \ - .trigger_type[2] = SW_TRIP, \ - .trigger_type[3] = HW_TRIP, \ - .max_trigger_level = 4, \ - .non_hw_trigger_levels = 3, \ - .gain = 8, \ - .reference_voltage = 16, \ - .noise_cancel_mode = 4, \ - .cal_type = TYPE_ONE_POINT_TRIMMING, \ - .efuse_value = 55, \ - .min_efuse_value = 40, \ - .max_efuse_value = 100, \ - .first_point_trim = 25, \ - .second_point_trim = 85, \ - .default_temp_offset = 50, \ - .freq_tab[0] = { \ - .freq_clip_max = 800 * 1000, \ - .temp_level = 85, \ - }, \ - .freq_tab[1] = { \ - .freq_clip_max = 200 * 1000, \ - .temp_level = 103, \ - }, \ - .freq_tab_count = 2, \ - -#define EXYNOS5260_TMU_DATA \ - __EXYNOS5260_TMU_DATA \ - .type = SOC_ARCH_EXYNOS5260 - -struct exynos_tmu_init_data const exynos5260_default_tmu_data = { - .tmu_data = { - { EXYNOS5260_TMU_DATA }, - { EXYNOS5260_TMU_DATA }, - { EXYNOS5260_TMU_DATA }, - { EXYNOS5260_TMU_DATA }, - { EXYNOS5260_TMU_DATA }, - }, - .tmu_count = 5, -}; - -#define EXYNOS5420_TMU_DATA \ - __EXYNOS5260_TMU_DATA \ - .type = SOC_ARCH_EXYNOS5420 - -#define EXYNOS5420_TMU_DATA_SHARED \ - __EXYNOS5260_TMU_DATA \ - .type = SOC_ARCH_EXYNOS5420_TRIMINFO - -struct exynos_tmu_init_data const exynos5420_default_tmu_data = { - .tmu_data = { - { EXYNOS5420_TMU_DATA }, - { EXYNOS5420_TMU_DATA }, - { EXYNOS5420_TMU_DATA_SHARED }, - { EXYNOS5420_TMU_DATA_SHARED }, - { EXYNOS5420_TMU_DATA_SHARED }, - }, - .tmu_count = 5, -}; - -#define EXYNOS5440_TMU_DATA \ - .trigger_levels[0] = 100, \ - .trigger_levels[4] 
= 105, \ - .trigger_enable[0] = 1, \ - .trigger_type[0] = SW_TRIP, \ - .trigger_type[4] = HW_TRIP, \ - .max_trigger_level = 5, \ - .non_hw_trigger_levels = 1, \ - .gain = 5, \ - .reference_voltage = 16, \ - .noise_cancel_mode = 4, \ - .cal_type = TYPE_ONE_POINT_TRIMMING, \ - .efuse_value = 0x5b2d, \ - .min_efuse_value = 16, \ - .max_efuse_value = 76, \ - .first_point_trim = 25, \ - .second_point_trim = 70, \ - .default_temp_offset = 25, \ - .type = SOC_ARCH_EXYNOS5440 - -struct exynos_tmu_init_data const exynos5440_default_tmu_data = { - .tmu_data = { - { EXYNOS5440_TMU_DATA } , - { EXYNOS5440_TMU_DATA } , - { EXYNOS5440_TMU_DATA } , - }, - .tmu_count = 3, -}; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 8729cf68d2fe..f55721ff9385 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1246,25 +1246,6 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans, return ret; } -/* - * this makes the path point to (inum INODE_ITEM ioff) - */ -int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, - struct btrfs_path *path) -{ - struct btrfs_key key; - return btrfs_find_item(fs_root, path, inum, ioff, - BTRFS_INODE_ITEM_KEY, &key); -} - -static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, - struct btrfs_path *path, - struct btrfs_key *found_key) -{ - return btrfs_find_item(fs_root, path, inum, ioff, - BTRFS_INODE_REF_KEY, found_key); -} - int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, struct btrfs_inode_extref **ret_extref, @@ -1374,7 +1355,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); } - ret = inode_ref_info(parent, 0, fs_root, path, &found_key); + ret = btrfs_find_item(fs_root, path, parent, 0, + BTRFS_INODE_REF_KEY, &found_key); if (ret > 0) ret = -ENOENT; if (ret) @@ -1727,8 +1709,10 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, struct btrfs_key found_key; while (!ret) { - ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, - &found_key); + ret = btrfs_find_item(fs_root, path, inum, + parent ? parent + 1 : 0, BTRFS_INODE_REF_KEY, + &found_key); + if (ret < 0) break; if (ret) { diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 2a1ac6bfc724..9c41fbac3009 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -32,9 +32,6 @@ struct inode_fs_paths { typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, void *ctx); -int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, - struct btrfs_path *path); - int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_path *path, struct btrfs_key *found_key, u64 *flags); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 4aadadcfab20..de5e4f2adfea 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -185,6 +185,9 @@ struct btrfs_inode { struct btrfs_delayed_node *delayed_node; + /* File creation time. 
*/ + struct timespec i_otime; + struct inode vfs_inode; }; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 14a72ed14ef7..993642199326 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -213,11 +213,19 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) */ static void add_root_to_dirty_list(struct btrfs_root *root) { + if (test_bit(BTRFS_ROOT_DIRTY, &root->state) || + !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state)) + return; + spin_lock(&root->fs_info->trans_lock); - if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) && - list_empty(&root->dirty_list)) { - list_add(&root->dirty_list, - &root->fs_info->dirty_cowonly_roots); + if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { + /* Want the extent tree to be the last on the list */ + if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID) + list_move_tail(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); + else + list_move(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); } spin_unlock(&root->fs_info->trans_lock); } @@ -1363,8 +1371,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { BUG_ON(tm->slot != 0); - eb_rewin = alloc_dummy_extent_buffer(eb->start, - fs_info->tree_root->nodesize); + eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); if (!eb_rewin) { btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); @@ -1444,7 +1451,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) } else if (old_root) { btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - eb = alloc_dummy_extent_buffer(logical, root->nodesize); + eb = alloc_dummy_extent_buffer(root->fs_info, logical); } else { btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK); eb = btrfs_clone_extent_buffer(eb_root); @@ -2282,7 +2289,7 @@ static void reada_for_search(struct btrfs_root *root, if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { gen = btrfs_node_ptr_generation(node, nr); - readahead_tree_block(root, search, blocksize); + readahead_tree_block(root, search); nread += blocksize; } nscan++; @@ -2301,7 +2308,6 @@ static noinline void reada_for_balance(struct btrfs_root *root, u64 gen; u64 block1 = 0; u64 block2 = 0; - int blocksize; parent = path->nodes[level + 1]; if (!parent) @@ -2309,7 +2315,6 @@ static noinline void reada_for_balance(struct btrfs_root *root, nritems = btrfs_header_nritems(parent); slot = path->slots[level + 1]; - blocksize = root->nodesize; if (slot > 0) { block1 = btrfs_node_blockptr(parent, slot - 1); @@ -2334,9 +2339,9 @@ static noinline void reada_for_balance(struct btrfs_root *root, } if (block1) - readahead_tree_block(root, block1, blocksize); + readahead_tree_block(root, block1); if (block2) - readahead_tree_block(root, block2, blocksize); + readahead_tree_block(root, block2); } @@ -2609,32 +2614,24 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key, return 0; } -int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path, +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, u64 iobjectid, u64 ioff, u8 key_type, struct btrfs_key *found_key) { int ret; struct btrfs_key key; struct extent_buffer *eb; - struct btrfs_path *path; + + ASSERT(path); + ASSERT(found_key); key.type = key_type; key.objectid = iobjectid; key.offset = ioff; - if (found_path == NULL) { - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - } else - path = found_path; - ret = btrfs_search_slot(NULL, 
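
add_root_to_dirty_list() above gains a lock-free early-out (test_bit) and uses test_and_set_bit under trans_lock so that exactly one caller performs the list move; the extent root is deliberately moved to the tail so it is committed after every other cow-only root. The once-only idiom, isolated (names invented for the sketch):

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag root_dirty = ATOMIC_FLAG_INIT;

/* returns 1 only for the caller that actually transitions clean -> dirty */
static int mark_dirty_once(void)
{
    return !atomic_flag_test_and_set(&root_dirty);
}

int main(void)
{
    printf("first caller queues the root: %d\n", mark_dirty_once());
    printf("second caller skips the queueing: %d\n", mark_dirty_once());
    return 0;
}
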
fs_root, &key, path, 0, 0); - if ((ret < 0) || (found_key == NULL)) { - if (path != found_path) - btrfs_free_path(path); + if (ret < 0) return ret; - } eb = path->nodes[0]; if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { @@ -3383,7 +3380,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); extent_buffer_get(c); path->nodes[level] = c; - path->locks[level] = BTRFS_WRITE_LOCK; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; path->slots[level] = 0; return 0; } @@ -4356,13 +4353,15 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, path->search_for_split = 1; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); path->search_for_split = 0; + if (ret > 0) + ret = -EAGAIN; if (ret < 0) goto err; ret = -EAGAIN; leaf = path->nodes[0]; - /* if our item isn't there or got smaller, return now */ - if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) + /* if our item isn't there, return now */ + if (item_size != btrfs_item_size_nr(leaf, path->slots[0])) goto err; /* the leaf has changed, it now has room. return now */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0b180708bf79..84c3b00f3de8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -198,6 +198,8 @@ static int btrfs_csum_sizes[] = { 4, 0 }; #define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024) +#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024) + /* * The key defines the order in the tree, and so it also defines (optimal) * block layout. @@ -1020,6 +1022,9 @@ enum btrfs_raid_types { BTRFS_BLOCK_GROUP_RAID6 | \ BTRFS_BLOCK_GROUP_DUP | \ BTRFS_BLOCK_GROUP_RAID10) +#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \ + BTRFS_BLOCK_GROUP_RAID6) + /* * We need a bit for restriper to be able to tell when chunks of type * SINGLE are available. This "extended" profile format is used in @@ -1239,7 +1244,6 @@ enum btrfs_disk_cache_state { BTRFS_DC_ERROR = 1, BTRFS_DC_CLEAR = 2, BTRFS_DC_SETUP = 3, - BTRFS_DC_NEED_WRITE = 4, }; struct btrfs_caching_control { @@ -1277,7 +1281,6 @@ struct btrfs_block_group_cache { unsigned long full_stripe_len; unsigned int ro:1; - unsigned int dirty:1; unsigned int iref:1; unsigned int has_caching_ctl:1; unsigned int removed:1; @@ -1315,6 +1318,9 @@ struct btrfs_block_group_cache { struct list_head ro_list; atomic_t trimming; + + /* For dirty block groups */ + struct list_head dirty_list; }; /* delayed seq elem */ @@ -1741,6 +1747,7 @@ struct btrfs_fs_info { spinlock_t unused_bgs_lock; struct list_head unused_bgs; + struct mutex unused_bg_unpin_mutex; /* For btrfs to record security options */ struct security_mnt_opts security_opts; @@ -1776,6 +1783,7 @@ struct btrfs_subvolume_writers { #define BTRFS_ROOT_DEFRAG_RUNNING 6 #define BTRFS_ROOT_FORCE_COW 7 #define BTRFS_ROOT_MULTI_LOG_TASKS 8 +#define BTRFS_ROOT_DIRTY 9 /* * in ram representation of the tree. 
extent_root is used for all allocations @@ -1794,8 +1802,6 @@ struct btrfs_root { struct btrfs_fs_info *fs_info; struct extent_io_tree dirty_log_pages; - struct kobject root_kobj; - struct completion kobj_unregister; struct mutex objectid_mutex; spinlock_t accounting_lock; @@ -2465,31 +2471,6 @@ BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); - -static inline struct btrfs_timespec * -btrfs_inode_atime(struct btrfs_inode_item *inode_item) -{ - unsigned long ptr = (unsigned long)inode_item; - ptr += offsetof(struct btrfs_inode_item, atime); - return (struct btrfs_timespec *)ptr; -} - -static inline struct btrfs_timespec * -btrfs_inode_mtime(struct btrfs_inode_item *inode_item) -{ - unsigned long ptr = (unsigned long)inode_item; - ptr += offsetof(struct btrfs_inode_item, mtime); - return (struct btrfs_timespec *)ptr; -} - -static inline struct btrfs_timespec * -btrfs_inode_ctime(struct btrfs_inode_item *inode_item) -{ - unsigned long ptr = (unsigned long)inode_item; - ptr += offsetof(struct btrfs_inode_item, ctime); - return (struct btrfs_timespec *)ptr; -} - BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index de4e70fb3cbb..82f0c7c95474 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1755,27 +1755,31 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); btrfs_set_stack_inode_block_group(inode_item, 0); - btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item), + btrfs_set_stack_timespec_sec(&inode_item->atime, inode->i_atime.tv_sec); - btrfs_set_stack_timespec_nsec(btrfs_inode_atime(inode_item), + btrfs_set_stack_timespec_nsec(&inode_item->atime, inode->i_atime.tv_nsec); - btrfs_set_stack_timespec_sec(btrfs_inode_mtime(inode_item), + btrfs_set_stack_timespec_sec(&inode_item->mtime, inode->i_mtime.tv_sec); - btrfs_set_stack_timespec_nsec(btrfs_inode_mtime(inode_item), + btrfs_set_stack_timespec_nsec(&inode_item->mtime, inode->i_mtime.tv_nsec); - btrfs_set_stack_timespec_sec(btrfs_inode_ctime(inode_item), + btrfs_set_stack_timespec_sec(&inode_item->ctime, inode->i_ctime.tv_sec); - btrfs_set_stack_timespec_nsec(btrfs_inode_ctime(inode_item), + btrfs_set_stack_timespec_nsec(&inode_item->ctime, inode->i_ctime.tv_nsec); + + btrfs_set_stack_timespec_sec(&inode_item->otime, + BTRFS_I(inode)->i_otime.tv_sec); + btrfs_set_stack_timespec_nsec(&inode_item->otime, + BTRFS_I(inode)->i_otime.tv_nsec); } int btrfs_fill_inode(struct inode *inode, u32 *rdev) { struct btrfs_delayed_node *delayed_node; struct btrfs_inode_item *inode_item; - struct btrfs_timespec *tspec; delayed_node = btrfs_get_delayed_node(inode); if (!delayed_node) @@ -1802,17 +1806,19 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) *rdev = btrfs_stack_inode_rdev(inode_item); BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); - tspec = btrfs_inode_atime(inode_item); - inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec); - inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + inode->i_atime.tv_sec = 
btrfs_stack_timespec_sec(&inode_item->atime); + inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime); + + inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime); + inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime); - tspec = btrfs_inode_mtime(inode_item); - inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec); - inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime); + inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime); - tspec = btrfs_inode_ctime(inode_item); - inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec); - inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + BTRFS_I(inode)->i_otime.tv_sec = + btrfs_stack_timespec_sec(&inode_item->otime); + BTRFS_I(inode)->i_otime.tv_nsec = + btrfs_stack_timespec_nsec(&inode_item->otime); inode->i_generation = BTRFS_I(inode)->generation; BTRFS_I(inode)->index_cnt = (u64)-1; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index ca6a3a3b6b6c..5ec03d999c37 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -440,18 +440,9 @@ leave: */ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) { - s64 writers; - DEFINE_WAIT(wait); - set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); - do { - prepare_to_wait(&fs_info->replace_wait, &wait, - TASK_UNINTERRUPTIBLE); - writers = percpu_counter_sum(&fs_info->bio_counter); - if (writers) - schedule(); - finish_wait(&fs_info->replace_wait, &wait); - } while (writers); + wait_event(fs_info->replace_wait, !percpu_counter_sum( + &fs_info->bio_counter)); } /* @@ -932,15 +923,15 @@ void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) { - DEFINE_WAIT(wait); -again: - percpu_counter_inc(&fs_info->bio_counter); - if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) { + while (1) { + percpu_counter_inc(&fs_info->bio_counter); + if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING, + &fs_info->fs_state))) + break; + btrfs_bio_counter_dec(fs_info); wait_event(fs_info->replace_wait, !test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)); - goto again; } - } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1afb18226da8..f79f38542a73 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -318,7 +318,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - printk_ratelimited(KERN_INFO + printk_ratelimited(KERN_WARNING "BTRFS: %s checksum verify failed on %llu wanted %X found %X " "level %d\n", root->fs_info->sb->s_id, buf->start, @@ -367,7 +367,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 0; goto out; } - printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n", + printk_ratelimited(KERN_ERR + "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n", eb->fs_info->sb->s_id, eb->start, parent_transid, btrfs_header_generation(eb)); ret = 1; @@ -633,21 +634,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start " + printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start " "%llu %llu\n", eb->fs_info->sb->s_id, found_start, 
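
btrfs_rm_dev_replace_blocked() above collapses an open-coded prepare_to_wait()/schedule() loop into a single wait_event() that sleeps until the bio counter drains to zero. The same drain-then-proceed shape in userspace, with a pthread condvar playing the waitqueue (a model, not the kernel API):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static long bio_counter = 3;    /* in-flight bios */

/* analogue of wait_event(wq, !percpu_counter_sum(&bio_counter)) */
static void wait_for_zero(void)
{
    pthread_mutex_lock(&lock);
    while (bio_counter)         /* condition re-checked after every wakeup */
        pthread_cond_wait(&cond, &lock);
    pthread_mutex_unlock(&lock);
}

static void *writer(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);
    while (bio_counter) {
        bio_counter--;          /* finish an in-flight bio */
        pthread_cond_broadcast(&cond);
    }
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, writer, NULL);
    wait_for_zero();
    pthread_join(t, NULL);
    puts("all bios drained; replace may proceed");
    return 0;
}
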
eb->start); ret = -EIO; goto err; } if (check_tree_block_fsid(root, eb)) { - printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n", + printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n", eb->fs_info->sb->s_id, eb->start); ret = -EIO; goto err; } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { - btrfs_info(root->fs_info, "bad tree block level %d", + btrfs_err(root->fs_info, "bad tree block level %d", (int)btrfs_header_level(eb)); ret = -EIO; goto err; @@ -1073,12 +1074,12 @@ static const struct address_space_operations btree_aops = { .set_page_dirty = btree_set_page_dirty, }; -void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) +void readahead_tree_block(struct btrfs_root *root, u64 bytenr) { struct extent_buffer *buf = NULL; struct inode *btree_inode = root->fs_info->btree_inode; - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) return; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, @@ -1086,7 +1087,7 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) free_extent_buffer(buf); } -int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, int mirror_num, struct extent_buffer **eb) { struct extent_buffer *buf = NULL; @@ -1094,7 +1095,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; int ret; - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) return 0; @@ -1125,12 +1126,11 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, } struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize) + u64 bytenr) { if (btrfs_test_is_dummy_root(root)) - return alloc_test_extent_buffer(root->fs_info, bytenr, - blocksize); - return alloc_extent_buffer(root->fs_info, bytenr, blocksize); + return alloc_test_extent_buffer(root->fs_info, bytenr); + return alloc_extent_buffer(root->fs_info, bytenr); } @@ -1152,7 +1152,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, struct extent_buffer *buf = NULL; int ret; - buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize); + buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) return NULL; @@ -1275,12 +1275,10 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); - memset(&root->root_kobj, 0, sizeof(root->root_kobj)); if (fs_info) root->defrag_trans_start = fs_info->generation; else root->defrag_trans_start = 0; - init_completion(&root->kobj_unregister); root->root_key.objectid = objectid; root->anon_dev = 0; @@ -1630,6 +1628,8 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, bool check_ref) { struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; int ret; if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) @@ -1669,8 +1669,17 @@ again: if (ret) goto fail; - ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID, - location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL); + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto fail; + } + 
key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = location->objectid; + + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); + btrfs_free_path(path); if (ret < 0) goto fail; if (ret == 0) @@ -2232,6 +2241,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); + mutex_init(&fs_info->unused_bg_unpin_mutex); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); @@ -2496,7 +2506,7 @@ int open_ctree(struct super_block *sb, features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) - printk(KERN_ERR "BTRFS: has skinny extents\n"); + printk(KERN_INFO "BTRFS: has skinny extents\n"); /* * flag our filesystem as having big metadata blocks if @@ -2520,7 +2530,7 @@ int open_ctree(struct super_block *sb, */ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && (sectorsize != nodesize)) { - printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes " + printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes " "are not allowed for mixed block groups on %s\n", sb->s_id); goto fail_alloc; @@ -2628,12 +2638,12 @@ int open_ctree(struct super_block *sb, sb->s_blocksize_bits = blksize_bits(sectorsize); if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - printk(KERN_INFO "BTRFS: valid FS not found on %s\n", sb->s_id); + printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id); goto fail_sb_buffer; } if (sectorsize != PAGE_SIZE) { - printk(KERN_WARNING "BTRFS: Incompatible sector size(%lu) " + printk(KERN_ERR "BTRFS: incompatible sector size (%lu) " "found on %s\n", (unsigned long)sectorsize, sb->s_id); goto fail_sb_buffer; } @@ -2642,7 +2652,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); if (ret) { - printk(KERN_WARNING "BTRFS: failed to read the system " + printk(KERN_ERR "BTRFS: failed to read the system " "array on %s\n", sb->s_id); goto fail_sb_buffer; } @@ -2657,7 +2667,7 @@ int open_ctree(struct super_block *sb, generation); if (!chunk_root->node || !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { - printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n", + printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2669,7 +2679,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_chunk_tree(chunk_root); if (ret) { - printk(KERN_WARNING "BTRFS: failed to read chunk tree on %s\n", + printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2681,7 +2691,7 @@ int open_ctree(struct super_block *sb, btrfs_close_extra_devices(fs_info, fs_devices, 0); if (!fs_devices->latest_bdev) { - printk(KERN_CRIT "BTRFS: failed to read devices on %s\n", + printk(KERN_ERR "BTRFS: failed to read devices on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2765,7 +2775,7 @@ retry_root_backup: ret = btrfs_recover_balance(fs_info); if (ret) { - printk(KERN_WARNING "BTRFS: failed to recover balance\n"); + printk(KERN_ERR "BTRFS: failed to recover balance\n"); goto fail_block_groups; } @@ -3860,6 +3870,21 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", btrfs_super_log_root(sb)); + /* + * Check the lower bound, the alignment and other constraints are + * checked later. 
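
Because btrfs_find_item() now asserts non-NULL path and found_key (see the ctree.c hunk earlier), btrfs_get_fs_root() switches to an explicit alloc/search/free around btrfs_search_slot(). The caller-owned-path pattern, shape only, with stub types and illustrative key values standing in for the kernel API:

#include <stdlib.h>

struct path { int slots[8]; };
struct key { unsigned long long objectid, offset; unsigned char type; };

/* stand-in for btrfs_search_slot(): <0 error, 0 found, >0 not found */
static int search_slot(const struct key *k, struct path *p)
{
    (void)k; (void)p;
    return 1;
}

int main(void)
{
    struct path *path = calloc(1, sizeof(*path)); /* btrfs_alloc_path() */
    struct key key = { .objectid = 0, .type = 48, .offset = 257 };
    int ret;

    if (!path)
        return 1;           /* -ENOMEM in the kernel */
    ret = search_slot(&key, path);
    free(path);             /* the caller frees, on every outcome */
    return ret < 0;
}
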
+ */ + if (btrfs_super_nodesize(sb) < 4096) { + printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n", + btrfs_super_nodesize(sb)); + ret = -EINVAL; + } + if (btrfs_super_sectorsize(sb) < 4096) { + printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n", + btrfs_super_sectorsize(sb)); + ret = -EINVAL; + } + if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n", fs_info->fsid, sb->dev_item.fsid); @@ -3873,6 +3898,10 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, if (btrfs_super_num_devices(sb) > (1UL << 31)) printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n", btrfs_super_num_devices(sb)); + if (btrfs_super_num_devices(sb) == 0) { + printk(KERN_ERR "BTRFS: number of devices is 0\n"); + ret = -EINVAL; + } if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) { printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n", @@ -3881,6 +3910,25 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, } /* + * Obvious sys_chunk_array corruptions, it must hold at least one key + * and one chunk + */ + if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + printk(KERN_ERR "BTRFS: system chunk array too big %u > %u\n", + btrfs_super_sys_array_size(sb), + BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); + ret = -EINVAL; + } + if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) + + sizeof(struct btrfs_chunk)) { + printk(KERN_ERR "BTRFS: system chunk array too small %u < %lu\n", + btrfs_super_sys_array_size(sb), + sizeof(struct btrfs_disk_key) + + sizeof(struct btrfs_chunk)); + ret = -EINVAL; + } + + /* * The generation is a global counter, we'll trust it more than the others * but it's still possible that it's the one that's wrong. 
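
The new superblock checks above reject obviously corrupt values early: nodesize and sectorsize below 4096, a device count of zero, and a sys_chunk_array too small for one key plus one chunk or larger than the array itself. Folded into one standalone validator (the two size bounds are placeholders for sizeof(struct btrfs_disk_key) + sizeof(struct btrfs_chunk) and BTRFS_SYSTEM_CHUNK_ARRAY_SIZE, not the real values):

#include <stdint.h>
#include <stdio.h>

#define SYS_ARRAY_MAX  2048 /* placeholder for BTRFS_SYSTEM_CHUNK_ARRAY_SIZE */
#define SYS_ENTRY_MIN  65   /* placeholder for one disk_key + one chunk */

static int check_super(uint32_t nodesize, uint32_t sectorsize,
                       uint64_t num_devices, uint32_t sys_array_size)
{
    int ret = 0;

    /* lower bounds only; alignment is checked elsewhere */
    if (nodesize < 4096) {
        fprintf(stderr, "nodesize too small: %u < 4096\n", nodesize);
        ret = -1;
    }
    if (sectorsize < 4096) {
        fprintf(stderr, "sectorsize too small: %u < 4096\n", sectorsize);
        ret = -1;
    }
    if (num_devices == 0) {
        fprintf(stderr, "number of devices is 0\n");
        ret = -1;
    }
    if (sys_array_size > SYS_ARRAY_MAX || sys_array_size < SYS_ENTRY_MIN) {
        fprintf(stderr, "bad sys chunk array size %u\n", sys_array_size);
        ret = -1;
    }
    return ret;
}

int main(void)
{
    /* a sane layout passes all four checks */
    return check_super(16384, 4096, 1, 97) ? 1 : 0;
}
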
*/ diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 414651821fb3..27d44c0fd236 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -46,11 +46,11 @@ struct btrfs_fs_devices; struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u64 parent_transid); -void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); -int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, +void readahead_tree_block(struct btrfs_root *root, u64 bytenr); +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize); + u64 bytenr); void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf); int open_ctree(struct super_block *sb, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a684086c3c81..571f402d3fc4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -74,8 +74,9 @@ enum { RESERVE_ALLOC_NO_ACCOUNT = 2, }; -static int update_block_group(struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc); +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -1925,7 +1926,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, */ ret = 0; } - kfree(bbio); + btrfs_put_bbio(bbio); } if (actual_bytes) @@ -2768,7 +2769,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head; int ret; int run_all = count == (unsigned long)-1; - int run_most = 0; /* We'll clean this up in btrfs_cleanup_transaction */ if (trans->aborted) @@ -2778,10 +2778,8 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, root = root->fs_info->tree_root; delayed_refs = &trans->transaction->delayed_refs; - if (count == 0) { + if (count == 0) count = atomic_read(&delayed_refs->num_entries) * 2; - run_most = 1; - } again: #ifdef SCRAMBLE_DELAYED_REFS @@ -3315,120 +3313,42 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_block_group_cache *cache; - int err = 0; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; struct btrfs_path *path; - u64 last = 0; + + if (list_empty(&cur_trans->dirty_bgs)) + return 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; -again: - while (1) { - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - if (cache->disk_cache_state == BTRFS_DC_CLEAR) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - err = cache_save_setup(cache, trans, path); - last = cache->key.objectid + cache->key.offset; - btrfs_put_block_group(cache); - } - - while (1) { - if (last == 0) { - err = btrfs_run_delayed_refs(trans, root, - (unsigned long)-1); - if (err) /* File system offline */ - goto out; - } - - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - if (cache->disk_cache_state == BTRFS_DC_CLEAR) { - btrfs_put_block_group(cache); - goto again; - } - - if (cache->dirty) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - - if (cache->disk_cache_state == BTRFS_DC_SETUP) - 
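
btrfs_write_dirty_block_groups() is rewritten in the hunk below from three repeated scans of all block groups into a single drain of the transaction's dirty_bgs list (populated by update_block_group() further down). A toy drain loop showing the per-group order of operations:

#include <stdio.h>

struct bg { int id; int cache_state; }; /* 0 = CLEAR, 1 = SETUP */

int main(void)
{
    struct bg dirty[] = { { 1, 0 }, { 7, 1 }, { 9, 0 } };
    int n = sizeof(dirty) / sizeof(dirty[0]);
    int ret = 0;

    for (int i = 0; i < n; i++) {   /* list_first_entry + list_del_init */
        struct bg *cache = &dirty[i];

        if (cache->cache_state == 0)
            printf("bg %d: cache_save_setup\n", cache->id);
        if (!ret)
            printf("bg %d: run delayed refs\n", cache->id);
        if (!ret && cache->cache_state == 1)
            printf("bg %d: write out free-space cache\n", cache->id);
        if (!ret)
            printf("bg %d: write_one_cache_group\n", cache->id);
    }
    return ret;
}
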
cache->disk_cache_state = BTRFS_DC_NEED_WRITE; - cache->dirty = 0; - last = cache->key.objectid + cache->key.offset; - - err = write_one_cache_group(trans, root, path, cache); - btrfs_put_block_group(cache); - if (err) /* File system offline */ - goto out; - } - - while (1) { - /* - * I don't think this is needed since we're just marking our - * preallocated extent as written, but just in case it can't - * hurt. - */ - if (last == 0) { - err = btrfs_run_delayed_refs(trans, root, - (unsigned long)-1); - if (err) /* File system offline */ - goto out; - } - - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - /* - * Really this shouldn't happen, but it could if we - * couldn't write the entire preallocated extent and - * splitting the extent resulted in a new block. - */ - if (cache->dirty) { - btrfs_put_block_group(cache); - goto again; - } - if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - - err = btrfs_write_out_cache(root, trans, cache, path); - - /* - * If we didn't have an error then the cache state is still - * NEED_WRITE, so we can set it to WRITTEN. - */ - if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE) - cache->disk_cache_state = BTRFS_DC_WRITTEN; - last = cache->key.objectid + cache->key.offset; + /* + * We don't need the lock here since we are protected by the transaction + * commit. We want to do the cache_save_setup first and then run the + * delayed refs to make sure we have the best chance at doing this all + * in one shot. + */ + while (!list_empty(&cur_trans->dirty_bgs)) { + cache = list_first_entry(&cur_trans->dirty_bgs, + struct btrfs_block_group_cache, + dirty_list); + list_del_init(&cache->dirty_list); + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + cache_save_setup(cache, trans, path); + if (!ret) + ret = btrfs_run_delayed_refs(trans, root, + (unsigned long) -1); + if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) + btrfs_write_out_cache(root, trans, cache, path); + if (!ret) + ret = write_one_cache_group(trans, root, path, cache); btrfs_put_block_group(cache); } -out: btrfs_free_path(path); - return err; + return ret; } int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) @@ -5043,19 +4963,25 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, /** * drop_outstanding_extent - drop an outstanding extent * @inode: the inode we're dropping the extent for + * @num_bytes: the number of bytes we're releasing. * * This is called when we are freeing up an outstanding extent, either called * after an error or after an extent is written. This will return the number of * reserved extents that need to be freed. This must be called with * BTRFS_I(inode)->lock held.
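The reworked drop_outstanding_extent() converts the byte count into a number of reserved extents with a round-up division by BTRFS_MAX_EXTENT_SIZE (128MiB elsewhere in this series; see the writepage_delalloc hunk below). A userspace sketch of that arithmetic, under that assumed value:

#include <stdint.h>
#include <stdio.h>

#define MAX_EXTENT_SIZE (128ULL * 1024 * 1024) /* assumed 128MiB */

/* Mirror of the div64_u64() round-up: how many max-sized extents are
 * needed to cover num_bytes of delalloc. */
static uint64_t num_extents(uint64_t num_bytes)
{
	return (num_bytes + MAX_EXTENT_SIZE - 1) / MAX_EXTENT_SIZE;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)num_extents(4096));                /* 1 */
	printf("%llu\n", (unsigned long long)num_extents(MAX_EXTENT_SIZE));     /* 1 */
	printf("%llu\n", (unsigned long long)num_extents(MAX_EXTENT_SIZE + 1)); /* 2 */
	return 0;
}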
*/ -static unsigned drop_outstanding_extent(struct inode *inode) +static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) { unsigned drop_inode_space = 0; unsigned dropped_extents = 0; + unsigned num_extents = 0; - BUG_ON(!BTRFS_I(inode)->outstanding_extents); - BTRFS_I(inode)->outstanding_extents--; + num_extents = (unsigned)div64_u64(num_bytes + + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + ASSERT(num_extents); + ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents); + BTRFS_I(inode)->outstanding_extents -= num_extents; if (BTRFS_I(inode)->outstanding_extents == 0 && test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, @@ -5226,7 +5152,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) out_fail: spin_lock(&BTRFS_I(inode)->lock); - dropped = drop_outstanding_extent(inode); + dropped = drop_outstanding_extent(inode, num_bytes); /* * If the inodes csum_bytes is the same as the original * csum_bytes then we know we haven't raced with any free()ers @@ -5305,7 +5231,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); - dropped = drop_outstanding_extent(inode); + dropped = drop_outstanding_extent(inode, num_bytes); if (num_bytes) to_free = calc_csum_metadata_size(inode, num_bytes, 0); @@ -5375,8 +5301,9 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) btrfs_free_reserved_data_space(inode, num_bytes); } -static int update_block_group(struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc) +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, int alloc) { struct btrfs_block_group_cache *cache = NULL; struct btrfs_fs_info *info = root->fs_info; @@ -5414,6 +5341,14 @@ static int update_block_group(struct btrfs_root *root, if (!alloc && cache->cached == BTRFS_CACHE_NO) cache_block_group(cache, 1); + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &trans->transaction->dirty_bgs); + btrfs_get_block_group(cache); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -5424,7 +5359,6 @@ static int update_block_group(struct btrfs_root *root, cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; - cache->dirty = 1; old_val = btrfs_block_group_used(&cache->item); num_bytes = min(total, cache->key.offset - byte_in_group); if (alloc) { @@ -5807,10 +5741,13 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, unpin = &fs_info->freed_extents[0]; while (1) { + mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, NULL); - if (ret) + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; + } if (btrfs_test_opt(root, DISCARD)) ret = btrfs_discard_extent(root, start, @@ -5818,6 +5755,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, clear_extent_dirty(unpin, start, end, GFP_NOFS); unpin_extent_range(root, start, end, true); + mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); } @@ -6103,7 +6041,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } - ret = update_block_group(root, bytenr, num_bytes, 0); + ret = update_block_group(trans, root, bytenr, num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, 
extent_root, ret); goto out; @@ -6205,7 +6143,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, u64 parent, int last_ref) { - struct btrfs_block_group_cache *cache = NULL; int pin = 1; int ret; @@ -6221,17 +6158,20 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (!last_ref) return; - cache = btrfs_lookup_block_group(root->fs_info, buf->start); - if (btrfs_header_generation(buf) == trans->transid) { + struct btrfs_block_group_cache *cache; + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, root, buf->start); if (!ret) goto out; } + cache = btrfs_lookup_block_group(root->fs_info, buf->start); + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { pin_down_extent(root, cache, buf->start, buf->len, 1); + btrfs_put_block_group(cache); goto out; } @@ -6239,6 +6179,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_add_free_space(cache, buf->start, buf->len); btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); + btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, buf->start, buf->len); pin = 0; } @@ -6253,7 +6194,6 @@ out: * anymore. */ clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); - btrfs_put_block_group(cache); } /* Can return -ENOMEM */ @@ -7063,7 +7003,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = update_block_group(root, ins->objectid, ins->offset, 1); + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); @@ -7152,7 +7092,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, return ret; } - ret = update_block_group(root, ins->objectid, root->nodesize, 1); + ret = update_block_group(trans, root, ins->objectid, root->nodesize, + 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); @@ -7217,11 +7158,11 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u32 blocksize, int level) + u64 bytenr, int level) { struct extent_buffer *buf; - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) return ERR_PTR(-ENOMEM); btrfs_set_header_generation(buf, trans->transid); @@ -7340,7 +7281,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, if (btrfs_test_is_dummy_root(root)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, - blocksize, level); + level); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; @@ -7357,8 +7298,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, return ERR_PTR(ret); } - buf = btrfs_init_new_buffer(trans, root, ins.objectid, - blocksize, level); + buf = btrfs_init_new_buffer(trans, root, ins.objectid, level); BUG_ON(IS_ERR(buf)); /* -ENOMEM */ if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { @@ -7487,7 +7427,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; } reada: - readahead_tree_block(root, bytenr, blocksize); + readahead_tree_block(root, bytenr); nread++; } wc->reada_slot = slot; @@ -7828,7 +7768,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, next = 
btrfs_find_tree_block(root, bytenr); if (!next) { - next = btrfs_find_create_tree_block(root, bytenr, blocksize); + next = btrfs_find_create_tree_block(root, bytenr); if (!next) return -ENOMEM; btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, @@ -8548,14 +8488,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, if (IS_ERR(trans)) return PTR_ERR(trans); - alloc_flags = update_block_group_flags(root, cache->flags); - if (alloc_flags != cache->flags) { - ret = do_chunk_alloc(trans, root, alloc_flags, - CHUNK_ALLOC_FORCE); - if (ret < 0) - goto out; - } - ret = set_block_group_ro(cache, 0); if (!ret) goto out; @@ -8566,6 +8498,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, goto out; ret = set_block_group_ro(cache, 0); out: + if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { + alloc_flags = update_block_group_flags(root, cache->flags); + check_system_chunk(trans, root, alloc_flags); + } + btrfs_end_transaction(trans, root); return ret; } @@ -9005,6 +8942,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) INIT_LIST_HEAD(&cache->cluster_list); INIT_LIST_HEAD(&cache->bg_list); INIT_LIST_HEAD(&cache->ro_list); + INIT_LIST_HEAD(&cache->dirty_list); btrfs_init_free_space_ctl(cache); atomic_set(&cache->trimming, 0); @@ -9068,9 +9006,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) * b) Setting 'dirty flag' makes sure that we flush * the new space cache info onto disk. */ - cache->disk_cache_state = BTRFS_DC_CLEAR; if (btrfs_test_opt(root, SPACE_CACHE)) - cache->dirty = 1; + cache->disk_cache_state = BTRFS_DC_CLEAR; } read_extent_buffer(leaf, &cache->item, @@ -9460,6 +9397,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, } } + spin_lock(&trans->transaction->dirty_bgs_lock); + if (!list_empty(&block_group->dirty_list)) { + list_del_init(&block_group->dirty_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_remove_free_space_cache(block_group); spin_lock(&block_group->space_info->lock); @@ -9611,7 +9555,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * Want to do this before we do anything else so we can recover * properly if we fail to join the transaction. */ - trans = btrfs_join_transaction(root); + /* 1 for btrfs_orphan_reserve_metadata() */ + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { btrfs_set_block_group_rw(root, block_group); ret = PTR_ERR(trans); @@ -9624,18 +9569,33 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ start = block_group->key.objectid; end = start + block_group->key.offset - 1; + /* + * Hold the unused_bg_unpin_mutex lock to avoid racing with + * btrfs_finish_extent_commit(). If we are at transaction N, + * another task might be running finish_extent_commit() for the + * previous transaction N - 1, and have seen a range belonging + * to the block group in freed_extents[] before we were able to + * clear the whole block group range from freed_extents[]. This + * means that task can look up the block group after we + * unpinned it from freed_extents[] and removed it, leading to + * a BUG_ON() at btrfs_unpin_extent_range().
+ */ + mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, EXTENT_DIRTY, GFP_NOFS); if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); btrfs_set_block_group_rw(root, block_group); goto end_trans; } ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, EXTENT_DIRTY, GFP_NOFS); if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); btrfs_set_block_group_rw(root, block_group); goto end_trans; } + mutex_unlock(&fs_info->unused_bg_unpin_mutex); /* Reset pinned so btrfs_put_block_group doesn't complain */ block_group->pinned = 0; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c73df6a7c9b6..c7233ff1d533 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -64,7 +64,7 @@ void btrfs_leak_debug_check(void) while (!list_empty(&states)) { state = list_entry(states.next, struct extent_state, leak_list); - pr_err("BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n", + pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", state->start, state->end, state->state, extent_state_in_tree(state), atomic_read(&state->refs)); @@ -396,21 +396,21 @@ static void merge_state(struct extent_io_tree *tree, } static void set_state_cb(struct extent_io_tree *tree, - struct extent_state *state, unsigned long *bits) + struct extent_state *state, unsigned *bits) { if (tree->ops && tree->ops->set_bit_hook) tree->ops->set_bit_hook(tree->mapping->host, state, bits); } static void clear_state_cb(struct extent_io_tree *tree, - struct extent_state *state, unsigned long *bits) + struct extent_state *state, unsigned *bits) { if (tree->ops && tree->ops->clear_bit_hook) tree->ops->clear_bit_hook(tree->mapping->host, state, bits); } static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, unsigned long *bits); + struct extent_state *state, unsigned *bits); /* * insert an extent_state struct into the tree. 'bits' are set on the @@ -426,7 +426,7 @@ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, struct rb_node ***p, struct rb_node **parent, - unsigned long *bits) + unsigned *bits) { struct rb_node *node; @@ -511,10 +511,10 @@ static struct extent_state *next_state(struct extent_state *state) */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - unsigned long *bits, int wake) + unsigned *bits, int wake) { struct extent_state *next; - unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS; + unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; @@ -570,7 +570,7 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) * This takes the tree lock, and returns 0 on success and < 0 on error. 
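A side note on the type change running through this file: the extent-state bits now travel as plain unsigned rather than unsigned long, which is safe because the highest flag is bit 15 (see the EXTENT_* definitions in extent_io.h below). A compilable sketch of the flag pattern, with a representative subset of those values:

#include <stdio.h>

/* Representative subset of the EXTENT_* flags; all of them fit in 16
 * bits, which is why a 32-bit unsigned suffices for extent_state state. */
#define EXTENT_DIRTY     (1U << 0)
#define EXTENT_LOCKED    (1U << 3)
#define EXTENT_DELALLOC  (1U << 5)
#define EXTENT_NORESERVE (1U << 15)

int main(void)
{
	unsigned state = EXTENT_DIRTY | EXTENT_DELALLOC;

	if (state & EXTENT_DELALLOC)
		printf("delalloc set: 0x%x\n", state);

	state &= ~EXTENT_DIRTY; /* clear a single flag */
	printf("after clear:  0x%x\n", state);
	return 0;
}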
*/ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, int wake, int delete, + unsigned bits, int wake, int delete, struct extent_state **cached_state, gfp_t mask) { @@ -789,9 +789,9 @@ out: static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - unsigned long *bits) + unsigned *bits) { - unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS; + unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; set_state_cb(tree, state, bits); if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { @@ -803,7 +803,7 @@ static void set_state_bits(struct extent_io_tree *tree, static void cache_state_if_flags(struct extent_state *state, struct extent_state **cached_ptr, - const u64 flags) + unsigned flags) { if (cached_ptr && !(*cached_ptr)) { if (!flags || (state->state & flags)) { @@ -833,7 +833,7 @@ static void cache_state(struct extent_state *state, static int __must_check __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, unsigned long exclusive_bits, + unsigned bits, unsigned exclusive_bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask) { @@ -1034,7 +1034,7 @@ search_again: } int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, u64 * failed_start, + unsigned bits, u64 * failed_start, struct extent_state **cached_state, gfp_t mask) { return __set_extent_bit(tree, start, end, bits, 0, failed_start, @@ -1060,7 +1060,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, * boundary bits like LOCK. */ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, unsigned long clear_bits, + unsigned bits, unsigned clear_bits, struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; @@ -1268,14 +1268,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, } int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, gfp_t mask) + unsigned bits, gfp_t mask) { return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); } int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, gfp_t mask) + unsigned bits, gfp_t mask) { return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); } @@ -1330,10 +1330,11 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, * us if waiting is desired. */ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, struct extent_state **cached_state) + unsigned bits, struct extent_state **cached_state) { int err; u64 failed_start; + while (1) { err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, EXTENT_LOCKED, &failed_start, @@ -1440,7 +1441,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) */ static struct extent_state * find_first_extent_bit_state(struct extent_io_tree *tree, - u64 start, unsigned long bits) + u64 start, unsigned bits) { struct rb_node *node; struct extent_state *state; @@ -1474,7 +1475,7 @@ out: * If nothing was found, 1 is returned. If found something, return 0. 
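For the return convention documented above (0 when a matching range is found, 1 otherwise), here is a loose userspace analogue over a sorted array; the kernel walks an rbtree instead, so this illustrates only the contract, not the implementation:

#include <stdio.h>

struct range { unsigned long long start, end; unsigned bits; };

/* Scan for the first range ending at or after 'from' with any of 'bits'
 * set; fill *start/*end and return 0 on a hit, 1 when nothing matches. */
static int find_first_bit_range(const struct range *r, int n,
				unsigned long long from, unsigned bits,
				unsigned long long *start,
				unsigned long long *end)
{
	for (int i = 0; i < n; i++) {
		if (r[i].end < from || !(r[i].bits & bits))
			continue;
		*start = r[i].start;
		*end = r[i].end;
		return 0;
	}
	return 1;
}

int main(void)
{
	struct range tree[] = {
		{ 0, 4095, 1u << 0 },     /* DIRTY */
		{ 8192, 12287, 1u << 5 }, /* DELALLOC */
	};
	unsigned long long s, e;

	if (!find_first_bit_range(tree, 2, 4096, 1u << 5, &s, &e))
		printf("found [%llu, %llu]\n", s, e);
	return 0;
}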
*/ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned long bits, + u64 *start_ret, u64 *end_ret, unsigned bits, struct extent_state **cached_state) { struct extent_state *state; @@ -1753,7 +1754,7 @@ out_failed: int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, struct page *locked_page, - unsigned long clear_bits, + unsigned clear_bits, unsigned long page_ops) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; @@ -1810,7 +1811,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - unsigned long bits, int contig) + unsigned bits, int contig) { struct rb_node *node; struct extent_state *state; @@ -1928,7 +1929,7 @@ out: * range is found set. */ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, int filled, struct extent_state *cached) + unsigned bits, int filled, struct extent_state *cached) { struct extent_state *state = NULL; struct rb_node *node; @@ -2057,7 +2058,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, sector = bbio->stripes[mirror_num-1].physical >> 9; bio->bi_iter.bi_sector = sector; dev = bbio->stripes[mirror_num-1].dev; - kfree(bbio); + btrfs_put_bbio(bbio); if (!dev || !dev->bdev || !dev->writeable) { bio_put(bio); return -EIO; @@ -2816,8 +2817,10 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, bio_add_page(bio, page, page_size, offset) < page_size) { ret = submit_one_bio(rw, bio, mirror_num, prev_bio_flags); - if (ret < 0) + if (ret < 0) { + *bio_ret = NULL; return ret; + } bio = NULL; } else { return 0; @@ -3239,7 +3242,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, page, &delalloc_start, &delalloc_end, - 128 * 1024 * 1024); + BTRFS_MAX_EXTENT_SIZE); if (nr_delalloc == 0) { delalloc_start = delalloc_end + 1; continue; @@ -4598,11 +4601,11 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) static struct extent_buffer * __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, - unsigned long len, gfp_t mask) + unsigned long len) { struct extent_buffer *eb = NULL; - eb = kmem_cache_zalloc(extent_buffer_cache, mask); + eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS); if (eb == NULL) return NULL; eb->start = start; @@ -4643,7 +4646,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) struct extent_buffer *new; unsigned long num_pages = num_extent_pages(src->start, src->len); - new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS); + new = __alloc_extent_buffer(src->fs_info, src->start, src->len); if (new == NULL) return NULL; @@ -4666,13 +4669,26 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) return new; } -struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb; - unsigned long num_pages = num_extent_pages(0, len); + unsigned long len; + unsigned long num_pages; unsigned long i; - eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS); + if (!fs_info) { + /* + * Called only from tests that don't always have a fs_info + * available, but we know that nodesize is 4096 + */ + len = 4096; + } else { + len = fs_info->tree_root->nodesize; + } + num_pages = num_extent_pages(0, len); + + eb = 
__alloc_extent_buffer(fs_info, start, len); if (!eb) return NULL; @@ -4762,7 +4778,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len) + u64 start) { struct extent_buffer *eb, *exists = NULL; int ret; @@ -4770,7 +4786,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, eb = find_extent_buffer(fs_info, start); if (eb) return eb; - eb = alloc_dummy_extent_buffer(start, len); + eb = alloc_dummy_extent_buffer(fs_info, start); if (!eb) return NULL; eb->fs_info = fs_info; @@ -4808,8 +4824,9 @@ free_eb: #endif struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len) + u64 start) { + unsigned long len = fs_info->tree_root->nodesize; unsigned long num_pages = num_extent_pages(start, len); unsigned long i; unsigned long index = start >> PAGE_CACHE_SHIFT; @@ -4824,7 +4841,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (eb) return eb; - eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS); + eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return NULL; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index ece9ce87edff..695b0ccfb755 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -4,22 +4,22 @@ #include <linux/rbtree.h> /* bits for the extent state */ -#define EXTENT_DIRTY 1 -#define EXTENT_WRITEBACK (1 << 1) -#define EXTENT_UPTODATE (1 << 2) -#define EXTENT_LOCKED (1 << 3) -#define EXTENT_NEW (1 << 4) -#define EXTENT_DELALLOC (1 << 5) -#define EXTENT_DEFRAG (1 << 6) -#define EXTENT_BOUNDARY (1 << 9) -#define EXTENT_NODATASUM (1 << 10) -#define EXTENT_DO_ACCOUNTING (1 << 11) -#define EXTENT_FIRST_DELALLOC (1 << 12) -#define EXTENT_NEED_WAIT (1 << 13) -#define EXTENT_DAMAGED (1 << 14) -#define EXTENT_NORESERVE (1 << 15) -#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) -#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) +#define EXTENT_DIRTY (1U << 0) +#define EXTENT_WRITEBACK (1U << 1) +#define EXTENT_UPTODATE (1U << 2) +#define EXTENT_LOCKED (1U << 3) +#define EXTENT_NEW (1U << 4) +#define EXTENT_DELALLOC (1U << 5) +#define EXTENT_DEFRAG (1U << 6) +#define EXTENT_BOUNDARY (1U << 9) +#define EXTENT_NODATASUM (1U << 10) +#define EXTENT_DO_ACCOUNTING (1U << 11) +#define EXTENT_FIRST_DELALLOC (1U << 12) +#define EXTENT_NEED_WAIT (1U << 13) +#define EXTENT_DAMAGED (1U << 14) +#define EXTENT_NORESERVE (1U << 15) +#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) /* * flags for bio submission. 
The high bits indicate the compression @@ -81,9 +81,9 @@ struct extent_io_ops { int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); void (*set_bit_hook)(struct inode *inode, struct extent_state *state, - unsigned long *bits); + unsigned *bits); void (*clear_bit_hook)(struct inode *inode, struct extent_state *state, - unsigned long *bits); + unsigned *bits); void (*merge_extent_hook)(struct inode *inode, struct extent_state *new, struct extent_state *other); @@ -108,7 +108,7 @@ struct extent_state { /* ADD NEW ELEMENTS AFTER THIS */ wait_queue_head_t wq; atomic_t refs; - unsigned long state; + unsigned state; /* for use by the FS */ u64 private; @@ -188,7 +188,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, int try_release_extent_buffer(struct page *page); int lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, struct extent_state **cached); + unsigned bits, struct extent_state **cached); int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end); int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached, gfp_t mask); @@ -202,21 +202,21 @@ void extent_io_exit(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, unsigned long bits, int contig); + u64 max_bytes, unsigned bits, int contig); void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, int filled, + unsigned bits, int filled, struct extent_state *cached_state); int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, gfp_t mask); + unsigned bits, gfp_t mask); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, int wake, int delete, + unsigned bits, int wake, int delete, struct extent_state **cached, gfp_t mask); int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, gfp_t mask); + unsigned bits, gfp_t mask); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, u64 *failed_start, + unsigned bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); @@ -229,14 +229,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, unsigned long clear_bits, + unsigned bits, unsigned clear_bits, struct extent_state **cached_state, gfp_t mask); int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned long bits, + u64 *start_ret, u64 *end_ret, unsigned bits, struct extent_state **cached_state); int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset); @@ -262,8 +262,9 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page 
*page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len); -struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len); + u64 start); +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start); struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); @@ -322,7 +323,7 @@ int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, struct page *locked_page, - unsigned long bits_to_clear, + unsigned bits_to_clear, unsigned long page_ops); struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, @@ -377,5 +378,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode, u64 *end, u64 max_bytes); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len); + u64 start); #endif diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d6c03f7f136b..a71978578fa7 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -651,15 +651,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct io_ctl io_ctl; struct btrfs_key key; struct btrfs_free_space *e, *n; - struct list_head bitmaps; + LIST_HEAD(bitmaps); u64 num_entries; u64 num_bitmaps; u64 generation; u8 type; int ret = 0; - INIT_LIST_HEAD(&bitmaps); - /* Nothing in the space cache, goodbye */ if (!i_size_read(inode)) return 0; @@ -1243,6 +1241,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct inode *inode; int ret = 0; + enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN; root = root->fs_info->tree_root; @@ -1266,9 +1265,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, path, block_group->key.objectid); if (ret) { - spin_lock(&block_group->lock); - block_group->disk_cache_state = BTRFS_DC_ERROR; - spin_unlock(&block_group->lock); + dcs = BTRFS_DC_ERROR; ret = 0; #ifdef DEBUG btrfs_err(root->fs_info, @@ -1277,6 +1274,9 @@ int btrfs_write_out_cache(struct btrfs_root *root, #endif } + spin_lock(&block_group->lock); + block_group->disk_cache_state = dcs; + spin_unlock(&block_group->lock); iput(inode); return ret; } @@ -2903,7 +2903,6 @@ int btrfs_find_space_cluster(struct btrfs_root *root, trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, min_bytes); - INIT_LIST_HEAD(&bitmaps); ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, bytes + empty_size, cont1_bytes, min_bytes); diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 8ffa4783cbf4..265e03c73f4d 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -344,6 +344,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, return -ENOMEM; path->leave_spinning = 1; + path->skip_release_on_error = 1; ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len); if (ret == -EEXIST) { @@ -362,8 +363,12 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ptr = (unsigned long)(ref + 1); ret = 0; } else if (ret < 0) { - if (ret == -EOVERFLOW) - ret = -EMLINK; + if (ret == -EOVERFLOW) { + if (find_name_in_backref(path, name, name_len, &ref)) + ret 
= -EEXIST; + else + ret = -EMLINK; + } goto out; } else { ref = btrfs_item_ptr(path->nodes[0], path->slots[0], diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 54bcf639d1cf..a85c23dfcddb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1530,10 +1530,45 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, static void btrfs_split_extent_hook(struct inode *inode, struct extent_state *orig, u64 split) { + u64 size; + /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) return; + size = orig->end - orig->start + 1; + if (size > BTRFS_MAX_EXTENT_SIZE) { + u64 num_extents; + u64 new_size; + + /* + * We need the largest size of the remaining extent to see if we + * need to add a new outstanding extent. Think of the following + * case + * + * [MAX_EXTENT_SIZEx2 - 4k][4k] + * + * The new_size would just be 4k and we'd think we had enough + * outstanding extents for this if we only took one side of the + * split; the same goes for the other direction. We need to see if + * the larger size still is the same amount of extents as the + * original size, because if it is we need to add a new + * outstanding extent. But if we split up and the larger size + * is less than the original then we are good to go since we've + * already accounted for the extra extent in our original + * accounting. + */ + new_size = orig->end - split + 1; + if ((split - orig->start) > new_size) + new_size = split - orig->start; + + num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE) < num_extents) + return; + } + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); @@ -1549,10 +1584,34 @@ static void btrfs_merge_extent_hook(struct inode *inode, struct extent_state *new, struct extent_state *other) { + u64 new_size, old_size; + u64 num_extents; + /* not delalloc, ignore it */ if (!(other->state & EXTENT_DELALLOC)) return; + old_size = other->end - other->start + 1; + new_size = old_size + (new->end - new->start + 1); + + /* we're not bigger than the max, unreserve the space and go */ + if (new_size <= BTRFS_MAX_EXTENT_SIZE) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->lock); + return; + } + + /* + * If we grew by another max_extent, just return; we want to keep that + * reserved amount. + */ + num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE) > num_extents) + return; + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents--; spin_unlock(&BTRFS_I(inode)->lock); @@ -1604,7 +1663,7 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, * have pending delalloc work to be done.
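The split hook above charges one extra outstanding extent only when the larger remaining piece still spans as many max-sized extents as the original range did. A standalone sketch of that decision, again assuming the 128MiB BTRFS_MAX_EXTENT_SIZE:

#include <stdint.h>
#include <stdio.h>

#define MAX_EXTENT (128ULL * 1024 * 1024) /* assumed BTRFS_MAX_EXTENT_SIZE */

static uint64_t extents_for(uint64_t bytes)
{
	return (bytes + MAX_EXTENT - 1) / MAX_EXTENT;
}

/* Return 1 when splitting [start, end] at 'split' must charge another
 * outstanding extent, following the rule in btrfs_split_extent_hook(). */
static int split_needs_new_extent(uint64_t start, uint64_t end, uint64_t split)
{
	uint64_t size = end - start + 1;
	uint64_t left = split - start;
	uint64_t right = end - split + 1;
	uint64_t larger = left > right ? left : right;

	if (size <= MAX_EXTENT)
		return 1; /* ranges within one max extent always pay */
	/* Charge only if the larger piece still needs as many extents
	 * as the whole range did before the split. */
	return extents_for(larger) == extents_for(size);
}

int main(void)
{
	/* The [MAX_EXTENT_SIZEx2 - 4k][4k] case from the comment above. */
	uint64_t end = 2 * MAX_EXTENT - 1;

	printf("%d\n", split_needs_new_extent(0, end, end - 4096 + 1)); /* 1 */
	return 0;
}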
*/ static void btrfs_set_bit_hook(struct inode *inode, - struct extent_state *state, unsigned long *bits) + struct extent_state *state, unsigned *bits) { if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) @@ -1645,9 +1704,11 @@ static void btrfs_set_bit_hook(struct inode *inode, */ static void btrfs_clear_bit_hook(struct inode *inode, struct extent_state *state, - unsigned long *bits) + unsigned *bits) { u64 len = state->end + 1 - state->start; + u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1, + BTRFS_MAX_EXTENT_SIZE); spin_lock(&BTRFS_I(inode)->lock); if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) @@ -1667,7 +1728,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, *bits &= ~EXTENT_FIRST_DELALLOC; } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents--; + BTRFS_I(inode)->outstanding_extents -= num_extents; spin_unlock(&BTRFS_I(inode)->lock); } @@ -2945,7 +3006,7 @@ static int __readpage_endio_check(struct inode *inode, return 0; zeroit: if (__ratelimit(&_rs)) - btrfs_info(BTRFS_I(inode)->root->fs_info, + btrfs_warn(BTRFS_I(inode)->root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", btrfs_ino(inode), start, csum, csum_expected); memset(kaddr + pgoff, 1, len); @@ -3407,7 +3468,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) out: if (ret) - btrfs_crit(root->fs_info, + btrfs_err(root->fs_info, "could not do orphan cleanup %d", ret); btrfs_free_path(path); return ret; @@ -3490,7 +3551,6 @@ static void btrfs_read_locked_inode(struct inode *inode) struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; - struct btrfs_timespec *tspec; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; unsigned long ptr; @@ -3527,17 +3587,19 @@ static void btrfs_read_locked_inode(struct inode *inode) i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); - tspec = btrfs_inode_atime(inode_item); - inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); - inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); + inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); + + inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime); + inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime); - tspec = btrfs_inode_mtime(inode_item); - inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); - inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime); + inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime); - tspec = btrfs_inode_ctime(inode_item); - inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); - inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + BTRFS_I(inode)->i_otime.tv_sec = + btrfs_timespec_sec(leaf, &inode_item->otime); + BTRFS_I(inode)->i_otime.tv_nsec = + btrfs_timespec_nsec(leaf, &inode_item->otime); inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); @@ -3656,21 +3718,26 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), + btrfs_set_token_timespec_sec(leaf, 
&item->atime, inode->i_atime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), + btrfs_set_token_timespec_nsec(leaf, &item->atime, inode->i_atime.tv_nsec, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), + btrfs_set_token_timespec_sec(leaf, &item->mtime, inode->i_mtime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), + btrfs_set_token_timespec_nsec(leaf, &item->mtime, inode->i_mtime.tv_nsec, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), + btrfs_set_token_timespec_sec(leaf, &item->ctime, inode->i_ctime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), + btrfs_set_token_timespec_nsec(leaf, &item->ctime, inode->i_ctime.tv_nsec, &token); + btrfs_set_token_timespec_sec(leaf, &item->otime, + BTRFS_I(inode)->i_otime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, &item->otime, + BTRFS_I(inode)->i_otime.tv_nsec, &token); + btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), &token); btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, @@ -5007,6 +5074,7 @@ static int fixup_tree_root_location(struct btrfs_root *root, struct btrfs_root *new_root; struct btrfs_root_ref *ref; struct extent_buffer *leaf; + struct btrfs_key key; int ret; int err = 0; @@ -5017,9 +5085,12 @@ static int fixup_tree_root_location(struct btrfs_root *root, } err = -ENOENT; - ret = btrfs_find_item(root->fs_info->tree_root, path, - BTRFS_I(dir)->root->root_key.objectid, - location->objectid, BTRFS_ROOT_REF_KEY, NULL); + key.objectid = BTRFS_I(dir)->root->root_key.objectid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = location->objectid; + + ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path, + 0, 0); if (ret) { if (ret < 0) err = ret; @@ -5258,7 +5329,10 @@ static struct inode *new_simple_dir(struct super_block *s, inode->i_op = &btrfs_dir_ro_inode_operations; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = CURRENT_TIME; + inode->i_atime = inode->i_mtime; + inode->i_ctime = inode->i_mtime; + BTRFS_I(inode)->i_otime = inode->i_mtime; return inode; } @@ -5826,7 +5900,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode_init_owner(inode, dir, mode); inode_set_bytes(inode, 0); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + inode->i_mtime = CURRENT_TIME; + inode->i_atime = inode->i_mtime; + inode->i_ctime = inode->i_mtime; + BTRFS_I(inode)->i_otime = inode->i_mtime; + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item, @@ -7134,11 +7213,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; u64 len = bh_result->b_size; + u64 orig_len = len; int unlock_bits = EXTENT_LOCKED; int ret = 0; if (create) - unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; + unlock_bits |= EXTENT_DIRTY; else len = min_t(u64, len, root->sectorsize); @@ -7269,14 +7349,12 @@ unlock: if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); - - ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockstart + len - 1, EXTENT_DELALLOC, NULL, - &cached_state, GFP_NOFS); - 
BUG_ON(ret); + if (len < orig_len) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + } + btrfs_free_reserved_data_space(inode, len); } /* @@ -7805,8 +7883,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, } /* async crcs make it difficult to collect full stripe writes. */ - if (btrfs_get_alloc_profile(root, 1) & - (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) + if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK) async_submit = 0; else async_submit = 1; @@ -8053,8 +8130,6 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, count - (size_t)ret); - else - btrfs_delalloc_release_metadata(inode, 0); } out: if (wakeup) @@ -8575,6 +8650,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->delayed_node = NULL; + ei->i_otime.tv_sec = 0; + ei->i_otime.tv_nsec = 0; + inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); extent_io_tree_init(&ei->io_tree, &inode->i_data); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 48b60dbf807f..97159a8e91d4 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1431,9 +1431,8 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, qgroup = u64_to_ptr(unode->aux); qgroup->rfer += sign * oper->num_bytes; qgroup->rfer_cmpr += sign * oper->num_bytes; + WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes); qgroup->excl += sign * oper->num_bytes; - if (sign < 0) - WARN_ON(qgroup->excl < oper->num_bytes); qgroup->excl_cmpr += sign * oper->num_bytes; qgroup_dirty(fs_info, qgroup); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 8ab2a17bbba8..5264858ed768 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -58,15 +58,6 @@ */ #define RBIO_CACHE_READY_BIT 3 -/* - * bbio and raid_map is managed by the caller, so we shouldn't free - * them here. And besides that, all rbios with this flag should not - * be cached, because we need raid_map to check the rbios' stripe - * is the same or not, but it is very likely that the caller has - * free raid_map, so don't cache those rbios. - */ -#define RBIO_HOLD_BBIO_MAP_BIT 4 - #define RBIO_CACHE_SIZE 1024 enum btrfs_rbio_ops { @@ -79,13 +70,6 @@ struct btrfs_raid_bio { struct btrfs_fs_info *fs_info; struct btrfs_bio *bbio; - /* - * logical block numbers for the start of each stripe - * The last one or two are p/q. These are sorted, - * so raid_map[0] is the start of our full stripe - */ - u64 *raid_map; - /* while we're doing rmw on a stripe * we put it into a hash table so we can * lock the stripe and merge more rbios @@ -303,7 +287,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) */ static int rbio_bucket(struct btrfs_raid_bio *rbio) { - u64 num = rbio->raid_map[0]; + u64 num = rbio->bbio->raid_map[0]; /* * we shift down quite a bit. 
We're using byte @@ -606,8 +590,8 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, test_bit(RBIO_CACHE_BIT, &cur->flags)) return 0; - if (last->raid_map[0] != - cur->raid_map[0]) + if (last->bbio->raid_map[0] != + cur->bbio->raid_map[0]) return 0; /* we can't merge with different operations */ @@ -689,7 +673,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) spin_lock_irqsave(&h->lock, flags); list_for_each_entry(cur, &h->hash_list, hash_list) { walk++; - if (cur->raid_map[0] == rbio->raid_map[0]) { + if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) { spin_lock(&cur->bio_list_lock); /* can we steal this cached rbio's pages? */ @@ -841,21 +825,6 @@ done_nolock: remove_rbio_from_cache(rbio); } -static inline void -__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need) -{ - if (need) { - kfree(raid_map); - kfree(bbio); - } -} - -static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio) -{ - __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map, - !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags)); -} - static void __free_raid_bio(struct btrfs_raid_bio *rbio) { int i; @@ -875,8 +844,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio) } } - free_bbio_and_raid_map(rbio); - + btrfs_put_bbio(rbio->bbio); kfree(rbio); } @@ -985,8 +953,7 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) * this does not allocate any pages for rbio->pages. */ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len) + struct btrfs_bio *bbio, u64 stripe_len) { struct btrfs_raid_bio *rbio; int nr_data = 0; @@ -1007,7 +974,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, INIT_LIST_HEAD(&rbio->stripe_cache); INIT_LIST_HEAD(&rbio->hash_list); rbio->bbio = bbio; - rbio->raid_map = raid_map; rbio->fs_info = root->fs_info; rbio->stripe_len = stripe_len; rbio->nr_pages = num_pages; @@ -1028,10 +994,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, rbio->bio_pages = p + sizeof(struct page *) * num_pages; rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2; - if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE) + if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) + nr_data = real_stripes - 1; + else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) nr_data = real_stripes - 2; else - nr_data = real_stripes - 1; + BUG(); rbio->nr_data = nr_data; return rbio; @@ -1182,7 +1150,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) spin_lock_irq(&rbio->bio_list_lock); bio_list_for_each(bio, &rbio->bio_list) { start = (u64)bio->bi_iter.bi_sector << 9; - stripe_offset = start - rbio->raid_map[0]; + stripe_offset = start - rbio->bbio->raid_map[0]; page_index = stripe_offset >> PAGE_CACHE_SHIFT; for (i = 0; i < bio->bi_vcnt; i++) { @@ -1402,7 +1370,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, logical <<= 9; for (i = 0; i < rbio->nr_data; i++) { - stripe_start = rbio->raid_map[i]; + stripe_start = rbio->bbio->raid_map[i]; if (logical >= stripe_start && logical < stripe_start + rbio->stripe_len) { return i; @@ -1776,17 +1744,16 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) * our main entry point for writes from the rest of the FS. 
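With raid_map folded into struct btrfs_bio, alloc_rbio() now derives the number of data stripes from the block-group type instead of looking for a RAID6_Q_STRIPE marker. A toy C sketch of that mapping (the flag values here are made up; the real BTRFS_BLOCK_GROUP_* constants differ):

#include <stdio.h>

/* Made-up flag values; the real BTRFS_BLOCK_GROUP_* constants differ. */
#define BG_RAID5 (1u << 0)
#define BG_RAID6 (1u << 1)

/* Parity count now comes from the block-group type: one parity stripe
 * (P) for raid5, two (P and Q) for raid6. */
static int nr_data_stripes(unsigned map_type, int real_stripes)
{
	if (map_type & BG_RAID5)
		return real_stripes - 1;
	if (map_type & BG_RAID6)
		return real_stripes - 2;
	return -1; /* not a parity profile */
}

int main(void)
{
	printf("raid5, 4 stripes -> %d data\n", nr_data_stripes(BG_RAID5, 4));
	printf("raid6, 6 stripes -> %d data\n", nr_data_stripes(BG_RAID6, 6));
	return 0;
}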
*/ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len) + struct btrfs_bio *bbio, u64 stripe_len) { struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; int ret; - rbio = alloc_rbio(root, bbio, raid_map, stripe_len); + rbio = alloc_rbio(root, bbio, stripe_len); if (IS_ERR(rbio)) { - __free_bbio_and_raid_map(bbio, raid_map, 1); + btrfs_put_bbio(bbio); return PTR_ERR(rbio); } bio_list_add(&rbio->bio_list, bio); @@ -1885,9 +1852,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) } /* all raid6 handling here */ - if (rbio->raid_map[rbio->real_stripes - 1] == - RAID6_Q_STRIPE) { - + if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) { /* * single failure, rebuild from parity raid5 * style @@ -1922,8 +1887,9 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) * here due to a crc mismatch and we can't give them the * data they want */ - if (rbio->raid_map[failb] == RAID6_Q_STRIPE) { - if (rbio->raid_map[faila] == RAID5_P_STRIPE) { + if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) { + if (rbio->bbio->raid_map[faila] == + RAID5_P_STRIPE) { err = -EIO; goto cleanup; } @@ -1934,7 +1900,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) goto pstripe; } - if (rbio->raid_map[failb] == RAID5_P_STRIPE) { + if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) { raid6_datap_recov(rbio->real_stripes, PAGE_SIZE, faila, pointers); } else { @@ -2001,8 +1967,7 @@ cleanup: cleanup_io: if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { - if (err == 0 && - !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags)) + if (err == 0) cache_rbio_pages(rbio); else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); @@ -2156,15 +2121,16 @@ cleanup: * of the drive. 
*/ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len, int mirror_num, int generic_io) + struct btrfs_bio *bbio, u64 stripe_len, + int mirror_num, int generic_io) { struct btrfs_raid_bio *rbio; int ret; - rbio = alloc_rbio(root, bbio, raid_map, stripe_len); + rbio = alloc_rbio(root, bbio, stripe_len); if (IS_ERR(rbio)) { - __free_bbio_and_raid_map(bbio, raid_map, generic_io); + if (generic_io) + btrfs_put_bbio(bbio); return PTR_ERR(rbio); } @@ -2175,7 +2141,8 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { BUG(); - __free_bbio_and_raid_map(bbio, raid_map, generic_io); + if (generic_io) + btrfs_put_bbio(bbio); kfree(rbio); return -EIO; } @@ -2184,7 +2151,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, btrfs_bio_counter_inc_noblocked(root->fs_info); rbio->generic_bio_cnt = 1; } else { - set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags); + btrfs_get_bbio(bbio); } /* @@ -2240,14 +2207,14 @@ static void read_rebuild_work(struct btrfs_work *work) struct btrfs_raid_bio * raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len, struct btrfs_device *scrub_dev, + struct btrfs_bio *bbio, u64 stripe_len, + struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors) { struct btrfs_raid_bio *rbio; int i; - rbio = alloc_rbio(root, bbio, raid_map, stripe_len); + rbio = alloc_rbio(root, bbio, stripe_len); if (IS_ERR(rbio)) return NULL; bio_list_add(&rbio->bio_list, bio); @@ -2279,10 +2246,10 @@ void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, int stripe_offset; int index; - ASSERT(logical >= rbio->raid_map[0]); - ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] + + ASSERT(logical >= rbio->bbio->raid_map[0]); + ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] + rbio->stripe_len * rbio->nr_data); - stripe_offset = (int)(logical - rbio->raid_map[0]); + stripe_offset = (int)(logical - rbio->bbio->raid_map[0]); index = stripe_offset >> PAGE_CACHE_SHIFT; rbio->bio_pages[index] = page; } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 31d4a157b5e3..2b5d7977d83b 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -43,16 +43,15 @@ struct btrfs_raid_bio; struct btrfs_device; int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len, int mirror_num, int generic_io); + struct btrfs_bio *bbio, u64 stripe_len, + int mirror_num, int generic_io); int raid56_parity_write(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len); + struct btrfs_bio *bbio, u64 stripe_len); struct btrfs_raid_bio * raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len, struct btrfs_device *scrub_dev, + struct btrfs_bio *bbio, u64 stripe_len, + struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, u64 logical); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index b63ae20618fb..0e7beea92b4c 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -66,7 +66,6 @@ struct reada_extctl { struct reada_extent { u64 logical; struct btrfs_key top; - u32 blocksize; int err; struct list_head extctl; int refcnt; @@ -349,7 +348,6 @@ static struct 
reada_extent *reada_find_extent(struct btrfs_root *root, blocksize = root->nodesize; re->logical = logical; - re->blocksize = blocksize; re->top = *top; INIT_LIST_HEAD(&re->extctl); spin_lock_init(&re->lock); @@ -463,7 +461,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, spin_unlock(&fs_info->reada_lock); btrfs_dev_replace_unlock(&fs_info->dev_replace); - kfree(bbio); + btrfs_put_bbio(bbio); return re; error: @@ -488,7 +486,7 @@ error: kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); } - kfree(bbio); + btrfs_put_bbio(bbio); kfree(re); return re_exist; } @@ -660,7 +658,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, int mirror_num = 0; struct extent_buffer *eb = NULL; u64 logical; - u32 blocksize; int ret; int i; int need_kick = 0; @@ -694,7 +691,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->reada_lock); return 0; } - dev->reada_next = re->logical + re->blocksize; + dev->reada_next = re->logical + fs_info->tree_root->nodesize; re->refcnt++; spin_unlock(&fs_info->reada_lock); @@ -709,7 +706,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, } } logical = re->logical; - blocksize = re->blocksize; spin_lock(&re->lock); if (re->scheduled_for == NULL) { @@ -724,8 +720,8 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, return 0; atomic_inc(&dev->reada_in_flight); - ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize, - mirror_num, &eb); + ret = reada_tree_block_flagged(fs_info->extent_root, logical, + mirror_num, &eb); if (ret) __readahead_hook(fs_info->extent_root, NULL, logical, ret); else if (eb) @@ -851,7 +847,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) break; printk(KERN_DEBUG " re: logical %llu size %u empty %d for %lld", - re->logical, re->blocksize, + re->logical, fs_info->tree_root->nodesize, list_empty(&re->extctl), re->scheduled_for ? re->scheduled_for->devid : -1); @@ -886,7 +882,8 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) } printk(KERN_DEBUG "re: logical %llu size %u list empty %d for %lld", - re->logical, re->blocksize, list_empty(&re->extctl), + re->logical, fs_info->tree_root->nodesize, + list_empty(&re->extctl), re->scheduled_for ? 
re->scheduled_for->devid : -1); for (i = 0; i < re->nzones; ++i) { printk(KERN_CONT " zone %llu-%llu devs", diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 74257d6436ad..d83085381bcc 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2855,9 +2855,10 @@ static void update_processed_blocks(struct reloc_control *rc, } } -static int tree_block_processed(u64 bytenr, u32 blocksize, - struct reloc_control *rc) +static int tree_block_processed(u64 bytenr, struct reloc_control *rc) { + u32 blocksize = rc->extent_root->nodesize; + if (test_range_bit(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) return 1; @@ -2965,8 +2966,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, while (rb_node) { block = rb_entry(rb_node, struct tree_block, rb_node); if (!block->key_ready) - readahead_tree_block(rc->extent_root, block->bytenr, - block->key.objectid); + readahead_tree_block(rc->extent_root, block->bytenr); rb_node = rb_next(rb_node); } @@ -3353,7 +3353,7 @@ static int __add_tree_block(struct reloc_control *rc, bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info, SKINNY_METADATA); - if (tree_block_processed(bytenr, blocksize, rc)) + if (tree_block_processed(bytenr, rc)) return 0; if (tree_search(blocks, bytenr)) @@ -3611,7 +3611,7 @@ static int find_data_references(struct reloc_control *rc, if (added) goto next; - if (!tree_block_processed(leaf->start, leaf->len, rc)) { + if (!tree_block_processed(leaf->start, rc)) { block = kmalloc(sizeof(*block), GFP_NOFS); if (!block) { err = -ENOMEM; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e427cb7ee12c..ec57687c9a4d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -66,7 +66,6 @@ struct scrub_ctx; struct scrub_recover { atomic_t refs; struct btrfs_bio *bbio; - u64 *raid_map; u64 map_length; }; @@ -80,7 +79,7 @@ struct scrub_page { u64 logical; u64 physical; u64 physical_for_dev_replace; - atomic_t ref_count; + atomic_t refs; struct { unsigned int mirror_num:8; unsigned int have_csum:1; @@ -113,7 +112,7 @@ struct scrub_block { struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; int page_count; atomic_t outstanding_pages; - atomic_t ref_count; /* free mem on transition to zero */ + atomic_t refs; /* free mem on transition to zero */ struct scrub_ctx *sctx; struct scrub_parity *sparity; struct { @@ -142,7 +141,7 @@ struct scrub_parity { int stripe_len; - atomic_t ref_count; + atomic_t refs; struct list_head spages; @@ -194,6 +193,15 @@ struct scrub_ctx { */ struct btrfs_scrub_progress stat; spinlock_t stat_lock; + + /* + * Use a ref counter to avoid use-after-free issues. Scrub workers + * decrement bios_in_flight and workers_pending and then do a wakeup + * on the list_wait wait queue. We must ensure the main scrub task + * doesn't free the scrub context before or while the workers are + * doing the wakeup() call. 
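The new refs counter follows the usual get/put pattern: the context starts at one reference, every in-flight bio or pending worker takes another, and whoever drops the last one frees the context. A userspace sketch of the lifetime rule, using C11 atomics as a stand-in for the kernel's atomic_t:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct ctx {
	atomic_int refs;
};

static struct ctx *ctx_alloc(void)
{
	struct ctx *c = malloc(sizeof(*c));

	if (c)
		atomic_init(&c->refs, 1); /* the owner's reference */
	return c;
}

static void ctx_get(struct ctx *c)
{
	atomic_fetch_add(&c->refs, 1);
}

static void ctx_put(struct ctx *c)
{
	/* fetch_sub returns the old value; 1 means we dropped the last ref */
	if (atomic_fetch_sub(&c->refs, 1) == 1) {
		puts("last reference dropped, freeing ctx");
		free(c);
	}
}

int main(void)
{
	struct ctx *c = ctx_alloc();

	if (!c)
		return 1;
	ctx_get(c); /* e.g. a bio goes in flight */
	ctx_put(c); /* bio completion path */
	ctx_put(c); /* owner drops its reference -> free */
	return 0;
}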
+ */ + atomic_t refs; }; struct scrub_fixup_nodatasum { @@ -236,10 +244,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx); static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); -static int scrub_setup_recheck_block(struct scrub_ctx *sctx, - struct btrfs_fs_info *fs_info, - struct scrub_block *original_sblock, - u64 length, u64 logical, +static int scrub_setup_recheck_block(struct scrub_block *original_sblock, struct scrub_block *sblocks_for_recheck); static void scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int is_metadata, @@ -251,8 +256,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, const u8 *csum, u64 generation, u16 csum_size); static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int force_write); + struct scrub_block *sblock_good); static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int page_num, int force_write); @@ -302,10 +306,12 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, static void copy_nocow_pages_worker(struct btrfs_work *work); static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); +static void scrub_put_ctx(struct scrub_ctx *sctx); static void scrub_pending_bio_inc(struct scrub_ctx *sctx) { + atomic_inc(&sctx->refs); atomic_inc(&sctx->bios_in_flight); } @@ -313,6 +319,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx) { atomic_dec(&sctx->bios_in_flight); wake_up(&sctx->list_wait); + scrub_put_ctx(sctx); } static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) @@ -346,6 +353,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) { struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + atomic_inc(&sctx->refs); /* * increment scrubs_running to prevent cancel requests from * completing as long as a worker is running. 
we must also @@ -388,6 +396,7 @@ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx) atomic_dec(&sctx->workers_pending); wake_up(&fs_info->scrub_pause_wait); wake_up(&sctx->list_wait); + scrub_put_ctx(sctx); } static void scrub_free_csums(struct scrub_ctx *sctx) @@ -433,6 +442,12 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) kfree(sctx); } +static void scrub_put_ctx(struct scrub_ctx *sctx) +{ + if (atomic_dec_and_test(&sctx->refs)) + scrub_free_ctx(sctx); +} + static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { @@ -457,6 +472,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sctx = kzalloc(sizeof(*sctx), GFP_NOFS); if (!sctx) goto nomem; + atomic_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->pages_per_rd_bio = pages_per_rd_bio; sctx->curr = -1; @@ -520,6 +536,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, struct inode_fs_paths *ipath = NULL; struct btrfs_root *local_root; struct btrfs_key root_key; + struct btrfs_key key; root_key.objectid = root; root_key.type = BTRFS_ROOT_ITEM_KEY; @@ -530,7 +547,14 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, goto err; } - ret = inode_item_info(inum, 0, local_root, swarn->path); + /* + * this makes the path point to (inum INODE_ITEM ioff) + */ + key.objectid = inum; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0); if (ret) { btrfs_release_path(swarn->path); goto err; @@ -848,8 +872,7 @@ static inline void scrub_get_recover(struct scrub_recover *recover) static inline void scrub_put_recover(struct scrub_recover *recover) { if (atomic_dec_and_test(&recover->refs)) { - kfree(recover->bbio); - kfree(recover->raid_map); + btrfs_put_bbio(recover->bbio); kfree(recover); } } @@ -955,8 +978,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) } /* setup the context, map the logical blocks and alloc the pages */ - ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length, - logical, sblocks_for_recheck); + ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck); if (ret) { spin_lock(&sctx->stat_lock); sctx->stat.read_errors++; @@ -1030,9 +1052,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (!is_metadata && !have_csum) { struct scrub_fixup_nodatasum *fixup_nodatasum; -nodatasum_case: WARN_ON(sctx->is_dev_replace); +nodatasum_case: + /* * !is_metadata and !have_csum, this means that the data * might not be COW'ed, that it might be modified @@ -1091,76 +1114,20 @@ nodatasum_case: sblock_other->no_io_error_seen) { if (sctx->is_dev_replace) { scrub_write_block_to_dev_replace(sblock_other); + goto corrected_error; } else { - int force_write = is_metadata || have_csum; - ret = scrub_repair_block_from_good_copy( - sblock_bad, sblock_other, - force_write); + sblock_bad, sblock_other); + if (!ret) + goto corrected_error; } - if (0 == ret) - goto corrected_error; } } - /* - * for dev_replace, pick good pages and write to the target device. 
- */ - if (sctx->is_dev_replace) { - success = 1; - for (page_num = 0; page_num < sblock_bad->page_count; - page_num++) { - int sub_success; - - sub_success = 0; - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; - mirror_index++) { - struct scrub_block *sblock_other = - sblocks_for_recheck + mirror_index; - struct scrub_page *page_other = - sblock_other->pagev[page_num]; - - if (!page_other->io_error) { - ret = scrub_write_page_to_dev_replace( - sblock_other, page_num); - if (ret == 0) { - /* succeeded for this page */ - sub_success = 1; - break; - } else { - btrfs_dev_replace_stats_inc( - &sctx->dev_root-> - fs_info->dev_replace. - num_write_errors); - } - } - } - - if (!sub_success) { - /* - * did not find a mirror to fetch the page - * from. scrub_write_page_to_dev_replace() - * handles this case (page->io_error), by - * filling the block with zeros before - * submitting the write request - */ - success = 0; - ret = scrub_write_page_to_dev_replace( - sblock_bad, page_num); - if (ret) - btrfs_dev_replace_stats_inc( - &sctx->dev_root->fs_info-> - dev_replace.num_write_errors); - } - } - - goto out; - } + if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace) + goto did_not_correct_error; /* - * for regular scrub, repair those pages that are errored. * In case of I/O errors in the area that is supposed to be * repaired, continue by picking good copies of those pages. * Select the good pages from mirrors to rewrite bad pages from @@ -1184,44 +1151,64 @@ nodatasum_case: * mirror, even if other 512 byte sectors in the same PAGE_SIZE * area are unreadable. */ - - /* can only fix I/O errors from here on */ - if (sblock_bad->no_io_error_seen) - goto did_not_correct_error; - success = 1; - for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { + for (page_num = 0; page_num < sblock_bad->page_count; + page_num++) { struct scrub_page *page_bad = sblock_bad->pagev[page_num]; + struct scrub_block *sblock_other = NULL; - if (!page_bad->io_error) + /* skip no-io-error page in scrub */ + if (!page_bad->io_error && !sctx->is_dev_replace) continue; - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; - mirror_index++) { - struct scrub_block *sblock_other = sblocks_for_recheck + - mirror_index; - struct scrub_page *page_other = sblock_other->pagev[ - page_num]; - - if (!page_other->io_error) { - ret = scrub_repair_page_from_good_copy( - sblock_bad, sblock_other, page_num, 0); - if (0 == ret) { - page_bad->io_error = 0; - break; /* succeeded for this page */ + /* try to find no-io-error page in mirrors */ + if (page_bad->io_error) { + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + if (!sblocks_for_recheck[mirror_index]. + pagev[page_num]->io_error) { + sblock_other = sblocks_for_recheck + + mirror_index; + break; } } + if (!sblock_other) + success = 0; } - if (page_bad->io_error) { - /* did not find a mirror to copy the page from */ - success = 0; + if (sctx->is_dev_replace) { + /* + * did not find a mirror to fetch the page + * from. 
scrub_write_page_to_dev_replace() + * handles this case (page->io_error), by + * filling the block with zeros before + * submitting the write request + */ + if (!sblock_other) + sblock_other = sblock_bad; + + if (scrub_write_page_to_dev_replace(sblock_other, + page_num) != 0) { + btrfs_dev_replace_stats_inc( + &sctx->dev_root-> + fs_info->dev_replace. + num_write_errors); + success = 0; + } + } else if (sblock_other) { + ret = scrub_repair_page_from_good_copy(sblock_bad, + sblock_other, + page_num, 0); + if (0 == ret) + page_bad->io_error = 0; + else + success = 0; } } - if (success) { + if (success && !sctx->is_dev_replace) { if (is_metadata || have_csum) { /* * need to verify the checksum now that all @@ -1288,19 +1275,18 @@ out: return 0; } -static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map) +static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio) { - if (raid_map) { - if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) - return 3; - else - return 2; - } else { + if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) + return 2; + else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) + return 3; + else return (int)bbio->num_stripes; - } } -static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map, +static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, + u64 *raid_map, u64 mapped_length, int nstripes, int mirror, int *stripe_index, @@ -1308,7 +1294,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map, { int i; - if (raid_map) { + if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { /* RAID5/6 */ for (i = 0; i < nstripes; i++) { if (raid_map[i] == RAID6_Q_STRIPE || @@ -1329,72 +1315,65 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map, } } -static int scrub_setup_recheck_block(struct scrub_ctx *sctx, - struct btrfs_fs_info *fs_info, - struct scrub_block *original_sblock, - u64 length, u64 logical, +static int scrub_setup_recheck_block(struct scrub_block *original_sblock, struct scrub_block *sblocks_for_recheck) { + struct scrub_ctx *sctx = original_sblock->sctx; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + u64 length = original_sblock->page_count * PAGE_SIZE; + u64 logical = original_sblock->pagev[0]->logical; struct scrub_recover *recover; struct btrfs_bio *bbio; - u64 *raid_map; u64 sublen; u64 mapped_length; u64 stripe_offset; int stripe_index; - int page_index; + int page_index = 0; int mirror_index; int nmirrors; int ret; /* - * note: the two members ref_count and outstanding_pages + * note: the two members refs and outstanding_pages * are not used (and not set) in the blocks that are used for * the recheck procedure */ - page_index = 0; while (length > 0) { sublen = min_t(u64, length, PAGE_SIZE); mapped_length = sublen; bbio = NULL; - raid_map = NULL; /* * with a length of PAGE_SIZE, each returned stripe * represents one mirror */ ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, - &mapped_length, &bbio, 0, &raid_map); + &mapped_length, &bbio, 0, 1); if (ret || !bbio || mapped_length < sublen) { - kfree(bbio); - kfree(raid_map); + btrfs_put_bbio(bbio); return -EIO; } recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); if (!recover) { - kfree(bbio); - kfree(raid_map); + btrfs_put_bbio(bbio); return -ENOMEM; } atomic_set(&recover->refs, 1); recover->bbio = bbio; - recover->raid_map = raid_map; recover->map_length = mapped_length; BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); - nmirrors = scrub_nr_raid_mirrors(bbio, raid_map); + 
nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS); + for (mirror_index = 0; mirror_index < nmirrors; mirror_index++) { struct scrub_block *sblock; struct scrub_page *page; - if (mirror_index >= BTRFS_MAX_MIRRORS) - continue; - sblock = sblocks_for_recheck + mirror_index; sblock->sctx = sctx; page = kzalloc(sizeof(*page), GFP_NOFS); @@ -1410,9 +1389,12 @@ leave_nomem: sblock->pagev[page_index] = page; page->logical = logical; - scrub_stripe_index_and_offset(logical, raid_map, + scrub_stripe_index_and_offset(logical, + bbio->map_type, + bbio->raid_map, mapped_length, - bbio->num_stripes, + bbio->num_stripes - + bbio->num_tgtdevs, mirror_index, &stripe_index, &stripe_offset); @@ -1458,7 +1440,8 @@ static void scrub_bio_wait_endio(struct bio *bio, int error) static inline int scrub_is_page_on_raid56(struct scrub_page *page) { - return page->recover && page->recover->raid_map; + return page->recover && + (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); } static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, @@ -1475,7 +1458,6 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, bio->bi_end_io = scrub_bio_wait_endio; ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio, - page->recover->raid_map, page->recover->map_length, page->mirror_num, 0); if (ret) @@ -1615,8 +1597,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, } static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int force_write) + struct scrub_block *sblock_good) { int page_num; int ret = 0; @@ -1626,8 +1607,7 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, ret_sub = scrub_repair_page_from_good_copy(sblock_bad, sblock_good, - page_num, - force_write); + page_num, 1); if (ret_sub) ret = ret_sub; } @@ -2067,12 +2047,12 @@ static int scrub_checksum_super(struct scrub_block *sblock) static void scrub_block_get(struct scrub_block *sblock) { - atomic_inc(&sblock->ref_count); + atomic_inc(&sblock->refs); } static void scrub_block_put(struct scrub_block *sblock) { - if (atomic_dec_and_test(&sblock->ref_count)) { + if (atomic_dec_and_test(&sblock->refs)) { int i; if (sblock->sparity) @@ -2086,12 +2066,12 @@ static void scrub_block_put(struct scrub_block *sblock) static void scrub_page_get(struct scrub_page *spage) { - atomic_inc(&spage->ref_count); + atomic_inc(&spage->refs); } static void scrub_page_put(struct scrub_page *spage) { - if (atomic_dec_and_test(&spage->ref_count)) { + if (atomic_dec_and_test(&spage->refs)) { if (spage->page) __free_page(spage->page); kfree(spage); @@ -2217,7 +2197,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, /* one ref inside this function, plus one for each page added to * a bio later on */ - atomic_set(&sblock->ref_count, 1); + atomic_set(&sblock->refs, 1); sblock->sctx = sctx; sblock->no_io_error_seen = 1; @@ -2510,7 +2490,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity, /* one ref inside this function, plus one for each page added to * a bio later on */ - atomic_set(&sblock->ref_count, 1); + atomic_set(&sblock->refs, 1); sblock->sctx = sctx; sblock->no_io_error_seen = 1; sblock->sparity = sparity; @@ -2705,7 +2685,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) struct btrfs_raid_bio *rbio; struct scrub_page *spage; struct btrfs_bio *bbio = NULL; - u64 *raid_map = NULL; u64 length; int ret; @@ -2716,8 +2695,8 @@ static void 
scrub_parity_check_and_repair(struct scrub_parity *sparity) length = sparity->logic_end - sparity->logic_start + 1; ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE, sparity->logic_start, - &length, &bbio, 0, &raid_map); - if (ret || !bbio || !raid_map) + &length, &bbio, 0, 1); + if (ret || !bbio || !bbio->raid_map) goto bbio_out; bio = btrfs_io_bio_alloc(GFP_NOFS, 0); @@ -2729,8 +2708,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) bio->bi_end_io = scrub_parity_bio_endio; rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio, - raid_map, length, - sparity->scrub_dev, + length, sparity->scrub_dev, sparity->dbitmap, sparity->nsectors); if (!rbio) @@ -2747,8 +2725,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) rbio_out: bio_put(bio); bbio_out: - kfree(bbio); - kfree(raid_map); + btrfs_put_bbio(bbio); bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); spin_lock(&sctx->stat_lock); @@ -2765,12 +2742,12 @@ static inline int scrub_calc_parity_bitmap_len(int nsectors) static void scrub_parity_get(struct scrub_parity *sparity) { - atomic_inc(&sparity->ref_count); + atomic_inc(&sparity->refs); } static void scrub_parity_put(struct scrub_parity *sparity) { - if (!atomic_dec_and_test(&sparity->ref_count)) + if (!atomic_dec_and_test(&sparity->refs)) return; scrub_parity_check_and_repair(sparity); @@ -2820,7 +2797,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, sparity->scrub_dev = sdev; sparity->logic_start = logic_start; sparity->logic_end = logic_end; - atomic_set(&sparity->ref_count, 1); + atomic_set(&sparity->refs, 1); INIT_LIST_HEAD(&sparity->spages); sparity->dbitmap = sparity->bitmap; sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; @@ -3037,8 +3014,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { increment = map->stripe_len; mirror_num = num % map->num_stripes + 1; - } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { get_raid56_logic_offset(physical, num, map, &offset, NULL); increment = map->stripe_len * nr_data_stripes(map); mirror_num = 1; @@ -3074,8 +3050,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ logical = base + offset; physical_end = physical + nstripes * map->stripe_len; - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { get_raid56_logic_offset(physical_end, num, map, &logic_end, NULL); logic_end += base; @@ -3121,8 +3096,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, ret = 0; while (physical < physical_end) { /* for raid56, we skip parity stripe */ - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = get_raid56_logic_offset(physical, num, map, &logical, &stripe_logical); logical += base; @@ -3280,8 +3254,7 @@ again: scrub_free_csums(sctx); if (extent_logical + extent_len < key.objectid + bytes) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { /* * loop until we find next data stripe * or we have finished all stripes. 
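The scrub hunks above convert every direct kfree(bbio) into btrfs_put_bbio(), paired with the atomic refs counter this series adds to struct btrfs_bio; btrfs_get_bbio() lets any additional holder pin the structure, so whichever side drops the last reference performs the free. A minimal userspace sketch of that get/put lifetime pattern, using C11 atomics and made-up bbio_like/bbio_* names rather than the kernel's types:

	#include <stdatomic.h>
	#include <stdlib.h>

	/*
	 * Illustration only: the same lifetime pattern that
	 * btrfs_get_bbio()/btrfs_put_bbio() implement with the kernel's
	 * atomic_t, shown with C11 atomics so it compiles in userspace.
	 */
	struct bbio_like {
		atomic_int refs;
		/* stripes, raid_map, ... would follow in the real struct */
	};

	static struct bbio_like *bbio_alloc(void)
	{
		struct bbio_like *b = calloc(1, sizeof(*b));

		if (b)
			atomic_init(&b->refs, 1);  /* caller owns one reference */
		return b;
	}

	static void bbio_get(struct bbio_like *b)
	{
		atomic_fetch_add(&b->refs, 1);
	}

	static void bbio_put(struct bbio_like *b)
	{
		if (!b)
			return;                    /* put on NULL is a no-op */
		if (atomic_fetch_sub(&b->refs, 1) == 1)
			free(b);                   /* last reference dropped */
	}

The recurring map->type & BTRFS_BLOCK_GROUP_RAID56_MASK tests in these hunks lean on the convenience mask from fs/btrfs/ctree.h, which simply folds together the two bits the old open-coded checks spelled out:

	#define BTRFS_BLOCK_GROUP_RAID56_MASK	(BTRFS_BLOCK_GROUP_RAID5 |	\
						 BTRFS_BLOCK_GROUP_RAID6)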
@@ -3775,7 +3748,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, scrub_workers_put(fs_info); mutex_unlock(&fs_info->scrub_lock); - scrub_free_ctx(sctx); + scrub_put_ctx(sctx); return ret; } @@ -3881,14 +3854,14 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info, &mapped_length, &bbio, 0); if (ret || !bbio || mapped_length < extent_len || !bbio->stripes[0].dev->bdev) { - kfree(bbio); + btrfs_put_bbio(bbio); return; } *extent_physical = bbio->stripes[0].physical; *extent_mirror_num = bbio->mirror_num; *extent_dev = bbio->stripes[0].dev; - kfree(bbio); + btrfs_put_bbio(bbio); } static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 804432dbc351..fe5857223515 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2471,12 +2471,9 @@ verbose_printk("btrfs: send_utimes %llu\n", ino); if (ret < 0) goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); - TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, - btrfs_inode_atime(ii)); - TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, - btrfs_inode_mtime(ii)); - TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, - btrfs_inode_ctime(ii)); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime); /* TODO Add otime support when the otime patches get into upstream */ ret = send_cmd(sctx); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6f49b2872a64..05fef198ff94 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1958,11 +1958,6 @@ static int btrfs_freeze(struct super_block *sb) return btrfs_commit_transaction(trans, root); } -static int btrfs_unfreeze(struct super_block *sb) -{ - return 0; -} - static int btrfs_show_devname(struct seq_file *m, struct dentry *root) { struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); @@ -2011,7 +2006,6 @@ static const struct super_operations btrfs_super_ops = { .statfs = btrfs_statfs, .remount_fs = btrfs_remount, .freeze_fs = btrfs_freeze, - .unfreeze_fs = btrfs_unfreeze, }; static const struct file_operations btrfs_ctl_fops = { diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 92db3f648df4..94edb0a2a026 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -733,10 +733,18 @@ int btrfs_init_sysfs(void) ret = btrfs_init_debugfs(); if (ret) - return ret; + goto out1; init_feature_attrs(); ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); + if (ret) + goto out2; + + return 0; +out2: + debugfs_remove_recursive(btrfs_debugfs_root_dentry); +out1: + kset_unregister(btrfs_kset); return ret; } diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index cc286ce97d1e..f51963a8f929 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -53,7 +53,7 @@ static int test_btrfs_split_item(void) return -ENOMEM; } - path->nodes[0] = eb = alloc_dummy_extent_buffer(0, 4096); + path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 4096); if (!eb) { test_msg("Could not allocate dummy buffer\n"); ret = -ENOMEM; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 7e99c2f98dd0..9e9f2368177d 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -258,8 +258,7 @@ static int test_find_delalloc(void) } ret = 0; out_bits: - clear_extent_bits(&tmp, 0, total_dirty - 1, - (unsigned long)-1, GFP_NOFS); + clear_extent_bits(&tmp, 0, 
total_dirty - 1, (unsigned)-1, GFP_NOFS); out: if (locked_page) page_cache_release(locked_page); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 3ae0f5b8bb80..a116b55ce788 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -255,7 +255,7 @@ static noinline int test_btrfs_get_extent(void) goto out; } - root->node = alloc_dummy_extent_buffer(0, 4096); + root->node = alloc_dummy_extent_buffer(NULL, 4096); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); goto out; @@ -843,7 +843,7 @@ static int test_hole_first(void) goto out; } - root->node = alloc_dummy_extent_buffer(0, 4096); + root->node = alloc_dummy_extent_buffer(NULL, 4096); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); goto out; diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index ec3dcb202357..73f299ebdabb 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -404,12 +404,22 @@ int btrfs_test_qgroups(void) ret = -ENOMEM; goto out; } + /* We are using this root as our extent root */ + root->fs_info->extent_root = root; + + /* + * Some of the paths we test assume we have a filled out fs_info, so we + * just need to add the root in there so we don't panic. + */ + root->fs_info->tree_root = root; + root->fs_info->quota_root = root; + root->fs_info->quota_enabled = 1; /* * Can't use bytenr 0, some things freak out * *cough*backref walking code*cough* */ - root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096); + root->node = alloc_test_extent_buffer(root->fs_info, 4096); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); ret = -ENOMEM; @@ -448,17 +458,6 @@ int btrfs_test_qgroups(void) goto out; } - /* We are using this root as our extent root */ - root->fs_info->extent_root = root; - - /* - * Some of the paths we test assume we have a filled out fs_info, so we - * just need to addt he root in there so we don't panic. - */ - root->fs_info->tree_root = root; - root->fs_info->quota_root = root; - root->fs_info->quota_enabled = 1; - test_msg("Running qgroup tests\n"); ret = test_no_shared_qgroup(root); if (ret) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e88b59d13439..7e80f32550a6 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -220,6 +220,7 @@ loop: * commit the transaction. 
*/ atomic_set(&cur_trans->use_count, 2); + cur_trans->have_free_bgs = 0; cur_trans->start_time = get_seconds(); cur_trans->delayed_refs.href_root = RB_ROOT; @@ -248,6 +249,8 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_chunks); INIT_LIST_HEAD(&cur_trans->switch_commits); INIT_LIST_HEAD(&cur_trans->pending_ordered); + INIT_LIST_HEAD(&cur_trans->dirty_bgs); + spin_lock_init(&cur_trans->dirty_bgs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -1020,6 +1023,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, u64 old_root_bytenr; u64 old_root_used; struct btrfs_root *tree_root = root->fs_info->tree_root; + bool extent_root = (root->objectid == BTRFS_EXTENT_TREE_OBJECTID); old_root_used = btrfs_root_used(&root->root_item); btrfs_write_dirty_block_groups(trans, root); @@ -1027,7 +1031,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, while (1) { old_root_bytenr = btrfs_root_bytenr(&root->root_item); if (old_root_bytenr == root->node->start && - old_root_used == btrfs_root_used(&root->root_item)) + old_root_used == btrfs_root_used(&root->root_item) && + (!extent_root || + list_empty(&trans->transaction->dirty_bgs))) break; btrfs_set_root_node(&root->root_item, root->node); @@ -1038,7 +1044,15 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, return ret; old_root_used = btrfs_root_used(&root->root_item); - ret = btrfs_write_dirty_block_groups(trans, root); + if (extent_root) { + ret = btrfs_write_dirty_block_groups(trans, root); + if (ret) + return ret; + } + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) + return ret; + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) return ret; } @@ -1061,10 +1075,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, struct extent_buffer *eb; int ret; - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) - return ret; - eb = btrfs_lock_root_node(fs_info->tree_root); ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb); @@ -1097,6 +1107,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, next = fs_info->dirty_cowonly_roots.next; list_del_init(next); root = list_entry(next, struct btrfs_root, dirty_list); + clear_bit(BTRFS_ROOT_DIRTY, &root->state); if (root != fs_info->extent_root) list_add_tail(&root->dirty_list, @@ -1983,6 +1994,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, switch_commit_roots(cur_trans, root->fs_info); assert_qgroups_uptodate(trans); + ASSERT(list_empty(&cur_trans->dirty_bgs)); update_super_roots(root); btrfs_set_super_log_root(root->fs_info->super_copy, 0); @@ -2026,6 +2038,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root); + if (cur_trans->have_free_bgs) + btrfs_clear_space_info_full(root->fs_info); + root->fs_info->last_trans_committed = cur_trans->transid; /* * We needn't acquire the lock here because there is no other task diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 00ed29c4b3f9..937050a2b68e 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -47,6 +47,11 @@ struct btrfs_transaction { atomic_t num_writers; atomic_t use_count; + /* + * true if there is free bgs operations in this transaction + */ + int have_free_bgs; + /* Be protected by fs_info->trans_lock when we want to change it. 
*/ enum btrfs_trans_state state; struct list_head list; @@ -58,6 +63,8 @@ struct btrfs_transaction { struct list_head pending_chunks; struct list_head pending_ordered; struct list_head switch_commits; + struct list_head dirty_bgs; + spinlock_t dirty_bgs_lock; struct btrfs_delayed_ref_root delayed_refs; int aborted; }; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1a9585d4380a..9a37f8b39bae 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -453,11 +453,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, insert: btrfs_release_path(path); /* try to insert the key into the destination tree */ + path->skip_release_on_error = 1; ret = btrfs_insert_empty_item(trans, root, path, key, item_size); + path->skip_release_on_error = 0; /* make sure any existing item is the correct size */ - if (ret == -EEXIST) { + if (ret == -EEXIST || ret == -EOVERFLOW) { u32 found_size; found_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); @@ -488,8 +490,20 @@ insert: src_item = (struct btrfs_inode_item *)src_ptr; dst_item = (struct btrfs_inode_item *)dst_ptr; - if (btrfs_inode_generation(eb, src_item) == 0) + if (btrfs_inode_generation(eb, src_item) == 0) { + struct extent_buffer *dst_eb = path->nodes[0]; + + if (S_ISREG(btrfs_inode_mode(eb, src_item)) && + S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) { + struct btrfs_map_token token; + u64 ino_size = btrfs_inode_size(eb, src_item); + + btrfs_init_map_token(&token); + btrfs_set_token_inode_size(dst_eb, dst_item, + ino_size, &token); + } goto no_copy; + } if (overwrite_root && S_ISDIR(btrfs_inode_mode(eb, src_item)) && @@ -844,7 +858,7 @@ out: static noinline int backref_in_log(struct btrfs_root *log, struct btrfs_key *key, u64 ref_objectid, - char *name, int namelen) + const char *name, int namelen) { struct btrfs_path *path; struct btrfs_inode_ref *ref; @@ -1254,13 +1268,14 @@ out: } static int insert_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset) + struct btrfs_root *root, u64 ino) { int ret; - ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID, - offset, BTRFS_ORPHAN_ITEM_KEY, NULL); - if (ret > 0) - ret = btrfs_insert_orphan_item(trans, root, offset); + + ret = btrfs_insert_orphan_item(trans, root, ino); + if (ret == -EEXIST) + ret = 0; + return ret; } @@ -1287,6 +1302,7 @@ static int count_inode_extrefs(struct btrfs_root *root, leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + cur_offset = 0; while (cur_offset < item_size) { extref = (struct btrfs_inode_extref *) (ptr + cur_offset); @@ -1302,7 +1318,7 @@ static int count_inode_extrefs(struct btrfs_root *root, } btrfs_release_path(path); - if (ret < 0) + if (ret < 0 && ret != -ENOENT) return ret; return nlink; } @@ -1394,9 +1410,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, nlink = ret; ret = count_inode_extrefs(root, inode, path); - if (ret == -ENOENT) - ret = 0; - if (ret < 0) goto out; @@ -1557,6 +1570,30 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, } /* + * Return true if an inode reference exists in the log for the given name, + * inode and parent inode. 
+ */ +static bool name_in_log_ref(struct btrfs_root *log_root, + const char *name, const int name_len, + const u64 dirid, const u64 ino) +{ + struct btrfs_key search_key; + + search_key.objectid = ino; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = dirid; + if (backref_in_log(log_root, &search_key, dirid, name, name_len)) + return true; + + search_key.type = BTRFS_INODE_EXTREF_KEY; + search_key.offset = btrfs_extref_hash(dirid, name, name_len); + if (backref_in_log(log_root, &search_key, dirid, name, name_len)) + return true; + + return false; +} + +/* * take a single entry in a log directory item and replay it into * the subvolume. * @@ -1666,10 +1703,17 @@ out: return ret; insert: + if (name_in_log_ref(root->log_root, name, name_len, + key->objectid, log_key.objectid)) { + /* The dentry will be added later. */ + ret = 0; + update_size = false; + goto out; + } btrfs_release_path(path); ret = insert_one_name(trans, root, path, key->objectid, key->offset, name, name_len, log_type, &log_key); - if (ret && ret != -ENOENT) + if (ret && ret != -ENOENT && ret != -EEXIST) goto out; update_size = false; ret = 0; @@ -2164,7 +2208,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, parent = path->nodes[*level]; root_owner = btrfs_header_owner(parent); - next = btrfs_find_create_tree_block(root, bytenr, blocksize); + next = btrfs_find_create_tree_block(root, bytenr); if (!next) return -ENOMEM; @@ -2416,8 +2460,8 @@ static void wait_for_writer(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); if (atomic_read(&root->log_writers)) schedule(); - mutex_lock(&root->log_mutex); finish_wait(&root->log_writer_wait, &wait); + mutex_lock(&root->log_mutex); } } @@ -3219,7 +3263,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, static void fill_inode_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf, struct btrfs_inode_item *item, - struct inode *inode, int log_inode_only) + struct inode *inode, int log_inode_only, + u64 logged_isize) { struct btrfs_map_token token; @@ -3232,7 +3277,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, * to say 'update this inode with these values' */ btrfs_set_token_inode_generation(leaf, item, 0, &token); - btrfs_set_token_inode_size(leaf, item, 0, &token); + btrfs_set_token_inode_size(leaf, item, logged_isize, &token); } else { btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, @@ -3245,19 +3290,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), + btrfs_set_token_timespec_sec(leaf, &item->atime, inode->i_atime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), + btrfs_set_token_timespec_nsec(leaf, &item->atime, inode->i_atime.tv_nsec, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), + btrfs_set_token_timespec_sec(leaf, &item->mtime, inode->i_mtime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), + btrfs_set_token_timespec_nsec(leaf, &item->mtime, inode->i_mtime.tv_nsec, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), + btrfs_set_token_timespec_sec(leaf, &item->ctime, inode->i_ctime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), + btrfs_set_token_timespec_nsec(leaf, &item->ctime, inode->i_ctime.tv_nsec, &token); 
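The atime/mtime/ctime conversions above, like the earlier send.c hunk, follow from btrfs_inode_item's embedded timespecs now being addressed directly instead of through the old btrfs_inode_atime()-style helpers. Reading one pair side by side (both lines taken from the patch itself) makes the mechanical change clear:

	/* before: a helper computed the address of the embedded timespec */
	btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
				     inode->i_atime.tv_sec, &token);

	/* after: take the member's address directly */
	btrfs_set_token_timespec_sec(leaf, &item->atime,
				     inode->i_atime.tv_sec, &token);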
btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), @@ -3284,7 +3329,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans, return ret; inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); - fill_inode_item(trans, path->nodes[0], inode_item, inode, 0); + fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0); btrfs_release_path(path); return 0; } @@ -3293,7 +3338,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *dst_path, struct btrfs_path *src_path, u64 *last_extent, - int start_slot, int nr, int inode_only) + int start_slot, int nr, int inode_only, + u64 logged_isize) { unsigned long src_offset; unsigned long dst_offset; @@ -3350,7 +3396,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, dst_path->slots[0], struct btrfs_inode_item); fill_inode_item(trans, dst_path->nodes[0], inode_item, - inode, inode_only == LOG_INODE_EXISTS); + inode, inode_only == LOG_INODE_EXISTS, + logged_isize); } else { copy_extent_buffer(dst_path->nodes[0], src, dst_offset, src_offset, ins_sizes[i]); @@ -3902,6 +3949,33 @@ process: return ret; } +static int logged_inode_size(struct btrfs_root *log, struct inode *inode, + struct btrfs_path *path, u64 *size_ret) +{ + struct btrfs_key key; + int ret; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); + if (ret < 0) { + return ret; + } else if (ret > 0) { + *size_ret = i_size_read(inode); + } else { + struct btrfs_inode_item *item; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + *size_ret = btrfs_inode_size(path->nodes[0], item); + } + + btrfs_release_path(path); + return 0; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -3939,6 +4013,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, bool fast_search = false; u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + u64 logged_isize = 0; path = btrfs_alloc_path(); if (!path) @@ -3966,15 +4041,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key.type = (u8)-1; max_key.offset = (u64)-1; - /* Only run delayed items if we are a dir or a new file */ + /* + * Only run delayed items if we are a dir or a new file. + * Otherwise commit the delayed inode only, which is needed in + * order for the log replay code to mark inodes for link count + * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). 
+ */ if (S_ISDIR(inode->i_mode) || - BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { + BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) ret = btrfs_commit_inode_delayed_items(trans, inode); - if (ret) { - btrfs_free_path(path); - btrfs_free_path(dst_path); - return ret; - } + else + ret = btrfs_commit_inode_delayed_inode(inode); + + if (ret) { + btrfs_free_path(path); + btrfs_free_path(dst_path); + return ret; } mutex_lock(&BTRFS_I(inode)->log_mutex); @@ -3988,22 +4070,56 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (S_ISDIR(inode->i_mode)) { int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; - if (inode_only == LOG_INODE_EXISTS) - max_key_type = BTRFS_XATTR_ITEM_KEY; + if (inode_only == LOG_INODE_EXISTS) { + max_key_type = BTRFS_INODE_EXTREF_KEY; + max_key.type = max_key_type; + } ret = drop_objectid_items(trans, log, path, ino, max_key_type); } else { - if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags)) { - clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags); - ret = btrfs_truncate_inode_items(trans, log, - inode, 0, 0); - } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags) || + if (inode_only == LOG_INODE_EXISTS) { + /* + * Make sure the new inode item we write to the log has + * the same isize as the current one (if it exists). + * This is necessary to prevent data loss after log + * replay, and also to prevent doing a wrong expanding + * truncate - for e.g. create file, write 4K into offset + * 0, fsync, write 4K into offset 4096, add hard link, + * fsync some other file (to sync log), power fail - if + * we use the inode's current i_size, after log replay + * we get a 8Kb file, with the last 4Kb extent as a hole + * (zeroes), as if an expanding truncate happened, + * instead of getting a file of 4Kb only. 
+ */ + err = logged_inode_size(log, inode, path, + &logged_isize); + if (err) + goto out_unlock; + } + if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags)) { + if (inode_only == LOG_INODE_EXISTS) { + max_key.type = BTRFS_INODE_EXTREF_KEY; + ret = drop_objectid_items(trans, log, path, ino, + max_key.type); + } else { + clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags); + ret = btrfs_truncate_inode_items(trans, log, + inode, 0, 0); + } + } else if (test_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags) || inode_only == LOG_INODE_EXISTS) { - if (inode_only == LOG_INODE_ALL) + if (inode_only == LOG_INODE_ALL) { + clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags); fast_search = true; - max_key.type = BTRFS_XATTR_ITEM_KEY; + max_key.type = BTRFS_XATTR_ITEM_KEY; + } else { + max_key.type = BTRFS_INODE_EXTREF_KEY; + } ret = drop_objectid_items(trans, log, path, ino, max_key.type); } else { @@ -4047,7 +4163,8 @@ again: } ret = copy_items(trans, inode, dst_path, path, &last_extent, - ins_start_slot, ins_nr, inode_only); + ins_start_slot, ins_nr, inode_only, + logged_isize); if (ret < 0) { err = ret; goto out_unlock; @@ -4071,7 +4188,7 @@ next_slot: if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, &last_extent, ins_start_slot, - ins_nr, inode_only); + ins_nr, inode_only, logged_isize); if (ret < 0) { err = ret; goto out_unlock; @@ -4092,7 +4209,8 @@ next_slot: } if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, &last_extent, - ins_start_slot, ins_nr, inode_only); + ins_start_slot, ins_nr, inode_only, + logged_isize); if (ret < 0) { err = ret; goto out_unlock; @@ -4273,6 +4391,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct dentry *old_parent = NULL; int ret = 0; u64 last_committed = root->fs_info->last_trans_committed; + const struct dentry * const first_parent = parent; + const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans > + last_committed); sb = inode->i_sb; @@ -4328,7 +4449,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } - inode_only = LOG_INODE_EXISTS; while (1) { if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) break; @@ -4337,8 +4457,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (root != BTRFS_I(inode)->root) break; + /* + * On unlink we must make sure our immediate parent directory + * inode is fully logged. This is to prevent leaving dangling + * directory index entries and a wrong directory inode's i_size. + * Not doing so can result in a directory being impossible to + * delete after log replay (rmdir will always fail with error + * -ENOTEMPTY). 
+ */ + if (did_unlink && parent == first_parent) + inode_only = LOG_INODE_ALL; + else + inode_only = LOG_INODE_EXISTS; + if (BTRFS_I(inode)->generation > - root->fs_info->last_trans_committed) { + root->fs_info->last_trans_committed || + inode_only == LOG_INODE_ALL) { ret = btrfs_log_inode(trans, root, inode, inode_only, 0, LLONG_MAX, ctx); if (ret) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 50c5a8762aed..cd4d1315aaa9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1310,6 +1310,8 @@ again: if (ret) { btrfs_error(root->fs_info, ret, "Failed to remove dev extent item"); + } else { + trans->transaction->have_free_bgs = 1; } out: btrfs_free_path(path); @@ -4196,7 +4198,7 @@ static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) { - if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) + if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) return; btrfs_set_fs_incompat(info, RAID56); @@ -4803,10 +4805,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root, BUG_ON(em->start > logical || em->start + em->len < logical); map = (struct map_lookup *)em->bdev; - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = map->stripe_len * nr_data_stripes(map); - } free_extent_map(em); return len; } @@ -4826,8 +4826,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, BUG_ON(em->start > logical || em->start + em->len < logical); map = (struct map_lookup *)em->bdev; - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) ret = 1; free_extent_map(em); return ret; @@ -4876,32 +4875,24 @@ static inline int parity_smaller(u64 a, u64 b) } /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ -static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) +static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) { struct btrfs_bio_stripe s; - int real_stripes = bbio->num_stripes - bbio->num_tgtdevs; int i; u64 l; int again = 1; - int m; while (again) { again = 0; - for (i = 0; i < real_stripes - 1; i++) { - if (parity_smaller(raid_map[i], raid_map[i+1])) { + for (i = 0; i < num_stripes - 1; i++) { + if (parity_smaller(bbio->raid_map[i], + bbio->raid_map[i+1])) { s = bbio->stripes[i]; - l = raid_map[i]; + l = bbio->raid_map[i]; bbio->stripes[i] = bbio->stripes[i+1]; - raid_map[i] = raid_map[i+1]; + bbio->raid_map[i] = bbio->raid_map[i+1]; bbio->stripes[i+1] = s; - raid_map[i+1] = l; - - if (bbio->tgtdev_map) { - m = bbio->tgtdev_map[i]; - bbio->tgtdev_map[i] = - bbio->tgtdev_map[i + 1]; - bbio->tgtdev_map[i + 1] = m; - } + bbio->raid_map[i+1] = l; again = 1; } @@ -4909,10 +4900,41 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) } } +static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) +{ + struct btrfs_bio *bbio = kzalloc( + sizeof(struct btrfs_bio) + + sizeof(struct btrfs_bio_stripe) * (total_stripes) + + sizeof(int) * (real_stripes) + + sizeof(u64) * (real_stripes), + GFP_NOFS); + if (!bbio) + return NULL; + + atomic_set(&bbio->error, 0); + atomic_set(&bbio->refs, 1); + + return bbio; +} + +void btrfs_get_bbio(struct btrfs_bio *bbio) +{ + WARN_ON(!atomic_read(&bbio->refs)); + atomic_inc(&bbio->refs); +} + +void btrfs_put_bbio(struct btrfs_bio *bbio) +{ + if (!bbio) + return; + if (atomic_dec_and_test(&bbio->refs)) + kfree(bbio); 
+} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, - int mirror_num, u64 **raid_map_ret) + int mirror_num, int need_raid_map) { struct extent_map *em; struct map_lookup *map; @@ -4925,7 +4947,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 stripe_nr_orig; u64 stripe_nr_end; u64 stripe_len; - u64 *raid_map = NULL; int stripe_index; int i; int ret = 0; @@ -4976,7 +4997,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_offset = offset - stripe_offset; /* if we're here for raid56, we need to know the stripe aligned start */ - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); raid56_full_stripe_start = offset; @@ -4989,8 +5010,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (rw & REQ_DISCARD) { /* we don't discard raid56 yet */ - if (map->type & - (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; goto out; } @@ -5000,7 +5020,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, /* For writes to RAID[56], allow a full stripeset across all disks. For other RAID types and for RAID[56] reads, just allow a single stripe (on a single disk). */ - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && + if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && (rw & REQ_WRITE)) { max_len = stripe_len * nr_data_stripes(map) - (offset - raid56_full_stripe_start); @@ -5047,7 +5067,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 physical_of_found = 0; ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, - logical, &tmp_length, &tmp_bbio, 0, NULL); + logical, &tmp_length, &tmp_bbio, 0, 0); if (ret) { WARN_ON(tmp_bbio != NULL); goto out; @@ -5061,7 +5081,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, * is not left of the left cursor */ ret = -EIO; - kfree(tmp_bbio); + btrfs_put_bbio(tmp_bbio); goto out; } @@ -5096,11 +5116,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } else { WARN_ON(1); ret = -EIO; - kfree(tmp_bbio); + btrfs_put_bbio(tmp_bbio); goto out; } - kfree(tmp_bbio); + btrfs_put_bbio(tmp_bbio); } else if (mirror_num > map->num_stripes) { mirror_num = 0; } @@ -5166,15 +5186,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, mirror_num = stripe_index - old_stripe_index + 1; } - } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { - u64 tmp; - - if (raid_map_ret && + } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + if (need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || mirror_num > 1)) { - int i, rot; - /* push stripe_nr back to the start of the full stripe */ stripe_nr = raid56_full_stripe_start; do_div(stripe_nr, stripe_len * nr_data_stripes(map)); @@ -5183,32 +5198,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, num_stripes = map->num_stripes; max_errors = nr_parity_stripes(map); - raid_map = kmalloc_array(num_stripes, sizeof(u64), - GFP_NOFS); - if (!raid_map) { - ret = -ENOMEM; - goto out; - } - - /* Work out the disk rotation on this stripe-set */ - tmp = stripe_nr; - rot = do_div(tmp, num_stripes); - - /* Fill in the logical address of each stripe */ - tmp = stripe_nr * nr_data_stripes(map); - for (i = 0; i < 
nr_data_stripes(map); i++) - raid_map[(i+rot) % num_stripes] = - em->start + (tmp + i) * map->stripe_len; - - raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; - if (map->type & BTRFS_BLOCK_GROUP_RAID6) - raid_map[(i+rot+1) % num_stripes] = - RAID6_Q_STRIPE; - *length = map->stripe_len; stripe_index = 0; stripe_offset = 0; } else { + u64 tmp; + /* * Mirror #0 or #1 means the original data block. * Mirror #2 is RAID5 parity block. @@ -5246,17 +5241,42 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, tgtdev_indexes = num_stripes; } - bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes), - GFP_NOFS); + bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); if (!bbio) { - kfree(raid_map); ret = -ENOMEM; goto out; } - atomic_set(&bbio->error, 0); if (dev_replace_is_ongoing) bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); + /* build raid_map */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && + need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || + mirror_num > 1)) { + u64 tmp; + int i, rot; + + bbio->raid_map = (u64 *)((void *)bbio->stripes + + sizeof(struct btrfs_bio_stripe) * + num_alloc_stripes + + sizeof(int) * tgtdev_indexes); + + /* Work out the disk rotation on this stripe-set */ + tmp = stripe_nr; + rot = do_div(tmp, num_stripes); + + /* Fill in the logical address of each stripe */ + tmp = stripe_nr * nr_data_stripes(map); + for (i = 0; i < nr_data_stripes(map); i++) + bbio->raid_map[(i+rot) % num_stripes] = + em->start + (tmp + i) * map->stripe_len; + + bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; + if (map->type & BTRFS_BLOCK_GROUP_RAID6) + bbio->raid_map[(i+rot+1) % num_stripes] = + RAID6_Q_STRIPE; + } + if (rw & REQ_DISCARD) { int factor = 0; int sub_stripes = 0; @@ -5340,6 +5360,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) max_errors = btrfs_chunk_max_errors(map); + if (bbio->raid_map) + sort_parity_stripes(bbio, num_stripes); + tgtdev_indexes = 0; if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && dev_replace->tgtdev != NULL) { @@ -5427,6 +5450,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } *bbio_ret = bbio; + bbio->map_type = map->type; bbio->num_stripes = num_stripes; bbio->max_errors = max_errors; bbio->mirror_num = mirror_num; @@ -5443,10 +5467,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, bbio->stripes[0].physical = physical_to_patch_in_first_stripe; bbio->mirror_num = map->num_stripes + 1; } - if (raid_map) { - sort_parity_stripes(bbio, raid_map); - *raid_map_ret = raid_map; - } out: if (dev_replace_is_ongoing) btrfs_dev_replace_unlock(dev_replace); @@ -5459,17 +5479,17 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, struct btrfs_bio **bbio_ret, int mirror_num) { return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, - mirror_num, NULL); + mirror_num, 0); } /* For Scrub/replace */ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, - u64 **raid_map_ret) + int need_raid_map) { return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, - mirror_num, raid_map_ret); + mirror_num, need_raid_map); } int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, @@ -5511,8 +5531,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, do_div(length, map->num_stripes / map->sub_stripes); else if (map->type & BTRFS_BLOCK_GROUP_RAID0) do_div(length, 
map->num_stripes); - else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { do_div(length, nr_data_stripes(map)); rmap_len = map->stripe_len * nr_data_stripes(map); } @@ -5565,7 +5584,7 @@ static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int e bio_endio_nodec(bio, err); else bio_endio(bio, err); - kfree(bbio); + btrfs_put_bbio(bbio); } static void btrfs_end_bio(struct bio *bio, int err) @@ -5808,7 +5827,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, u64 logical = (u64)bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; - u64 *raid_map = NULL; int ret; int dev_nr = 0; int total_devs = 1; @@ -5819,7 +5837,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, btrfs_bio_counter_inc_blocked(root->fs_info); ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, - mirror_num, &raid_map); + mirror_num, 1); if (ret) { btrfs_bio_counter_dec(root->fs_info); return ret; @@ -5832,15 +5850,13 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, bbio->fs_info = root->fs_info; atomic_set(&bbio->stripes_pending, bbio->num_stripes); - if (raid_map) { + if (bbio->raid_map) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ if (rw & WRITE) { - ret = raid56_parity_write(root, bio, bbio, - raid_map, map_length); + ret = raid56_parity_write(root, bio, bbio, map_length); } else { - ret = raid56_parity_recover(root, bio, bbio, - raid_map, map_length, + ret = raid56_parity_recover(root, bio, bbio, map_length, mirror_num, 1); } @@ -6238,17 +6254,22 @@ int btrfs_read_sys_array(struct btrfs_root *root) struct extent_buffer *sb; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; - u8 *ptr; - unsigned long sb_ptr; + u8 *array_ptr; + unsigned long sb_array_offset; int ret = 0; u32 num_stripes; u32 array_size; u32 len = 0; - u32 cur; + u32 cur_offset; struct btrfs_key key; - sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, - BTRFS_SUPER_INFO_SIZE); + ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize); + /* + * This will create extent buffer of nodesize, superblock size is + * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will + * overallocate but we can keep it as-is, only the first page is used. 
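The rewritten sys_chunk_array walk just below validates every length against array_size before dereferencing anything, instead of trusting the on-disk data. The same guard pattern in a self-contained form (an illustrative userspace sketch with simplified stand-in types, not the kernel parser):

	#include <stdint.h>
	#include <string.h>

	struct demo_key {
		uint64_t objectid;
		uint8_t type;
		uint64_t offset;
	} __attribute__((packed));   /* on-disk keys are packed */

	static int walk_array(const uint8_t *array, uint32_t array_size)
	{
		uint32_t cur_offset = 0;

		while (cur_offset < array_size) {
			struct demo_key key;
			uint32_t len = sizeof(key);

			/* rule out a short item before reading it */
			if (cur_offset + len > array_size)
				return -1;   /* array is truncated or corrupt */
			memcpy(&key, array + cur_offset, len);
			cur_offset += len;
			/*
			 * a variable-size payload would be bounds-checked
			 * the same way once its length field is known
			 */
		}
		return 0;
	}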
+ */ + sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET); if (!sb) return -ENOMEM; btrfs_set_buffer_uptodate(sb); @@ -6271,35 +6292,56 @@ int btrfs_read_sys_array(struct btrfs_root *root) write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); - ptr = super_copy->sys_chunk_array; - sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); - cur = 0; + array_ptr = super_copy->sys_chunk_array; + sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); + cur_offset = 0; + + while (cur_offset < array_size) { + disk_key = (struct btrfs_disk_key *)array_ptr; + len = sizeof(*disk_key); + if (cur_offset + len > array_size) + goto out_short_read; - while (cur < array_size) { - disk_key = (struct btrfs_disk_key *)ptr; btrfs_disk_key_to_cpu(&key, disk_key); - len = sizeof(*disk_key); ptr += len; - sb_ptr += len; - cur += len; + array_ptr += len; + sb_array_offset += len; + cur_offset += len; if (key.type == BTRFS_CHUNK_ITEM_KEY) { - chunk = (struct btrfs_chunk *)sb_ptr; + chunk = (struct btrfs_chunk *)sb_array_offset; + /* + * At least one btrfs_chunk with one stripe must be + * present, exact stripe count check comes afterwards + */ + len = btrfs_chunk_item_size(1); + if (cur_offset + len > array_size) + goto out_short_read; + + num_stripes = btrfs_chunk_num_stripes(sb, chunk); + len = btrfs_chunk_item_size(num_stripes); + if (cur_offset + len > array_size) + goto out_short_read; + ret = read_one_chunk(root, &key, sb, chunk); if (ret) break; - num_stripes = btrfs_chunk_num_stripes(sb, chunk); - len = btrfs_chunk_item_size(num_stripes); } else { ret = -EIO; break; } - ptr += len; - sb_ptr += len; - cur += len; + array_ptr += len; + sb_array_offset += len; + cur_offset += len; } free_extent_buffer(sb); return ret; + +out_short_read: + printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n", + len, cur_offset); + free_extent_buffer(sb); + return -EIO; } int btrfs_read_chunk_tree(struct btrfs_root *root) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d6fe73c0f4a2..83069dec6898 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -295,8 +295,10 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); #define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0) struct btrfs_bio { + atomic_t refs; atomic_t stripes_pending; struct btrfs_fs_info *fs_info; + u64 map_type; /* get from map_lookup->type */ bio_end_io_t *end_io; struct bio *orig_bio; unsigned long flags; @@ -307,6 +309,12 @@ struct btrfs_bio { int mirror_num; int num_tgtdevs; int *tgtdev_map; + /* + * logical block numbers for the start of each stripe + * The last one or two are p/q. 
These are sorted, + * so raid_map[0] is the start of our full stripe + */ + u64 *raid_map; struct btrfs_bio_stripe stripes[]; }; @@ -388,19 +396,15 @@ struct btrfs_balance_control { int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); - -#define btrfs_bio_size(total_stripes, real_stripes) \ - (sizeof(struct btrfs_bio) + \ - (sizeof(struct btrfs_bio_stripe) * (total_stripes)) + \ - (sizeof(int) * (real_stripes))) - +void btrfs_get_bbio(struct btrfs_bio *bbio); +void btrfs_put_bbio(struct btrfs_bio *bbio); int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num); int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, - u64 **raid_map_ret); + int need_raid_map); int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len); diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 5bd853ba44ff..64fa248343f6 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -40,20 +40,6 @@ static inline void ceph_set_cached_acl(struct inode *inode, spin_unlock(&ci->i_ceph_lock); } -static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode, - int type) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - struct posix_acl *acl = ACL_NOT_CACHED; - - spin_lock(&ci->i_ceph_lock); - if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) - acl = get_cached_acl(inode, type); - spin_unlock(&ci->i_ceph_lock); - - return acl; -} - struct posix_acl *ceph_get_acl(struct inode *inode, int type) { int size; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 24be059fd1f8..fd5599d32362 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -196,17 +196,22 @@ static int readpage_nounlock(struct file *filp, struct page *page) u64 len = PAGE_CACHE_SIZE; if (off >= i_size_read(inode)) { - zero_user_segment(page, err, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_CACHE_SIZE); SetPageUptodate(page); return 0; } - /* - * Uptodate inline data should have been added into page cache - * while getting Fcr caps. - */ - if (ci->i_inline_version != CEPH_INLINE_NONE) - return -EINVAL; + if (ci->i_inline_version != CEPH_INLINE_NONE) { + /* + * Uptodate inline data should have been added + * into page cache while getting Fcr caps. + */ + if (off == 0) + return -EINVAL; + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + return 0; + } err = ceph_readpage_from_fscache(inode, page); if (err == 0) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index b93c631c6c87..8172775428a0 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -577,7 +577,6 @@ void ceph_add_cap(struct inode *inode, struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, realmino); if (realm) { - ceph_get_snap_realm(mdsc, realm); spin_lock(&realm->inodes_with_caps_lock); ci->i_snap_realm = realm; list_add(&ci->i_snap_realm_item, @@ -1451,8 +1450,8 @@ static int __mark_caps_flushing(struct inode *inode, spin_lock(&mdsc->cap_dirty_lock); list_del_init(&ci->i_dirty_item); - ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; if (list_empty(&ci->i_flushing_item)) { + ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); mdsc->num_cap_flushing++; dout(" inode %p now flushing seq %lld\n", inode, @@ -2073,17 +2072,16 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) * requested from the MDS. 
*/ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, - loff_t endoff, int *got, struct page **pinned_page, - int *check_max, int *err) + loff_t endoff, int *got, int *check_max, int *err) { struct inode *inode = &ci->vfs_inode; int ret = 0; - int have, implemented, _got = 0; + int have, implemented; int file_wanted; dout("get_cap_refs %p need %s want %s\n", inode, ceph_cap_string(need), ceph_cap_string(want)); -again: + spin_lock(&ci->i_ceph_lock); /* make sure file is actually open */ @@ -2138,50 +2136,34 @@ again: inode, ceph_cap_string(have), ceph_cap_string(not), ceph_cap_string(revoking)); if ((revoking & not) == 0) { - _got = need | (have & want); - __take_cap_refs(ci, _got); + *got = need | (have & want); + __take_cap_refs(ci, *got); ret = 1; } } else { + int session_readonly = false; + if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) { + struct ceph_mds_session *s = ci->i_auth_cap->session; + spin_lock(&s->s_cap_lock); + session_readonly = s->s_readonly; + spin_unlock(&s->s_cap_lock); + } + if (session_readonly) { + dout("get_cap_refs %p needed %s but mds%d readonly\n", + inode, ceph_cap_string(need), ci->i_auth_cap->mds); + *err = -EROFS; + ret = 1; + goto out_unlock; + } + dout("get_cap_refs %p have %s needed %s\n", inode, ceph_cap_string(have), ceph_cap_string(need)); } out_unlock: spin_unlock(&ci->i_ceph_lock); - if (ci->i_inline_version != CEPH_INLINE_NONE && - (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && - i_size_read(inode) > 0) { - int ret1; - struct page *page = find_get_page(inode->i_mapping, 0); - if (page) { - if (PageUptodate(page)) { - *pinned_page = page; - goto out; - } - page_cache_release(page); - } - /* - * drop cap refs first because getattr while holding - * caps refs can cause deadlock. - */ - ceph_put_cap_refs(ci, _got); - _got = 0; - - /* getattr request will bring inline data into page cache */ - ret1 = __ceph_do_getattr(inode, NULL, - CEPH_STAT_CAP_INLINE_DATA, true); - if (ret1 >= 0) { - ret = 0; - goto again; - } - *err = ret1; - ret = 1; - } -out: dout("get_cap_refs %p ret %d got %s\n", inode, - ret, ceph_cap_string(_got)); - *got = _got; + ret, ceph_cap_string(*got)); return ret; } @@ -2221,22 +2203,52 @@ static void check_max_size(struct inode *inode, loff_t endoff) int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, loff_t endoff, int *got, struct page **pinned_page) { - int check_max, ret, err; + int _got, check_max, ret, err = 0; retry: if (endoff > 0) check_max_size(&ci->vfs_inode, endoff); + _got = 0; check_max = 0; - err = 0; ret = wait_event_interruptible(ci->i_cap_wq, - try_get_cap_refs(ci, need, want, endoff, - got, pinned_page, - &check_max, &err)); + try_get_cap_refs(ci, need, want, endoff, + &_got, &check_max, &err)); if (err) ret = err; + if (ret < 0) + return ret; + if (check_max) goto retry; - return ret; + + if (ci->i_inline_version != CEPH_INLINE_NONE && + (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + i_size_read(&ci->vfs_inode) > 0) { + struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0); + if (page) { + if (PageUptodate(page)) { + *pinned_page = page; + goto out; + } + page_cache_release(page); + } + /* + * drop cap refs first because getattr while holding + * caps refs can cause deadlock. 
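The -EROFS branch above samples the session's read-only flag under s_cap_lock before refusing a request that needs CEPH_CAP_FILE_WR. That check in isolation, as a small sketch:

	static bool mds_session_readonly(struct ceph_mds_session *s)
	{
		bool ro;

		spin_lock(&s->s_cap_lock);
		ro = s->s_readonly;	/* set when the MDS sends CEPH_SESSION_FORCE_RO */
		spin_unlock(&s->s_cap_lock);

		return ro;
	}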
+ */ + ceph_put_cap_refs(ci, _got); + _got = 0; + + /* getattr request will bring inline data into page cache */ + ret = __ceph_do_getattr(&ci->vfs_inode, NULL, + CEPH_STAT_CAP_INLINE_DATA, true); + if (ret < 0) + return ret; + goto retry; + } +out: + *got = _got; + return 0; } /* @@ -2432,13 +2444,13 @@ static void invalidate_aliases(struct inode *inode) */ static void handle_cap_grant(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *grant, - void *snaptrace, int snaptrace_len, u64 inline_version, void *inline_data, int inline_len, struct ceph_buffer *xattr_buf, struct ceph_mds_session *session, struct ceph_cap *cap, int issued) __releases(ci->i_ceph_lock) + __releases(mdsc->snap_rwsem) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; @@ -2639,10 +2651,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, spin_unlock(&ci->i_ceph_lock); if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { - down_write(&mdsc->snap_rwsem); - ceph_update_snap_trace(mdsc, snaptrace, - snaptrace + snaptrace_len, false); - downgrade_write(&mdsc->snap_rwsem); kick_flushing_inode_caps(mdsc, session, inode); up_read(&mdsc->snap_rwsem); if (newcaps & ~issued) @@ -3052,6 +3060,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_cap *cap; struct ceph_mds_caps *h; struct ceph_mds_cap_peer *peer = NULL; + struct ceph_snap_realm *realm; int mds = session->s_mds; int op, issued; u32 seq, mseq; @@ -3153,11 +3162,23 @@ void ceph_handle_caps(struct ceph_mds_session *session, goto done_unlocked; case CEPH_CAP_OP_IMPORT: + realm = NULL; + if (snaptrace_len) { + down_write(&mdsc->snap_rwsem); + ceph_update_snap_trace(mdsc, snaptrace, + snaptrace + snaptrace_len, + false, &realm); + downgrade_write(&mdsc->snap_rwsem); + } else { + down_read(&mdsc->snap_rwsem); + } handle_cap_import(mdsc, inode, h, peer, session, &cap, &issued); - handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len, + handle_cap_grant(mdsc, inode, h, inline_version, inline_data, inline_len, msg->middle, session, cap, issued); + if (realm) + ceph_put_snap_realm(mdsc, realm); goto done_unlocked; } @@ -3177,7 +3198,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, case CEPH_CAP_OP_GRANT: __ceph_caps_issued(ci, &issued); issued |= __ceph_caps_dirty(ci); - handle_cap_grant(mdsc, inode, h, NULL, 0, + handle_cap_grant(mdsc, inode, h, inline_version, inline_data, inline_len, msg->middle, session, cap, issued); goto done_unlocked; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index c241603764fd..0411dbb15815 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -26,8 +26,6 @@ * point by name. */ -const struct inode_operations ceph_dir_iops; -const struct file_operations ceph_dir_fops; const struct dentry_operations ceph_dentry_ops; /* @@ -672,13 +670,17 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) /* * We created the item, then did a lookup, and found * it was already linked to another inode we already - * had in our cache (and thus got spliced). Link our - * dentry to that inode, but don't hash it, just in - * case the VFS wants to dereference it. + * had in our cache (and thus got spliced). To avoid + * confusing the VFS (especially when the inode is a directory), + * we don't link our dentry to that inode; we return an + * error instead. + * + * This event should be rare: it happens only when + * we talk to an old MDS. A recent MDS does not send a traceless + * reply for a request that creates a new inode.
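In the CEPH_CAP_OP_IMPORT path above, snap_rwsem is taken for write only while the snap trace is applied, then downgraded; handle_cap_grant(), now annotated __releases(mdsc->snap_rwsem), finishes with up_read() either way. The rwsem idiom in isolation:

	#include <linux/rwsem.h>

	static void mutate_then_read(struct rw_semaphore *sem)
	{
		down_write(sem);
		/* ... exclusive update, e.g. applying the snap trace ... */
		downgrade_write(sem);	/* keep the lock but admit readers */
		/* ... longer read-side work ... */
		up_read(sem);		/* pairs with the downgraded write lock */
	}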
*/ - BUG_ON(!result->d_inode); - d_instantiate(dentry, result->d_inode); - return 0; + d_drop(result); + return -ESTALE; } return PTR_ERR(result); } @@ -1335,6 +1337,13 @@ const struct file_operations ceph_dir_fops = { .fsync = ceph_dir_fsync, }; +const struct file_operations ceph_snapdir_fops = { + .iterate = ceph_readdir, + .llseek = ceph_dir_llseek, + .open = ceph_open, + .release = ceph_release, +}; + const struct inode_operations ceph_dir_iops = { .lookup = ceph_lookup, .permission = ceph_permission, @@ -1357,6 +1366,14 @@ const struct inode_operations ceph_dir_iops = { .atomic_open = ceph_atomic_open, }; +const struct inode_operations ceph_snapdir_iops = { + .lookup = ceph_lookup, + .permission = ceph_permission, + .getattr = ceph_getattr, + .mkdir = ceph_mkdir, + .rmdir = ceph_unlink, +}; + const struct dentry_operations ceph_dentry_ops = { .d_revalidate = ceph_d_revalidate, .d_release = ceph_d_release, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 905986dd4c3c..a3d774b35149 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -275,10 +275,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, req); + err = ceph_handle_snapdir(req, dentry, err); if (err) goto out_req; - err = ceph_handle_snapdir(req, dentry, err); if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); @@ -392,13 +392,14 @@ more: if (ret >= 0) { int didpages; if (was_short && (pos + ret < inode->i_size)) { - u64 tmp = min(this_len - ret, - inode->i_size - pos - ret); + int zlen = min(this_len - ret, + inode->i_size - pos - ret); + int zoff = (o_direct ? buf_align : io_align) + + read + ret; dout(" zero gap %llu to %llu\n", - pos + ret, pos + ret + tmp); - ceph_zero_page_vector_range(page_align + read + ret, - tmp, pages); - ret += tmp; + pos + ret, pos + ret + zlen); + ceph_zero_page_vector_range(zoff, zlen, pages); + ret += zlen; } didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; @@ -878,28 +879,34 @@ again: i_size = i_size_read(inode); if (retry_op == READ_INLINE) { - /* does not support inline data > PAGE_SIZE */ - if (i_size > PAGE_CACHE_SIZE) { - ret = -EIO; - } else if (iocb->ki_pos < i_size) { + BUG_ON(ret > 0 || read > 0); + if (iocb->ki_pos < i_size && + iocb->ki_pos < PAGE_CACHE_SIZE) { loff_t end = min_t(loff_t, i_size, iocb->ki_pos + len); + end = min_t(loff_t, end, PAGE_CACHE_SIZE); if (statret < end) zero_user_segment(page, statret, end); ret = copy_page_to_iter(page, iocb->ki_pos & ~PAGE_MASK, end - iocb->ki_pos, to); iocb->ki_pos += ret; - } else { - ret = 0; + read += ret; + } + if (iocb->ki_pos < i_size && read < len) { + size_t zlen = min_t(size_t, len - read, + i_size - iocb->ki_pos); + ret = iov_iter_zero(zlen, to); + iocb->ki_pos += ret; + read += ret; } __free_pages(page, 0); - return ret; + return read; } /* hit EOF or hole? 
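The reworked READ_INLINE branch above serves bytes past the single inline page by zero-filling the iterator instead of failing large files with -EIO. A condensed restatement of the copy-then-zero shape (names follow the hunk; both helpers return the number of bytes they produced):

	/* copy what the one cached inline page covers ... */
	ret = copy_page_to_iter(page, iocb->ki_pos & ~PAGE_MASK,
				end - iocb->ki_pos, to);
	iocb->ki_pos += ret;
	read += ret;

	/* ... then synthesize zeroes for the tail up to i_size */
	if (iocb->ki_pos < i_size && read < len) {
		size_t zlen = min_t(size_t, len - read, i_size - iocb->ki_pos);

		ret = iov_iter_zero(zlen, to);	/* no backing page required */
		iocb->ki_pos += ret;
		read += ret;
	}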
*/ if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && - ret < len) { + ret < len) { dout("sync_read hit hole, ppos %lld < size %lld" ", reading more\n", iocb->ki_pos, inode->i_size); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 6b5173605154..119c43c80638 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -82,8 +82,8 @@ struct inode *ceph_get_snapdir(struct inode *parent) inode->i_mode = parent->i_mode; inode->i_uid = parent->i_uid; inode->i_gid = parent->i_gid; - inode->i_op = &ceph_dir_iops; - inode->i_fop = &ceph_dir_fops; + inode->i_op = &ceph_snapdir_iops; + inode->i_fop = &ceph_snapdir_fops; ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ ci->i_rbytes = 0; return inode; @@ -838,30 +838,31 @@ static int fill_inode(struct inode *inode, struct page *locked_page, ceph_vinop(inode), inode->i_mode); } - /* set dir completion flag? */ - if (S_ISDIR(inode->i_mode) && - ci->i_files == 0 && ci->i_subdirs == 0 && - ceph_snap(inode) == CEPH_NOSNAP && - (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && - (issued & CEPH_CAP_FILE_EXCL) == 0 && - !__ceph_dir_is_complete(ci)) { - dout(" marking %p complete (empty)\n", inode); - __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count), - ci->i_ordered_count); - } - /* were we issued a capability? */ if (info->cap.caps) { if (ceph_snap(inode) == CEPH_NOSNAP) { + unsigned caps = le32_to_cpu(info->cap.caps); ceph_add_cap(inode, session, le64_to_cpu(info->cap.cap_id), - cap_fmode, - le32_to_cpu(info->cap.caps), + cap_fmode, caps, le32_to_cpu(info->cap.wanted), le32_to_cpu(info->cap.seq), le32_to_cpu(info->cap.mseq), le64_to_cpu(info->cap.realm), info->cap.flags, &new_cap); + + /* set dir completion flag? */ + if (S_ISDIR(inode->i_mode) && + ci->i_files == 0 && ci->i_subdirs == 0 && + (caps & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && + !__ceph_dir_is_complete(ci)) { + dout(" marking %p complete (empty)\n", inode); + __ceph_dir_set_complete(ci, + atomic_read(&ci->i_release_count), + ci->i_ordered_count); + } + wake = true; } else { dout(" %p got snap_caps %s\n", inode, @@ -1446,12 +1447,14 @@ retry_lookup: } if (!dn->d_inode) { - dn = splice_dentry(dn, in, NULL); - if (IS_ERR(dn)) { - err = PTR_ERR(dn); + struct dentry *realdn = splice_dentry(dn, in, NULL); + if (IS_ERR(realdn)) { + err = PTR_ERR(realdn); + d_drop(dn); dn = NULL; goto next_item; } + dn = realdn; } di = dn->d_fsdata; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 5f62fb7a5d0a..71c073f38e54 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -480,6 +480,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, mdsc->max_sessions = newmax; } mdsc->sessions[mds] = s; + atomic_inc(&mdsc->num_sessions); atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, @@ -503,6 +504,7 @@ static void __unregister_session(struct ceph_mds_client *mdsc, mdsc->sessions[s->s_mds] = NULL; ceph_con_close(&s->s_con); ceph_put_mds_session(s); + atomic_dec(&mdsc->num_sessions); } /* @@ -842,8 +844,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 struct ceph_options *opt = mdsc->fsc->client->options; void *p; - const char* metadata[3][2] = { + const char* metadata[][2] = { {"hostname", utsname()->nodename}, + {"kernel_version", utsname()->release}, {"entity_id", opt->name ? 
opt->name : ""}, {NULL, NULL} }; @@ -1464,19 +1467,33 @@ out_unlocked: return err; } +static int check_cap_flush(struct inode *inode, u64 want_flush_seq) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + spin_lock(&ci->i_ceph_lock); + if (ci->i_flushing_caps) + ret = ci->i_cap_flush_seq >= want_flush_seq; + else + ret = 1; + spin_unlock(&ci->i_ceph_lock); + return ret; +} + /* * flush all dirty inode data to disk. * * returns true if we've flushed through want_flush_seq */ -static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) +static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) { - int mds, ret = 1; + int mds; dout("check_cap_flush want %lld\n", want_flush_seq); mutex_lock(&mdsc->mutex); - for (mds = 0; ret && mds < mdsc->max_sessions; mds++) { + for (mds = 0; mds < mdsc->max_sessions; mds++) { struct ceph_mds_session *session = mdsc->sessions[mds]; + struct inode *inode = NULL; if (!session) continue; @@ -1489,29 +1506,29 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) list_entry(session->s_cap_flushing.next, struct ceph_inode_info, i_flushing_item); - struct inode *inode = &ci->vfs_inode; - spin_lock(&ci->i_ceph_lock); - if (ci->i_cap_flush_seq <= want_flush_seq) { + if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) { dout("check_cap_flush still flushing %p " - "seq %lld <= %lld to mds%d\n", inode, - ci->i_cap_flush_seq, want_flush_seq, - session->s_mds); - ret = 0; + "seq %lld <= %lld to mds%d\n", + &ci->vfs_inode, ci->i_cap_flush_seq, + want_flush_seq, session->s_mds); + inode = igrab(&ci->vfs_inode); } - spin_unlock(&ci->i_ceph_lock); } mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); - if (!ret) - return ret; + if (inode) { + wait_event(mdsc->cap_flushing_wq, + check_cap_flush(inode, want_flush_seq)); + iput(inode); + } + mutex_lock(&mdsc->mutex); } mutex_unlock(&mdsc->mutex); dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); - return ret; } /* @@ -1923,7 +1940,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, head->num_releases = cpu_to_le16(releases); /* time stamp */ - ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); + { + struct ceph_timespec ts; + ceph_encode_timespec(&ts, &req->r_stamp); + ceph_encode_copy(&p, &ts, sizeof(ts)); + } BUG_ON(p > end); msg->front.iov_len = p - msg->front.iov_base; @@ -2012,7 +2033,11 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, /* time stamp */ p = msg->front.iov_base + req->r_request_release_offset; - ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); + { + struct ceph_timespec ts; + ceph_encode_timespec(&ts, &req->r_stamp); + ceph_encode_copy(&p, &ts, sizeof(ts)); + } msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); @@ -2159,6 +2184,8 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) p = rb_next(p); if (req->r_got_unsafe) continue; + if (req->r_attempts > 0) + continue; /* only new requests */ if (req->r_session && req->r_session->s_mds == mds) { dout(" kicking tid %llu\n", req->r_tid); @@ -2286,6 +2313,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) struct ceph_mds_request *req; struct ceph_mds_reply_head *head = msg->front.iov_base; struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */ + struct ceph_snap_realm *realm; u64 tid; int err, result; int mds = session->s_mds; @@ -2401,11 +2429,13 @@ static void handle_reply(struct 
ceph_mds_session *session, struct ceph_msg *msg) } /* snap trace */ + realm = NULL; if (rinfo->snapblob_len) { down_write(&mdsc->snap_rwsem); ceph_update_snap_trace(mdsc, rinfo->snapblob, - rinfo->snapblob + rinfo->snapblob_len, - le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP); + rinfo->snapblob + rinfo->snapblob_len, + le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP, + &realm); downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); @@ -2423,6 +2453,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_unlock(&req->r_fill_mutex); up_read(&mdsc->snap_rwsem); + if (realm) + ceph_put_snap_realm(mdsc, realm); out_err: mutex_lock(&mdsc->mutex); if (!req->r_aborted) { @@ -2487,6 +2519,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); BUG_ON(req->r_err); BUG_ON(req->r_got_result); + req->r_attempts = 0; req->r_num_fwd = fwd_seq; req->r_resend_mds = next_mds; put_request_session(req); @@ -2580,6 +2613,14 @@ static void handle_session(struct ceph_mds_session *session, send_flushmsg_ack(mdsc, session, seq); break; + case CEPH_SESSION_FORCE_RO: + dout("force_session_readonly %p\n", session); + spin_lock(&session->s_cap_lock); + session->s_readonly = true; + spin_unlock(&session->s_cap_lock); + wake_up_session_caps(session, 0); + break; + default: pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); WARN_ON(1); @@ -2610,6 +2651,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_mds_request *req, *nreq; + struct rb_node *p; int err; dout("replay_unsafe_requests mds%d\n", session->s_mds); @@ -2622,6 +2664,28 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, ceph_con_send(&session->s_con, req->r_request); } } + + /* + * Also re-send old requests when the MDS enters the reconnect stage, + * so that the MDS can process completed requests in the clientreplay stage. + */ + p = rb_first(&mdsc->request_tree); + while (p) { + req = rb_entry(p, struct ceph_mds_request, r_node); + p = rb_next(p); + if (req->r_got_unsafe) + continue; + if (req->r_attempts == 0) + continue; /* only old requests */ + if (req->r_session && + req->r_session->s_mds == session->s_mds) { + err = __prepare_send_request(mdsc, req, session->s_mds); + if (!err) { + ceph_msg_get(req->r_request); + ceph_con_send(&session->s_con, req->r_request); + } + } + } mutex_unlock(&mdsc->mutex); } @@ -2787,6 +2851,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, spin_unlock(&session->s_gen_ttl_lock); spin_lock(&session->s_cap_lock); + /* don't know if session is readonly */ + session->s_readonly = 0; /* * notify __ceph_remove_cap() that we are composing cap reconnect.
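Both request-encoding sites above stop memcpying a raw struct timespec onto the wire and convert through struct ceph_timespec first; the kernel struct's size differs between 32- and 64-bit builds, while the wire struct has a fixed little-endian layout. The conversion step:

	{
		struct ceph_timespec ts;

		ceph_encode_timespec(&ts, &req->r_stamp);	/* host -> wire layout */
		ceph_encode_copy(&p, &ts, sizeof(ts));		/* fixed-size copy */
	}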
* If a cap get released before being added to the cap reconnect, @@ -2933,9 +2999,6 @@ static void check_new_map(struct ceph_mds_client *mdsc, mutex_unlock(&s->s_mutex); s->s_state = CEPH_MDS_SESSION_RESTARTING; } - - /* kick any requests waiting on the recovering mds */ - kick_requests(mdsc, i); } else if (oldstate == newstate) { continue; /* nothing new with this mds */ } @@ -3295,6 +3358,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); mdsc->sessions = NULL; + atomic_set(&mdsc->num_sessions, 0); mdsc->max_sessions = 0; mdsc->stopping = 0; init_rwsem(&mdsc->snap_rwsem); @@ -3428,14 +3492,17 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) dout("sync\n"); mutex_lock(&mdsc->mutex); want_tid = mdsc->last_tid; - want_flush = mdsc->cap_flush_seq; mutex_unlock(&mdsc->mutex); - dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); ceph_flush_dirty_caps(mdsc); + spin_lock(&mdsc->cap_dirty_lock); + want_flush = mdsc->cap_flush_seq; + spin_unlock(&mdsc->cap_dirty_lock); + + dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); wait_unsafe_requests(mdsc, want_tid); - wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); + wait_caps_flush(mdsc, want_flush); } /* @@ -3443,17 +3510,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) */ static bool done_closing_sessions(struct ceph_mds_client *mdsc) { - int i, n = 0; - if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) return true; - - mutex_lock(&mdsc->mutex); - for (i = 0; i < mdsc->max_sessions; i++) - if (mdsc->sessions[i]) - n++; - mutex_unlock(&mdsc->mutex); - return n == 0; + return atomic_read(&mdsc->num_sessions) == 0; } /* diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index e2817d00f7d9..1875b5d985c6 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -137,6 +137,7 @@ struct ceph_mds_session { int s_nr_caps, s_trim_caps; int s_num_cap_releases; int s_cap_reconnect; + int s_readonly; struct list_head s_cap_releases; /* waiting cap_release messages */ struct list_head s_cap_releases_done; /* ready to send */ struct ceph_cap *s_cap_iterator; @@ -272,6 +273,7 @@ struct ceph_mds_client { struct list_head waiting_for_map; struct ceph_mds_session **sessions; /* NULL for mds if no session */ + atomic_t num_sessions; int max_sessions; /* len of s_mds_sessions */ int stopping; /* true if shutting down */ diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index ce35fbd4ba5d..a97e39f09ba6 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -70,13 +70,11 @@ void ceph_get_snap_realm(struct ceph_mds_client *mdsc, * safe. we do need to protect against concurrent empty list * additions, however. */ - if (atomic_read(&realm->nref) == 0) { + if (atomic_inc_return(&realm->nref) == 1) { spin_lock(&mdsc->snap_empty_lock); list_del_init(&realm->empty_item); spin_unlock(&mdsc->snap_empty_lock); } - - atomic_inc(&realm->nref); } static void __insert_snap_realm(struct rb_root *root, @@ -116,7 +114,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm( if (!realm) return ERR_PTR(-ENOMEM); - atomic_set(&realm->nref, 0); /* tree does not take a ref */ + atomic_set(&realm->nref, 1); /* for caller */ realm->ino = ino; INIT_LIST_HEAD(&realm->children); INIT_LIST_HEAD(&realm->child_item); @@ -134,8 +132,8 @@ static struct ceph_snap_realm *ceph_create_snap_realm( * * caller must hold snap_rwsem for write. 
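The ceph_get_snap_realm() change above closes the window in the old read-then-increment sequence: with atomic_inc_return(), observing the 0 -> 1 transition and taking the reference is a single atomic step, so exactly one caller revives a realm parked on the empty list:

	if (atomic_inc_return(&realm->nref) == 1) {
		/* we incremented from zero: pull the realm off the empty list */
		spin_lock(&mdsc->snap_empty_lock);
		list_del_init(&realm->empty_item);
		spin_unlock(&mdsc->snap_empty_lock);
	}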
*/ -struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, - u64 ino) +static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino) { struct rb_node *n = mdsc->snap_realms.rb_node; struct ceph_snap_realm *r; @@ -154,6 +152,16 @@ struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, return NULL; } +struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino) +{ + struct ceph_snap_realm *r; + r = __lookup_snap_realm(mdsc, ino); + if (r) + ceph_get_snap_realm(mdsc, r); + return r; +} + static void __put_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm); @@ -273,7 +281,6 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, } realm->parent_ino = parentino; realm->parent = parent; - ceph_get_snap_realm(mdsc, parent); list_add(&realm->child_item, &parent->children); return 1; } @@ -631,12 +638,14 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) * Caller must hold snap_rwsem for write. */ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, - void *p, void *e, bool deletion) + void *p, void *e, bool deletion, + struct ceph_snap_realm **realm_ret) { struct ceph_mds_snap_realm *ri; /* encoded */ __le64 *snaps; /* encoded */ __le64 *prior_parent_snaps; /* encoded */ - struct ceph_snap_realm *realm; + struct ceph_snap_realm *realm = NULL; + struct ceph_snap_realm *first_realm = NULL; int invalidate = 0; int err = -ENOMEM; LIST_HEAD(dirty_realms); @@ -704,13 +713,18 @@ more: dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, realm, invalidate, p, e); - if (p < e) - goto more; - /* invalidate when we reach the _end_ (root) of the trace */ - if (invalidate) + if (invalidate && p >= e) rebuild_snap_realms(realm); + if (!first_realm) + first_realm = realm; + else + ceph_put_snap_realm(mdsc, realm); + + if (p < e) + goto more; + /* * queue cap snaps _after_ we've built the new snap contexts, * so that i_head_snapc can be set appropriately. @@ -721,12 +735,21 @@ more: queue_realm_cap_snaps(realm); } + if (realm_ret) + *realm_ret = first_realm; + else + ceph_put_snap_realm(mdsc, first_realm); + __cleanup_empty_realms(mdsc); return 0; bad: err = -EINVAL; fail: + if (realm && !IS_ERR(realm)) + ceph_put_snap_realm(mdsc, realm); + if (first_realm) + ceph_put_snap_realm(mdsc, first_realm); pr_err("update_snap_trace error %d\n", err); return err; } @@ -844,7 +867,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, if (IS_ERR(realm)) goto out; } - ceph_get_snap_realm(mdsc, realm); dout("splitting snap_realm %llx %p\n", realm->ino, realm); for (i = 0; i < num_split_inos; i++) { @@ -905,7 +927,7 @@ skip_inode: /* we may have taken some of the old realm's children. */ for (i = 0; i < num_split_realms; i++) { struct ceph_snap_realm *child = - ceph_lookup_snap_realm(mdsc, + __lookup_snap_realm(mdsc, le64_to_cpu(split_realms[i])); if (!child) continue; @@ -918,7 +940,7 @@ skip_inode: * snap, we can avoid queueing cap_snaps. 
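With the realm_ret out-parameter, ceph_update_snap_trace() hands back the first decoded realm with its reference still held, and a NULL realm_ret (as in the ceph_handle_snap() call that follows) asks it to drop that reference internally. Caller-side pairing, as a sketch:

	struct ceph_snap_realm *realm = NULL;
	int err;

	err = ceph_update_snap_trace(mdsc, p, e, deletion, &realm);
	if (!err && realm)
		ceph_put_snap_realm(mdsc, realm);	/* drop the returned reference */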
*/ ceph_update_snap_trace(mdsc, p, e, - op == CEPH_SNAP_OP_DESTROY); + op == CEPH_SNAP_OP_DESTROY, NULL); if (op == CEPH_SNAP_OP_SPLIT) /* we took a reference when we created the realm, above */ diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 5ae62587a71d..a63997b8bcff 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -414,6 +414,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",noshare"); if (opt->flags & CEPH_OPT_NOCRC) seq_puts(m, ",nocrc"); + if (opt->flags & CEPH_OPT_NOMSGAUTH) + seq_puts(m, ",nocephx_require_signatures"); + if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0) + seq_puts(m, ",notcp_nodelay"); if (opt->name) seq_printf(m, ",name=%s", opt->name); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index e1aa32d0759d..04c8124ed30e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -693,7 +693,8 @@ extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm); extern int ceph_update_snap_trace(struct ceph_mds_client *m, - void *p, void *e, bool deletion); + void *p, void *e, bool deletion, + struct ceph_snap_realm **realm_ret); extern void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); @@ -892,7 +893,9 @@ extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, int ceph_uninline_data(struct file *filp, struct page *locked_page); /* dir.c */ extern const struct file_operations ceph_dir_fops; +extern const struct file_operations ceph_snapdir_fops; extern const struct inode_operations ceph_dir_iops; +extern const struct inode_operations ceph_snapdir_iops; extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, ceph_snapdir_dentry_ops; diff --git a/include/dt-bindings/thermal/thermal_exynos.h b/include/dt-bindings/thermal/thermal_exynos.h new file mode 100644 index 000000000000..0646500bca69 --- /dev/null +++ b/include/dt-bindings/thermal/thermal_exynos.h @@ -0,0 +1,28 @@ +/* + * thermal_exynos.h - Samsung EXYNOS TMU device tree definitions + * + * Copyright (C) 2014 Samsung Electronics + * Lukasz Majewski <l.majewski@samsung.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#ifndef _EXYNOS_THERMAL_TMU_DT_H +#define _EXYNOS_THERMAL_TMU_DT_H + +#define TYPE_ONE_POINT_TRIMMING 0 +#define TYPE_ONE_POINT_TRIMMING_25 1 +#define TYPE_ONE_POINT_TRIMMING_85 2 +#define TYPE_TWO_POINT_TRIMMING 3 +#define TYPE_NONE 4 + +#endif /* _EXYNOS_THERMAL_TMU_DT_H */ diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index c0dadaac26e3..31eb03d0c766 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -158,17 +158,6 @@ enum { }; -/* pool operations */ -enum { - POOL_OP_CREATE = 0x01, - POOL_OP_DELETE = 0x02, - POOL_OP_AUID_CHANGE = 0x03, - POOL_OP_CREATE_SNAP = 0x11, - POOL_OP_DELETE_SNAP = 0x12, - POOL_OP_CREATE_UNMANAGED_SNAP = 0x21, - POOL_OP_DELETE_UNMANAGED_SNAP = 0x22, -}; - struct ceph_mon_request_header { __le64 have_version; __le16 session_mon; @@ -191,31 +180,6 @@ struct ceph_mon_statfs_reply { struct ceph_statfs st; } __attribute__ ((packed)); -const char *ceph_pool_op_name(int op); - -struct ceph_mon_poolop { - struct ceph_mon_request_header monhdr; - struct ceph_fsid fsid; - __le32 pool; - __le32 op; - __le64 auid; - __le64 snapid; - __le32 name_len; -} __attribute__ ((packed)); - -struct ceph_mon_poolop_reply { - struct ceph_mon_request_header monhdr; - struct ceph_fsid fsid; - __le32 reply_code; - __le32 epoch; - char has_data; - char data[0]; -} __attribute__ ((packed)); - -struct ceph_mon_unmanaged_snap { - __le64 snapid; -} __attribute__ ((packed)); - struct ceph_osd_getmap { struct ceph_mon_request_header monhdr; struct ceph_fsid fsid; @@ -307,6 +271,7 @@ enum { CEPH_SESSION_RECALL_STATE, CEPH_SESSION_FLUSHMSG, CEPH_SESSION_FLUSHMSG_ACK, + CEPH_SESSION_FORCE_RO, }; extern const char *ceph_session_op_name(int op); diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 8b11a79ca1cb..16fff9608848 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -30,8 +30,9 @@ #define CEPH_OPT_MYIP (1<<2) /* specified my ip */ #define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */ #define CEPH_OPT_NOMSGAUTH (1<<4) /* not require cephx message signature */ +#define CEPH_OPT_TCP_NODELAY (1<<5) /* TCP_NODELAY on TCP sockets */ -#define CEPH_OPT_DEFAULT (0) +#define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY) #define ceph_set_opt(client, opt) \ (client)->options->flags |= CEPH_OPT_##opt; diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index d9d396c16503..e15499422fdc 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -57,6 +57,7 @@ struct ceph_messenger { atomic_t stopping; bool nocrc; + bool tcp_nodelay; /* * the global_seq counts connections i (attempt to) initiate @@ -264,7 +265,8 @@ extern void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr, u64 supported_features, u64 required_features, - bool nocrc); + bool nocrc, + bool tcp_nodelay); extern void ceph_con_init(struct ceph_connection *con, void *private, const struct ceph_connection_operations *ops, diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index deb47e45ac7c..81810dc21f06 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -40,7 +40,7 @@ struct ceph_mon_request { }; /* - * ceph_mon_generic_request is being used for the statfs, poolop and + * ceph_mon_generic_request is being used for the statfs and * mon_get_version requests which are being done a bit differently * because we need to get data back to the caller */ @@ -50,7 +50,6 @@ struct 
ceph_mon_generic_request { struct rb_node node; int result; void *buf; - int buf_len; struct completion completion; struct ceph_msg *request; /* original request */ struct ceph_msg *reply; /* and reply */ @@ -117,10 +116,4 @@ extern int ceph_monc_open_session(struct ceph_mon_client *monc); extern int ceph_monc_validate_auth(struct ceph_mon_client *monc); -extern int ceph_monc_create_snapid(struct ceph_mon_client *monc, - u32 pool, u64 *snapid); - -extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc, - u32 pool, u64 snapid); - #endif diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 611e1c5893b4..b6dec05c7196 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -495,8 +495,7 @@ struct btrfs_ioctl_send_args { /* Error codes as returned by the kernel */ enum btrfs_err_code { - notused, - BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET, + BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1, BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET, BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET, BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET, diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 5d5ab67f516d..ec565508e904 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -239,6 +239,8 @@ enum { Opt_nocrc, Opt_cephx_require_signatures, Opt_nocephx_require_signatures, + Opt_tcp_nodelay, + Opt_notcp_nodelay, }; static match_table_t opt_tokens = { @@ -259,6 +261,8 @@ static match_table_t opt_tokens = { {Opt_nocrc, "nocrc"}, {Opt_cephx_require_signatures, "cephx_require_signatures"}, {Opt_nocephx_require_signatures, "nocephx_require_signatures"}, + {Opt_tcp_nodelay, "tcp_nodelay"}, + {Opt_notcp_nodelay, "notcp_nodelay"}, {-1, NULL} }; @@ -457,6 +461,7 @@ ceph_parse_options(char *options, const char *dev_name, case Opt_nocrc: opt->flags |= CEPH_OPT_NOCRC; break; + case Opt_cephx_require_signatures: opt->flags &= ~CEPH_OPT_NOMSGAUTH; break; @@ -464,6 +469,13 @@ ceph_parse_options(char *options, const char *dev_name, opt->flags |= CEPH_OPT_NOMSGAUTH; break; + case Opt_tcp_nodelay: + opt->flags |= CEPH_OPT_TCP_NODELAY; + break; + case Opt_notcp_nodelay: + opt->flags &= ~CEPH_OPT_TCP_NODELAY; + break; + default: BUG_ON(token); } @@ -518,10 +530,12 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, /* msgr */ if (ceph_test_opt(client, MYIP)) myaddr = &client->options->my_addr; + ceph_messenger_init(&client->msgr, myaddr, client->supported_features, client->required_features, - ceph_test_opt(client, NOCRC)); + ceph_test_opt(client, NOCRC), + ceph_test_opt(client, TCP_NODELAY)); /* subsystems */ err = ceph_monc_init(&client->monc, client); diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c index 30560202f57b..139a9cb19b0c 100644 --- a/net/ceph/ceph_strings.c +++ b/net/ceph/ceph_strings.c @@ -42,17 +42,3 @@ const char *ceph_osd_state_name(int s) return "???"; } } - -const char *ceph_pool_op_name(int op) -{ - switch (op) { - case POOL_OP_CREATE: return "create"; - case POOL_OP_DELETE: return "delete"; - case POOL_OP_AUID_CHANGE: return "auid change"; - case POOL_OP_CREATE_SNAP: return "create snap"; - case POOL_OP_DELETE_SNAP: return "delete snap"; - case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap"; - case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap"; - } - return "???"; -} diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index d2d525529f87..14d9995097cc 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -127,8 +127,6 @@ static int monc_show(struct seq_file *s, void *p) op = 
le16_to_cpu(req->request->hdr.type); if (op == CEPH_MSG_STATFS) seq_printf(s, "%llu statfs\n", req->tid); - else if (op == CEPH_MSG_POOLOP) - seq_printf(s, "%llu poolop\n", req->tid); else if (op == CEPH_MSG_MON_GET_VERSION) seq_printf(s, "%llu mon_get_version", req->tid); else diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 33a2f201e460..6b3f54ed65ba 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -510,6 +510,16 @@ static int ceph_tcp_connect(struct ceph_connection *con) return ret; } + if (con->msgr->tcp_nodelay) { + int optval = 1; + + ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char *)&optval, sizeof(optval)); + if (ret) + pr_err("kernel_setsockopt(TCP_NODELAY) failed: %d", + ret); + } + sk_set_memalloc(sock->sk); con->sock = sock; @@ -2922,7 +2932,8 @@ void ceph_messenger_init(struct ceph_messenger *msgr, struct ceph_entity_addr *myaddr, u64 supported_features, u64 required_features, - bool nocrc) + bool nocrc, + bool tcp_nodelay) { msgr->supported_features = supported_features; msgr->required_features = required_features; @@ -2937,6 +2948,7 @@ void ceph_messenger_init(struct ceph_messenger *msgr, get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); encode_my_addr(msgr); msgr->nocrc = nocrc; + msgr->tcp_nodelay = tcp_nodelay; atomic_set(&msgr->stopping, 0); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index f2148e22b148..2b3cf05e87b0 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -410,7 +410,7 @@ out_unlocked: } /* - * generic requests (e.g., statfs, poolop) + * generic requests (currently statfs, mon_get_version) */ static struct ceph_mon_generic_request *__lookup_generic_req( struct ceph_mon_client *monc, u64 tid) @@ -569,7 +569,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc, return; bad: - pr_err("corrupt generic reply, tid %llu\n", tid); + pr_err("corrupt statfs reply, tid %llu\n", tid); ceph_msg_dump(msg); } @@ -588,7 +588,6 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) kref_init(&req->kref); req->buf = buf; - req->buf_len = sizeof(*buf); init_completion(&req->completion); err = -ENOMEM; @@ -611,7 +610,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) err = do_generic_request(monc, req); out: - kref_put(&req->kref, release_generic_request); + put_generic_request(req); return err; } EXPORT_SYMBOL(ceph_monc_do_statfs); @@ -647,7 +646,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc, return; bad: - pr_err("corrupt mon_get_version reply\n"); + pr_err("corrupt mon_get_version reply, tid %llu\n", tid); ceph_msg_dump(msg); } @@ -670,7 +669,6 @@ int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what, kref_init(&req->kref); req->buf = newest; - req->buf_len = sizeof(*newest); init_completion(&req->completion); req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION, @@ -701,134 +699,12 @@ int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what, mutex_unlock(&monc->mutex); out: - kref_put(&req->kref, release_generic_request); + put_generic_request(req); return err; } EXPORT_SYMBOL(ceph_monc_do_get_version); /* - * pool ops - */ -static int get_poolop_reply_buf(const char *src, size_t src_len, - char *dst, size_t dst_len) -{ - u32 buf_len; - - if (src_len != sizeof(u32) + dst_len) - return -EINVAL; - - buf_len = le32_to_cpu(*(__le32 *)src); - if (buf_len != dst_len) - return -EINVAL; - - memcpy(dst, src + sizeof(u32), dst_len); - return 0; -} - 
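The messenger's kernel_setsockopt(SOL_TCP, TCP_NODELAY) call added above is the in-kernel spelling of the standard sockets option that disables Nagle's algorithm, so small messages go out immediately rather than being coalesced. For reference, the userspace equivalent:

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <sys/socket.h>

	static int set_tcp_nodelay(int fd)
	{
		int one = 1;

		/* disable Nagle: don't hold small segments waiting for ACKs */
		return setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
	}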
-static void handle_poolop_reply(struct ceph_mon_client *monc, - struct ceph_msg *msg) -{ - struct ceph_mon_generic_request *req; - struct ceph_mon_poolop_reply *reply = msg->front.iov_base; - u64 tid = le64_to_cpu(msg->hdr.tid); - - if (msg->front.iov_len < sizeof(*reply)) - goto bad; - dout("handle_poolop_reply %p tid %llu\n", msg, tid); - - mutex_lock(&monc->mutex); - req = __lookup_generic_req(monc, tid); - if (req) { - if (req->buf_len && - get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply), - msg->front.iov_len - sizeof(*reply), - req->buf, req->buf_len) < 0) { - mutex_unlock(&monc->mutex); - goto bad; - } - req->result = le32_to_cpu(reply->reply_code); - get_generic_request(req); - } - mutex_unlock(&monc->mutex); - if (req) { - complete(&req->completion); - put_generic_request(req); - } - return; - -bad: - pr_err("corrupt generic reply, tid %llu\n", tid); - ceph_msg_dump(msg); -} - -/* - * Do a synchronous pool op. - */ -static int do_poolop(struct ceph_mon_client *monc, u32 op, - u32 pool, u64 snapid, - char *buf, int len) -{ - struct ceph_mon_generic_request *req; - struct ceph_mon_poolop *h; - int err; - - req = kzalloc(sizeof(*req), GFP_NOFS); - if (!req) - return -ENOMEM; - - kref_init(&req->kref); - req->buf = buf; - req->buf_len = len; - init_completion(&req->completion); - - err = -ENOMEM; - req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS, - true); - if (!req->request) - goto out; - req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS, - true); - if (!req->reply) - goto out; - - /* fill out request */ - req->request->hdr.version = cpu_to_le16(2); - h = req->request->front.iov_base; - h->monhdr.have_version = 0; - h->monhdr.session_mon = cpu_to_le16(-1); - h->monhdr.session_mon_tid = 0; - h->fsid = monc->monmap->fsid; - h->pool = cpu_to_le32(pool); - h->op = cpu_to_le32(op); - h->auid = 0; - h->snapid = cpu_to_le64(snapid); - h->name_len = 0; - - err = do_generic_request(monc, req); - -out: - kref_put(&req->kref, release_generic_request); - return err; -} - -int ceph_monc_create_snapid(struct ceph_mon_client *monc, - u32 pool, u64 *snapid) -{ - return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, - pool, 0, (char *)snapid, sizeof(*snapid)); - -} -EXPORT_SYMBOL(ceph_monc_create_snapid); - -int ceph_monc_delete_snapid(struct ceph_mon_client *monc, - u32 pool, u64 snapid) -{ - return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, - pool, snapid, NULL, 0); - -} - -/* * Resend pending generic requests. 
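With the pool ops gone, the remaining generic requests (statfs and mon_get_version) manage lifetime purely through the kref embedded in ceph_mon_generic_request, and the open-coded kref_put(&req->kref, release_generic_request) calls are folded into put_generic_request(). The underlying kref idiom, as a generic sketch:

	#include <linux/kref.h>
	#include <linux/slab.h>

	struct generic_req {
		struct kref kref;	/* kref_init()ed to 1 at allocation */
		/* ... request/reply state ... */
	};

	static void release_req(struct kref *kref)
	{
		struct generic_req *req = container_of(kref, struct generic_req, kref);

		kfree(req);		/* last reference is gone */
	}

	static void put_req(struct generic_req *req)
	{
		kref_put(&req->kref, release_req);
	}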
*/ static void __resend_generic_request(struct ceph_mon_client *monc) @@ -1112,10 +988,6 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) handle_get_version_reply(monc, msg); break; - case CEPH_MSG_POOLOP_REPLY: - handle_poolop_reply(monc, msg); - break; - case CEPH_MSG_MON_MAP: ceph_monc_handle_map(monc, msg); break; @@ -1154,7 +1026,6 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, case CEPH_MSG_MON_SUBSCRIBE_ACK: m = ceph_msg_get(monc->m_subscribe_ack); break; - case CEPH_MSG_POOLOP_REPLY: case CEPH_MSG_STATFS_REPLY: return get_generic_reply(con, hdr, skip); case CEPH_MSG_AUTH_REPLY: diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 53299c7b0ca4..41a4abc7e98e 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1035,10 +1035,11 @@ static void put_osd(struct ceph_osd *osd) { dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), atomic_read(&osd->o_ref) - 1); - if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) { + if (atomic_dec_and_test(&osd->o_ref)) { struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; - ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer); + if (osd->o_auth.authorizer) + ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer); kfree(osd); } } @@ -1048,14 +1049,24 @@ static void put_osd(struct ceph_osd *osd) */ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) { - dout("__remove_osd %p\n", osd); + dout("%s %p osd%d\n", __func__, osd, osd->o_osd); WARN_ON(!list_empty(&osd->o_requests)); WARN_ON(!list_empty(&osd->o_linger_requests)); - rb_erase(&osd->o_node, &osdc->osds); list_del_init(&osd->o_osd_lru); - ceph_con_close(&osd->o_con); - put_osd(osd); + rb_erase(&osd->o_node, &osdc->osds); + RB_CLEAR_NODE(&osd->o_node); +} + +static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) +{ + dout("%s %p osd%d\n", __func__, osd, osd->o_osd); + + if (!RB_EMPTY_NODE(&osd->o_node)) { + ceph_con_close(&osd->o_con); + __remove_osd(osdc, osd); + put_osd(osd); + } } static void remove_all_osds(struct ceph_osd_client *osdc) @@ -1065,7 +1076,7 @@ static void remove_all_osds(struct ceph_osd_client *osdc) while (!RB_EMPTY_ROOT(&osdc->osds)) { struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), struct ceph_osd, o_node); - __remove_osd(osdc, osd); + remove_osd(osdc, osd); } mutex_unlock(&osdc->request_mutex); } @@ -1106,7 +1117,7 @@ static void remove_old_osds(struct ceph_osd_client *osdc) list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { if (time_before(jiffies, osd->lru_ttl)) break; - __remove_osd(osdc, osd); + remove_osd(osdc, osd); } mutex_unlock(&osdc->request_mutex); } @@ -1121,8 +1132,7 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) dout("__reset_osd %p osd%d\n", osd, osd->o_osd); if (list_empty(&osd->o_requests) && list_empty(&osd->o_linger_requests)) { - __remove_osd(osdc, osd); - + remove_osd(osdc, osd); return -ENODEV; } @@ -1926,6 +1936,7 @@ static void reset_changed_osds(struct ceph_osd_client *osdc) { struct rb_node *p, *n; + dout("%s %p\n", __func__, osdc); for (p = rb_first(&osdc->osds); p; p = n) { struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); |
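The __remove_osd()/remove_osd() split above makes OSD teardown idempotent: the inner half unlinks and clears the rb_node, while the outer half closes the connection and drops the reference only if the node is still on the tree. The rbtree idiom in isolation:

	#include <linux/rbtree.h>

	static void unlink_node(struct rb_root *root, struct rb_node *node)
	{
		rb_erase(node, root);
		RB_CLEAR_NODE(node);		/* mark it "not on any tree" */
	}

	static void remove_node_once(struct rb_root *root, struct rb_node *node)
	{
		if (!RB_EMPTY_NODE(node))	/* still linked? */
			unlink_node(root, node);
	}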