summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/devicetree/bindings/thermal/exynos-thermal.txt21
-rw-r--r--Documentation/devicetree/bindings/thermal/thermal.txt74
-rw-r--r--MAINTAINERS7
-rw-r--r--arch/x86/include/asm/pgtable.h6
-rw-r--r--block/blk-throttle.c3
-rw-r--r--drivers/block/rbd.c193
-rw-r--r--drivers/cpufreq/Kconfig.arm44
-rw-r--r--drivers/cpufreq/Makefile9
-rw-r--r--drivers/cpufreq/exynos-cpufreq.c33
-rw-r--r--drivers/thermal/of-thermal.c3
-rw-r--r--drivers/thermal/rockchip_thermal.c36
-rw-r--r--drivers/thermal/samsung/Kconfig9
-rw-r--r--drivers/thermal/samsung/Makefile2
-rw-r--r--drivers/thermal/samsung/exynos_thermal_common.c427
-rw-r--r--drivers/thermal/samsung/exynos_thermal_common.h106
-rw-r--r--drivers/thermal/samsung/exynos_tmu.c553
-rw-r--r--drivers/thermal/samsung/exynos_tmu.h77
-rw-r--r--drivers/thermal/samsung/exynos_tmu_data.c264
-rw-r--r--fs/btrfs/backref.c28
-rw-r--r--fs/btrfs/backref.h3
-rw-r--r--fs/btrfs/btrfs_inode.h3
-rw-r--r--fs/btrfs/ctree.c55
-rw-r--r--fs/btrfs/ctree.h39
-rw-r--r--fs/btrfs/delayed-inode.c38
-rw-r--r--fs/btrfs/dev-replace.c25
-rw-r--r--fs/btrfs/disk-io.c102
-rw-r--r--fs/btrfs/disk-io.h6
-rw-r--r--fs/btrfs/extent-tree.c250
-rw-r--r--fs/btrfs/extent_io.c87
-rw-r--r--fs/btrfs/extent_io.h65
-rw-r--r--fs/btrfs/free-space-cache.c13
-rw-r--r--fs/btrfs/inode-item.c9
-rw-r--r--fs/btrfs/inode.c156
-rw-r--r--fs/btrfs/qgroup.c3
-rw-r--r--fs/btrfs/raid56.c103
-rw-r--r--fs/btrfs/raid56.h11
-rw-r--r--fs/btrfs/reada.c19
-rw-r--r--fs/btrfs/relocation.c12
-rw-r--r--fs/btrfs/scrub.c309
-rw-r--r--fs/btrfs/send.c9
-rw-r--r--fs/btrfs/super.c6
-rw-r--r--fs/btrfs/sysfs.c10
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c2
-rw-r--r--fs/btrfs/tests/extent-io-tests.c3
-rw-r--r--fs/btrfs/tests/inode-tests.c4
-rw-r--r--fs/btrfs/tests/qgroup-tests.c23
-rw-r--r--fs/btrfs/transaction.c27
-rw-r--r--fs/btrfs/transaction.h7
-rw-r--r--fs/btrfs/tree-log.c234
-rw-r--r--fs/btrfs/volumes.c242
-rw-r--r--fs/btrfs/volumes.h18
-rw-r--r--fs/ceph/acl.c14
-rw-r--r--fs/ceph/addr.c19
-rw-r--r--fs/ceph/caps.c127
-rw-r--r--fs/ceph/dir.c33
-rw-r--r--fs/ceph/file.c37
-rw-r--r--fs/ceph/inode.c41
-rw-r--r--fs/ceph/mds_client.c127
-rw-r--r--fs/ceph/mds_client.h2
-rw-r--r--fs/ceph/snap.c54
-rw-r--r--fs/ceph/super.c4
-rw-r--r--fs/ceph/super.h5
-rw-r--r--include/dt-bindings/thermal/thermal_exynos.h28
-rw-r--r--include/linux/ceph/ceph_fs.h37
-rw-r--r--include/linux/ceph/libceph.h3
-rw-r--r--include/linux/ceph/messenger.h4
-rw-r--r--include/linux/ceph/mon_client.h9
-rw-r--r--include/uapi/linux/btrfs.h3
-rw-r--r--net/ceph/ceph_common.c16
-rw-r--r--net/ceph/ceph_strings.c14
-rw-r--r--net/ceph/debugfs.c2
-rw-r--r--net/ceph/messenger.c14
-rw-r--r--net/ceph/mon_client.c139
-rw-r--r--net/ceph/osd_client.c31
74 files changed, 2110 insertions, 2441 deletions
diff --git a/Documentation/devicetree/bindings/thermal/exynos-thermal.txt b/Documentation/devicetree/bindings/thermal/exynos-thermal.txt
index ae738f562acc..695150a4136b 100644
--- a/Documentation/devicetree/bindings/thermal/exynos-thermal.txt
+++ b/Documentation/devicetree/bindings/thermal/exynos-thermal.txt
@@ -12,6 +12,7 @@
"samsung,exynos5420-tmu-ext-triminfo" for TMU channels 2, 3 and 4
Exynos5420 (Must pass triminfo base and triminfo clock)
"samsung,exynos5440-tmu"
+ "samsung,exynos7-tmu"
- interrupt-parent : The phandle for the interrupt controller
- reg : Address range of the thermal registers. For soc's which has multiple
instances of TMU and some registers are shared across all TMU's like
@@ -32,13 +33,28 @@
- clocks : The main clocks for TMU device
-- 1. operational clock for TMU channel
-- 2. optional clock to access the shared registers of TMU channel
+ -- 3. optional special clock for functional operation
- clock-names : Thermal system clock name
-- "tmu_apbif" operational clock for current TMU channel
-- "tmu_triminfo_apbif" clock to access the shared triminfo register
for current TMU channel
+ -- "tmu_sclk" clock for functional operation of the current TMU
+ channel
- vtmu-supply: This entry is optional and provides the regulator node supplying
voltage to TMU. If needed this entry can be placed inside
board/platform specific dts file.
+Following properties are mandatory (depending on SoC):
+- samsung,tmu_gain: Gain value for internal TMU operation.
+- samsung,tmu_reference_voltage: Value of TMU IP block's reference voltage
+- samsung,tmu_noise_cancel_mode: Mode for noise cancellation
+- samsung,tmu_efuse_value: Default level of temperature - it is needed when
+ the factory fusing produced a wrong value
+- samsung,tmu_min_efuse_value: Minimum temperature fused value
+- samsung,tmu_max_efuse_value: Maximum temperature fused value
+- samsung,tmu_first_point_trim: First point trimming value
+- samsung,tmu_second_point_trim: Second point trimming value
+- samsung,tmu_default_temp_offset: Default temperature offset
+- samsung,tmu_cal_type: Calibration type
Example 1):
@@ -51,6 +67,7 @@ Example 1):
clock-names = "tmu_apbif";
status = "disabled";
vtmu-supply = <&tmu_regulator_node>;
+ #include "exynos4412-tmu-sensor-conf.dtsi"
};
Example 2):
@@ -61,6 +78,7 @@ Example 2):
interrupts = <0 58 0>;
clocks = <&clock 21>;
clock-names = "tmu_apbif";
+ #include "exynos5440-tmu-sensor-conf.dtsi"
};
Example 3): (In case of Exynos5420 "with misplaced TRIMINFO register")
@@ -70,6 +88,7 @@ Example 3): (In case of Exynos5420 "with misplaced TRIMINFO register")
interrupts = <0 184 0>;
clocks = <&clock 318>, <&clock 318>;
clock-names = "tmu_apbif", "tmu_triminfo_apbif";
+ #include "exynos4412-tmu-sensor-conf.dtsi"
};
tmu_cpu3: tmu@1006c000 {
@@ -78,6 +97,7 @@ Example 3): (In case of Exynos5420 "with misplaced TRIMINFO register")
interrupts = <0 185 0>;
clocks = <&clock 318>, <&clock 319>;
clock-names = "tmu_apbif", "tmu_triminfo_apbif";
+ #include "exynos4412-tmu-sensor-conf.dtsi"
};
tmu_gpu: tmu@100a0000 {
@@ -86,6 +106,7 @@ Example 3): (In case of Exynos5420 "with misplaced TRIMINFO register")
interrupts = <0 215 0>;
clocks = <&clock 319>, <&clock 318>;
clock-names = "tmu_apbif", "tmu_triminfo_apbif";
+ #include "exynos4412-tmu-sensor-conf.dtsi"
};
Note: For multi-instance tmu each instance should have an alias correctly
diff --git a/Documentation/devicetree/bindings/thermal/thermal.txt b/Documentation/devicetree/bindings/thermal/thermal.txt
index f5db6b72a36f..29fe0bfae38e 100644
--- a/Documentation/devicetree/bindings/thermal/thermal.txt
+++ b/Documentation/devicetree/bindings/thermal/thermal.txt
@@ -251,24 +251,24 @@ ocp {
};
thermal-zones {
- cpu-thermal: cpu-thermal {
+ cpu_thermal: cpu-thermal {
polling-delay-passive = <250>; /* milliseconds */
polling-delay = <1000>; /* milliseconds */
thermal-sensors = <&bandgap0>;
trips {
- cpu-alert0: cpu-alert {
+ cpu_alert0: cpu-alert0 {
temperature = <90000>; /* millicelsius */
hysteresis = <2000>; /* millicelsius */
type = "active";
};
- cpu-alert1: cpu-alert {
+ cpu_alert1: cpu-alert1 {
temperature = <100000>; /* millicelsius */
hysteresis = <2000>; /* millicelsius */
type = "passive";
};
- cpu-crit: cpu-crit {
+ cpu_crit: cpu-crit {
temperature = <125000>; /* millicelsius */
hysteresis = <2000>; /* millicelsius */
type = "critical";
@@ -277,17 +277,17 @@ thermal-zones {
cooling-maps {
map0 {
- trip = <&cpu-alert0>;
- cooling-device = <&fan0 THERMAL_NO_LIMITS 4>;
+ trip = <&cpu_alert0>;
+ cooling-device = <&fan0 THERMAL_NO_LIMIT 4>;
};
map1 {
- trip = <&cpu-alert1>;
- cooling-device = <&fan0 5 THERMAL_NO_LIMITS>;
+ trip = <&cpu_alert1>;
+ cooling-device = <&fan0 5 THERMAL_NO_LIMIT>;
};
map2 {
- trip = <&cpu-alert1>;
+ trip = <&cpu_alert1>;
cooling-device =
- <&cpu0 THERMAL_NO_LIMITS THERMAL_NO_LIMITS>;
+ <&cpu0 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
};
};
};
@@ -298,13 +298,13 @@ used to monitor the zone 'cpu-thermal' using its sole sensor. A fan
device (fan0) is controlled via I2C bus 1, at address 0x48, and has ten
different cooling states 0-9. It is used to remove the heat out of
the thermal zone 'cpu-thermal' using its cooling states
-from its minimum to 4, when it reaches trip point 'cpu-alert0'
+from its minimum to 4, when it reaches trip point 'cpu_alert0'
at 90C, as an example of active cooling. The same cooling device is used at
-'cpu-alert1', but from 5 to its maximum state. The cpu@0 device is also
+'cpu_alert1', but from 5 to its maximum state. The cpu@0 device is also
linked to the same thermal zone, 'cpu-thermal', as a passive cooling device,
-using all its cooling states at trip point 'cpu-alert1',
+using all its cooling states at trip point 'cpu_alert1',
which is a trip point at 100C. On the thermal zone 'cpu-thermal', at the
-temperature of 125C, represented by the trip point 'cpu-crit', the silicon
+temperature of 125C, represented by the trip point 'cpu_crit', the silicon
is not reliable anymore.
(b) - IC with several internal sensors
@@ -329,7 +329,7 @@ ocp {
};
thermal-zones {
- cpu-thermal: cpu-thermal {
+ cpu_thermal: cpu-thermal {
polling-delay-passive = <250>; /* milliseconds */
polling-delay = <1000>; /* milliseconds */
@@ -338,12 +338,12 @@ thermal-zones {
trips {
/* each zone within the SoC may have its own trips */
- cpu-alert: cpu-alert {
+ cpu_alert: cpu-alert {
temperature = <100000>; /* millicelsius */
hysteresis = <2000>; /* millicelsius */
type = "passive";
};
- cpu-crit: cpu-crit {
+ cpu_crit: cpu-crit {
temperature = <125000>; /* millicelsius */
hysteresis = <2000>; /* millicelsius */
type = "critical";
@@ -356,7 +356,7 @@ thermal-zones {
};
};
- gpu-thermal: gpu-thermal {
+ gpu_thermal: gpu-thermal {
polling-delay-passive = <120>; /* milliseconds */
polling-delay = <1000>; /* milliseconds */
@@ -365,12 +365,12 @@ thermal-zones {
trips {
/* each zone within the SoC may have its own trips */
- gpu-alert: gpu-alert {
+ gpu_alert: gpu-alert {
temperature = <90000>; /* millicelsius */
hysteresis = <2000>; /* millicelsius */
type = "passive";
};
- gpu-crit: gpu-crit {
+ gpu_crit: gpu-crit {
temperature = <105000>; /* millicelsius */
hysteresis = <2000>; /* millicelsius */
type = "critical";
@@ -383,7 +383,7 @@ thermal-zones {
};
};
- dsp-thermal: dsp-thermal {
+ dsp_thermal: dsp-thermal {
polling-delay-passive = <50>; /* milliseconds */
polling-delay = <1000>; /* milliseconds */
@@ -392,12 +392,12 @@ thermal-zones {
trips {
/* each zone within the SoC may have its own trips */
- dsp-alert: gpu-alert {
+ dsp_alert: dsp-alert {
temperature = <90000>; /* millicelsius */
hysteresis = <2000>; /* millicelsius */
type = "passive";
};
- dsp-crit: gpu-crit {
+ dsp_crit: dsp-crit {
temperature = <135000>; /* millicelsius */
hysteresis = <2000>; /* millicelsius */
type = "critical";
@@ -457,7 +457,7 @@ ocp {
};
thermal-zones {
- cpu-thermal: cpu-thermal {
+ cpu_thermal: cpu-thermal {
polling-delay-passive = <250>; /* milliseconds */
polling-delay = <1000>; /* milliseconds */
@@ -508,7 +508,7 @@ with many sensors and many cooling devices.
/*
* An IC with several temperature sensor.
*/
- adc-dummy: sensor@0x50 {
+ adc_dummy: sensor@0x50 {
...
#thermal-sensor-cells = <1>; /* sensor internal ID */
};
@@ -520,7 +520,7 @@ thermal-zones {
polling-delay = <2500>; /* milliseconds */
/* sensor ID */
- thermal-sensors = <&adc-dummy 4>;
+ thermal-sensors = <&adc_dummy 4>;
trips {
...
@@ -531,14 +531,14 @@ thermal-zones {
};
};
- board-thermal: board-thermal {
+ board_thermal: board-thermal {
polling-delay-passive = <1000>; /* milliseconds */
polling-delay = <2500>; /* milliseconds */
/* sensor ID */
- thermal-sensors = <&adc-dummy 0>, /* pcb top edge */
- <&adc-dummy 1>, /* lcd */
- <&adc-dymmy 2>; /* back cover */
+ thermal-sensors = <&adc_dummy 0>, /* pcb top edge */
+ <&adc_dummy 1>, /* lcd */
+ <&adc_dummy 2>; /* back cover */
/*
* An array of coefficients describing the sensor
* linear relation. E.g.:
@@ -548,22 +548,22 @@ thermal-zones {
trips {
/* Trips are based on resulting linear equation */
- cpu-trip: cpu-trip {
+ cpu_trip: cpu-trip {
temperature = <60000>; /* millicelsius */
hysteresis = <2000>; /* millicelsius */
type = "passive";
};
- gpu-trip: gpu-trip {
+ gpu_trip: gpu-trip {
temperature = <55000>; /* millicelsius */
hysteresis = <2000>; /* millicelsius */
type = "passive";
}
- lcd-trip: lcp-trip {
+ lcd_trip: lcd-trip {
temperature = <53000>; /* millicelsius */
hysteresis = <2000>; /* millicelsius */
type = "passive";
};
- crit-trip: crit-trip {
+ crit_trip: crit-trip {
temperature = <68000>; /* millicelsius */
hysteresis = <2000>; /* millicelsius */
type = "critical";
@@ -572,17 +572,17 @@ thermal-zones {
cooling-maps {
map0 {
- trip = <&cpu-trip>;
+ trip = <&cpu_trip>;
cooling-device = <&cpu0 0 2>;
contribution = <55>;
};
map1 {
- trip = <&gpu-trip>;
+ trip = <&gpu_trip>;
cooling-device = <&gpu0 0 2>;
contribution = <20>;
};
map2 {
- trip = <&lcd-trip>;
+ trip = <&lcd_trip>;
cooling-device = <&lcd0 5 10>;
contribution = <15>;
};
diff --git a/MAINTAINERS b/MAINTAINERS
index 1921ed58d1a0..7cfcee4e2bea 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2433,7 +2433,8 @@ F: arch/powerpc/oprofile/*cell*
F: arch/powerpc/platforms/cell/
CEPH DISTRIBUTED FILE SYSTEM CLIENT
-M: Sage Weil <sage@inktank.com>
+M: Yan, Zheng <zyan@redhat.com>
+M: Sage Weil <sage@redhat.com>
L: ceph-devel@vger.kernel.org
W: http://ceph.com/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
@@ -7998,8 +7999,8 @@ S: Supported
F: drivers/net/wireless/ath/wcn36xx/
RADOS BLOCK DEVICE (RBD)
-M: Yehuda Sadeh <yehuda@inktank.com>
-M: Sage Weil <sage@inktank.com>
+M: Ilya Dryomov <idryomov@gmail.com>
+M: Sage Weil <sage@redhat.com>
M: Alex Elder <elder@kernel.org>
M: ceph-devel@vger.kernel.org
W: http://ceph.com/
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 67fc3d2b0aab..a0c35bf6cb92 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -476,12 +476,14 @@ static inline int pmd_present(pmd_t pmd)
*/
static inline int pte_protnone(pte_t pte)
{
- return pte_flags(pte) & _PAGE_PROTNONE;
+ return (pte_flags(pte) & (_PAGE_PROTNONE | _PAGE_PRESENT))
+ == _PAGE_PROTNONE;
}
static inline int pmd_protnone(pmd_t pmd)
{
- return pmd_flags(pmd) & _PAGE_PROTNONE;
+ return (pmd_flags(pmd) & (_PAGE_PROTNONE | _PAGE_PRESENT))
+ == _PAGE_PROTNONE;
}
#endif /* CONFIG_NUMA_BALANCING */
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 9273d0969ebd..5b9c6d5c3636 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1292,6 +1292,9 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
struct blkg_rwstat rwstat = { }, tmp;
int i, cpu;
+ if (tg->stats_cpu == NULL)
+ return 0;
+
for_each_possible_cpu(cpu) {
struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 8a86b62466f7..b40af3203089 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -38,6 +38,7 @@
#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
+#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
@@ -340,9 +341,7 @@ struct rbd_device {
char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
- struct list_head rq_queue; /* incoming rq queue */
spinlock_t lock; /* queue, flags, open_count */
- struct work_struct rq_work;
struct rbd_image_header header;
unsigned long flags; /* possibly lock protected */
@@ -360,6 +359,9 @@ struct rbd_device {
atomic_t parent_ref;
struct rbd_device *parent;
+ /* Block layer tags. */
+ struct blk_mq_tag_set tag_set;
+
/* protects updating the header */
struct rw_semaphore header_rwsem;
@@ -1817,7 +1819,8 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
/*
* We support a 64-bit length, but ultimately it has to be
- * passed to blk_end_request(), which takes an unsigned int.
+ * passed to the block layer, which just supports a 32-bit
+ * length field.
*/
obj_request->xferred = osd_req->r_reply_op_len[0];
rbd_assert(obj_request->xferred < (u64)UINT_MAX);
@@ -2275,7 +2278,10 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
more = obj_request->which < img_request->obj_request_count - 1;
} else {
rbd_assert(img_request->rq != NULL);
- more = blk_end_request(img_request->rq, result, xferred);
+
+ more = blk_update_request(img_request->rq, result, xferred);
+ if (!more)
+ __blk_mq_end_request(img_request->rq, result);
}
return more;
@@ -3304,8 +3310,10 @@ out:
return ret;
}
-static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
+static void rbd_queue_workfn(struct work_struct *work)
{
+ struct request *rq = blk_mq_rq_from_pdu(work);
+ struct rbd_device *rbd_dev = rq->q->queuedata;
struct rbd_img_request *img_request;
struct ceph_snap_context *snapc = NULL;
u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
@@ -3314,6 +3322,13 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
u64 mapping_size;
int result;
+ if (rq->cmd_type != REQ_TYPE_FS) {
+ dout("%s: non-fs request type %d\n", __func__,
+ (int) rq->cmd_type);
+ result = -EIO;
+ goto err;
+ }
+
if (rq->cmd_flags & REQ_DISCARD)
op_type = OBJ_OP_DISCARD;
else if (rq->cmd_flags & REQ_WRITE)
@@ -3359,6 +3374,8 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
goto err_rq; /* Shouldn't happen */
}
+ blk_mq_start_request(rq);
+
down_read(&rbd_dev->header_rwsem);
mapping_size = rbd_dev->mapping.size;
if (op_type != OBJ_OP_READ) {
@@ -3404,53 +3421,18 @@ err_rq:
rbd_warn(rbd_dev, "%s %llx at %llx result %d",
obj_op_name(op_type), length, offset, result);
ceph_put_snap_context(snapc);
- blk_end_request_all(rq, result);
+err:
+ blk_mq_end_request(rq, result);
}
-static void rbd_request_workfn(struct work_struct *work)
+static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
{
- struct rbd_device *rbd_dev =
- container_of(work, struct rbd_device, rq_work);
- struct request *rq, *next;
- LIST_HEAD(requests);
-
- spin_lock_irq(&rbd_dev->lock); /* rq->q->queue_lock */
- list_splice_init(&rbd_dev->rq_queue, &requests);
- spin_unlock_irq(&rbd_dev->lock);
+ struct request *rq = bd->rq;
+ struct work_struct *work = blk_mq_rq_to_pdu(rq);
- list_for_each_entry_safe(rq, next, &requests, queuelist) {
- list_del_init(&rq->queuelist);
- rbd_handle_request(rbd_dev, rq);
- }
-}
-
-/*
- * Called with q->queue_lock held and interrupts disabled, possibly on
- * the way to schedule(). Do not sleep here!
- */
-static void rbd_request_fn(struct request_queue *q)
-{
- struct rbd_device *rbd_dev = q->queuedata;
- struct request *rq;
- int queued = 0;
-
- rbd_assert(rbd_dev);
-
- while ((rq = blk_fetch_request(q))) {
- /* Ignore any non-FS requests that filter through. */
- if (rq->cmd_type != REQ_TYPE_FS) {
- dout("%s: non-fs request type %d\n", __func__,
- (int) rq->cmd_type);
- __blk_end_request_all(rq, 0);
- continue;
- }
-
- list_add_tail(&rq->queuelist, &rbd_dev->rq_queue);
- queued++;
- }
-
- if (queued)
- queue_work(rbd_wq, &rbd_dev->rq_work);
+ queue_work(rbd_wq, work);
+ return BLK_MQ_RQ_QUEUE_OK;
}
/*
@@ -3511,6 +3493,7 @@ static void rbd_free_disk(struct rbd_device *rbd_dev)
del_gendisk(disk);
if (disk->queue)
blk_cleanup_queue(disk->queue);
+ blk_mq_free_tag_set(&rbd_dev->tag_set);
}
put_disk(disk);
}
@@ -3694,7 +3677,7 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
ret = rbd_dev_header_info(rbd_dev);
if (ret)
- return ret;
+ goto out;
/*
* If there is a parent, see if it has disappeared due to the
@@ -3703,30 +3686,46 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
if (rbd_dev->parent) {
ret = rbd_dev_v2_parent_info(rbd_dev);
if (ret)
- return ret;
+ goto out;
}
if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
- if (rbd_dev->mapping.size != rbd_dev->header.image_size)
- rbd_dev->mapping.size = rbd_dev->header.image_size;
+ rbd_dev->mapping.size = rbd_dev->header.image_size;
} else {
/* validate mapped snapshot's EXISTS flag */
rbd_exists_validate(rbd_dev);
}
+out:
up_write(&rbd_dev->header_rwsem);
-
- if (mapping_size != rbd_dev->mapping.size)
+ if (!ret && mapping_size != rbd_dev->mapping.size)
rbd_dev_update_size(rbd_dev);
+ return ret;
+}
+
+static int rbd_init_request(void *data, struct request *rq,
+ unsigned int hctx_idx, unsigned int request_idx,
+ unsigned int numa_node)
+{
+ struct work_struct *work = blk_mq_rq_to_pdu(rq);
+
+ INIT_WORK(work, rbd_queue_workfn);
return 0;
}
+static struct blk_mq_ops rbd_mq_ops = {
+ .queue_rq = rbd_queue_rq,
+ .map_queue = blk_mq_map_queue,
+ .init_request = rbd_init_request,
+};
+
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
struct gendisk *disk;
struct request_queue *q;
u64 segment_size;
+ int err;
/* create gendisk info */
disk = alloc_disk(single_major ?
@@ -3744,10 +3743,25 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
disk->fops = &rbd_bd_ops;
disk->private_data = rbd_dev;
- q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
- if (!q)
+ memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
+ rbd_dev->tag_set.ops = &rbd_mq_ops;
+ rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ;
+ rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
+ rbd_dev->tag_set.flags =
+ BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+ rbd_dev->tag_set.nr_hw_queues = 1;
+ rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
+
+ err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
+ if (err)
goto out_disk;
+ q = blk_mq_init_queue(&rbd_dev->tag_set);
+ if (IS_ERR(q)) {
+ err = PTR_ERR(q);
+ goto out_tag_set;
+ }
+
/* We use the default size, but let's be explicit about it. */
blk_queue_physical_block_size(q, SECTOR_SIZE);
@@ -3773,10 +3787,11 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
rbd_dev->disk = disk;
return 0;
+out_tag_set:
+ blk_mq_free_tag_set(&rbd_dev->tag_set);
out_disk:
put_disk(disk);
-
- return -ENOMEM;
+ return err;
}
/*
@@ -4033,8 +4048,6 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
return NULL;
spin_lock_init(&rbd_dev->lock);
- INIT_LIST_HEAD(&rbd_dev->rq_queue);
- INIT_WORK(&rbd_dev->rq_work, rbd_request_workfn);
rbd_dev->flags = 0;
atomic_set(&rbd_dev->parent_ref, 0);
INIT_LIST_HEAD(&rbd_dev->node);
@@ -4274,32 +4287,22 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
}
/*
- * We always update the parent overlap. If it's zero we
- * treat it specially.
+ * We always update the parent overlap. If it's zero we issue
+ * a warning, as we will proceed as if there was no parent.
*/
- rbd_dev->parent_overlap = overlap;
if (!overlap) {
-
- /* A null parent_spec indicates it's the initial probe */
-
if (parent_spec) {
- /*
- * The overlap has become zero, so the clone
- * must have been resized down to 0 at some
- * point. Treat this the same as a flatten.
- */
- rbd_dev_parent_put(rbd_dev);
- pr_info("%s: clone image now standalone\n",
- rbd_dev->disk->disk_name);
+ /* refresh, careful to warn just once */
+ if (rbd_dev->parent_overlap)
+ rbd_warn(rbd_dev,
+ "clone now standalone (overlap became 0)");
} else {
- /*
- * For the initial probe, if we find the
- * overlap is zero we just pretend there was
- * no parent image.
- */
- rbd_warn(rbd_dev, "ignoring parent with overlap 0");
+ /* initial probe */
+ rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
}
}
+ rbd_dev->parent_overlap = overlap;
+
out:
ret = 0;
out_err:
@@ -4771,36 +4774,6 @@ static inline size_t next_token(const char **buf)
}
/*
- * Finds the next token in *buf, and if the provided token buffer is
- * big enough, copies the found token into it. The result, if
- * copied, is guaranteed to be terminated with '\0'. Note that *buf
- * must be terminated with '\0' on entry.
- *
- * Returns the length of the token found (not including the '\0').
- * Return value will be 0 if no token is found, and it will be >=
- * token_size if the token would not fit.
- *
- * The *buf pointer will be updated to point beyond the end of the
- * found token. Note that this occurs even if the token buffer is
- * too small to hold it.
- */
-static inline size_t copy_token(const char **buf,
- char *token,
- size_t token_size)
-{
- size_t len;
-
- len = next_token(buf);
- if (len < token_size) {
- memcpy(token, *buf, len);
- *(token + len) = '\0';
- }
- *buf += len;
-
- return len;
-}
-
-/*
* Finds the next token in *buf, dynamically allocates a buffer big
* enough to hold a copy of it, and copies the token into the new
* buffer. The copy is guaranteed to be terminated with '\0'. Note
diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index 0f9a2c3c0e0d..1b06fc4640e2 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -26,13 +26,21 @@ config ARM_VEXPRESS_SPC_CPUFREQ
config ARM_EXYNOS_CPUFREQ
- bool
+ tristate "SAMSUNG EXYNOS CPUfreq Driver"
+ depends on CPU_EXYNOS4210 || SOC_EXYNOS4212 || SOC_EXYNOS4412 || SOC_EXYNOS5250
+ depends on THERMAL
+ help
+ This adds the CPUFreq driver for Samsung EXYNOS platforms.
+ Supported SoC versions are:
+ Exynos4210, Exynos4212, Exynos4412, and Exynos5250.
+
+ If in doubt, say N.
config ARM_EXYNOS4210_CPUFREQ
bool "SAMSUNG EXYNOS4210"
depends on CPU_EXYNOS4210
+ depends on ARM_EXYNOS_CPUFREQ
default y
- select ARM_EXYNOS_CPUFREQ
help
This adds the CPUFreq driver for Samsung EXYNOS4210
SoC (S5PV310 or S5PC210).
@@ -42,8 +50,8 @@ config ARM_EXYNOS4210_CPUFREQ
config ARM_EXYNOS4X12_CPUFREQ
bool "SAMSUNG EXYNOS4x12"
depends on SOC_EXYNOS4212 || SOC_EXYNOS4412
+ depends on ARM_EXYNOS_CPUFREQ
default y
- select ARM_EXYNOS_CPUFREQ
help
This adds the CPUFreq driver for Samsung EXYNOS4X12
SoC (EXYNOS4212 or EXYNOS4412).
@@ -53,28 +61,14 @@ config ARM_EXYNOS4X12_CPUFREQ
config ARM_EXYNOS5250_CPUFREQ
bool "SAMSUNG EXYNOS5250"
depends on SOC_EXYNOS5250
+ depends on ARM_EXYNOS_CPUFREQ
default y
- select ARM_EXYNOS_CPUFREQ
help
This adds the CPUFreq driver for Samsung EXYNOS5250
SoC.
If in doubt, say N.
-config ARM_EXYNOS5440_CPUFREQ
- bool "SAMSUNG EXYNOS5440"
- depends on SOC_EXYNOS5440
- depends on HAVE_CLK && OF
- select PM_OPP
- default y
- help
- This adds the CPUFreq driver for Samsung EXYNOS5440
- SoC. The nature of exynos5440 clock controller is
- different than previous exynos controllers so not using
- the common exynos framework.
-
- If in doubt, say N.
-
config ARM_EXYNOS_CPU_FREQ_BOOST_SW
bool "EXYNOS Frequency Overclocking - Software"
depends on ARM_EXYNOS_CPUFREQ && THERMAL
@@ -90,6 +84,20 @@ config ARM_EXYNOS_CPU_FREQ_BOOST_SW
If in doubt, say N.
+config ARM_EXYNOS5440_CPUFREQ
+ tristate "SAMSUNG EXYNOS5440"
+ depends on SOC_EXYNOS5440
+ depends on HAVE_CLK && OF
+ select PM_OPP
+ default y
+ help
+ This adds the CPUFreq driver for Samsung EXYNOS5440
+ SoC. The nature of exynos5440 clock controller is
+ different than previous exynos controllers so not using
+ the common exynos framework.
+
+ If in doubt, say N.
+
config ARM_HIGHBANK_CPUFREQ
tristate "Calxeda Highbank-based"
depends on ARCH_HIGHBANK && CPUFREQ_DT && REGULATOR
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index 8b4220ac888b..82a1821471fd 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -52,10 +52,11 @@ obj-$(CONFIG_ARM_DT_BL_CPUFREQ) += arm_big_little_dt.o
obj-$(CONFIG_ARCH_DAVINCI) += davinci-cpufreq.o
obj-$(CONFIG_UX500_SOC_DB8500) += dbx500-cpufreq.o
-obj-$(CONFIG_ARM_EXYNOS_CPUFREQ) += exynos-cpufreq.o
-obj-$(CONFIG_ARM_EXYNOS4210_CPUFREQ) += exynos4210-cpufreq.o
-obj-$(CONFIG_ARM_EXYNOS4X12_CPUFREQ) += exynos4x12-cpufreq.o
-obj-$(CONFIG_ARM_EXYNOS5250_CPUFREQ) += exynos5250-cpufreq.o
+obj-$(CONFIG_ARM_EXYNOS_CPUFREQ) += arm-exynos-cpufreq.o
+arm-exynos-cpufreq-y := exynos-cpufreq.o
+arm-exynos-cpufreq-$(CONFIG_ARM_EXYNOS4210_CPUFREQ) += exynos4210-cpufreq.o
+arm-exynos-cpufreq-$(CONFIG_ARM_EXYNOS4X12_CPUFREQ) += exynos4x12-cpufreq.o
+arm-exynos-cpufreq-$(CONFIG_ARM_EXYNOS5250_CPUFREQ) += exynos5250-cpufreq.o
obj-$(CONFIG_ARM_EXYNOS5440_CPUFREQ) += exynos5440-cpufreq.o
obj-$(CONFIG_ARM_HIGHBANK_CPUFREQ) += highbank-cpufreq.o
obj-$(CONFIG_ARM_IMX6Q_CPUFREQ) += imx6q-cpufreq.o
diff --git a/drivers/cpufreq/exynos-cpufreq.c b/drivers/cpufreq/exynos-cpufreq.c
index f99a0b0b7c06..5e98c6b1f284 100644
--- a/drivers/cpufreq/exynos-cpufreq.c
+++ b/drivers/cpufreq/exynos-cpufreq.c
@@ -18,10 +18,13 @@
#include <linux/cpufreq.h>
#include <linux/platform_device.h>
#include <linux/of.h>
+#include <linux/cpu_cooling.h>
+#include <linux/cpu.h>
#include "exynos-cpufreq.h"
static struct exynos_dvfs_info *exynos_info;
+static struct thermal_cooling_device *cdev;
static struct regulator *arm_regulator;
static unsigned int locking_frequency;
@@ -156,6 +159,7 @@ static struct cpufreq_driver exynos_driver = {
static int exynos_cpufreq_probe(struct platform_device *pdev)
{
+ struct device_node *cpus, *np;
int ret = -EINVAL;
exynos_info = kzalloc(sizeof(*exynos_info), GFP_KERNEL);
@@ -198,9 +202,36 @@ static int exynos_cpufreq_probe(struct platform_device *pdev)
/* Done here as we want to capture boot frequency */
locking_frequency = clk_get_rate(exynos_info->cpu_clk) / 1000;
- if (!cpufreq_register_driver(&exynos_driver))
+ ret = cpufreq_register_driver(&exynos_driver);
+ if (ret)
+ goto err_cpufreq_reg;
+
+ cpus = of_find_node_by_path("/cpus");
+ if (!cpus) {
+ pr_err("failed to find cpus node\n");
+ return 0;
+ }
+
+ np = of_get_next_child(cpus, NULL);
+ if (!np) {
+ pr_err("failed to find cpus child node\n");
+ of_node_put(cpus);
return 0;
+ }
+
+ if (of_find_property(np, "#cooling-cells", NULL)) {
+ cdev = of_cpufreq_cooling_register(np,
+ cpu_present_mask);
+ if (IS_ERR(cdev))
+ pr_err("running cpufreq without cooling device: %ld\n",
+ PTR_ERR(cdev));
+ }
+ of_node_put(np);
+ of_node_put(cpus);
+
+ return 0;
+err_cpufreq_reg:
dev_err(&pdev->dev, "failed to register cpufreq driver\n");
regulator_put(arm_regulator);
err_vdd_arm:
diff --git a/drivers/thermal/of-thermal.c b/drivers/thermal/of-thermal.c
index d717f3dab6f1..668fb1bdea9e 100644
--- a/drivers/thermal/of-thermal.c
+++ b/drivers/thermal/of-thermal.c
@@ -497,6 +497,9 @@ thermal_zone_of_sensor_register(struct device *dev, int sensor_id, void *data,
if (sensor_specs.np == sensor_np && id == sensor_id) {
tzd = thermal_zone_of_add_sensor(child, sensor_np,
data, ops);
+ if (!IS_ERR(tzd))
+ tzd->ops->set_mode(tzd, THERMAL_DEVICE_ENABLED);
+
of_node_put(sensor_specs.np);
of_node_put(child);
goto exit;
diff --git a/drivers/thermal/rockchip_thermal.c b/drivers/thermal/rockchip_thermal.c
index 9c6ce548e363..3aa46ac7cdbc 100644
--- a/drivers/thermal/rockchip_thermal.c
+++ b/drivers/thermal/rockchip_thermal.c
@@ -193,19 +193,20 @@ static u32 rk_tsadcv2_temp_to_code(long temp)
static long rk_tsadcv2_code_to_temp(u32 code)
{
- int high, low, mid;
-
- low = 0;
- high = ARRAY_SIZE(v2_code_table) - 1;
- mid = (high + low) / 2;
-
- if (code > v2_code_table[low].code || code < v2_code_table[high].code)
- return 125000; /* No code available, return max temperature */
+ unsigned int low = 0;
+ unsigned int high = ARRAY_SIZE(v2_code_table) - 1;
+ unsigned int mid = (low + high) / 2;
+ unsigned int num;
+ unsigned long denom;
+
+ /* Invalid code, return -EAGAIN */
+ if (code > TSADCV2_DATA_MASK)
+ return -EAGAIN;
- while (low <= high) {
- if (code >= v2_code_table[mid].code && code <
- v2_code_table[mid - 1].code)
- return v2_code_table[mid].temp;
+ while (low <= high && mid) {
+ if (code >= v2_code_table[mid].code &&
+ code < v2_code_table[mid - 1].code)
+ break;
else if (code < v2_code_table[mid].code)
low = mid + 1;
else
@@ -213,7 +214,16 @@ static long rk_tsadcv2_code_to_temp(u32 code)
mid = (low + high) / 2;
}
- return 125000;
+ /*
+ * The 5C granularity provided by the table is too coarse. Let's
+ * assume that the relationship between sensor readings and
+ * temperature between 2 table entries is linear and interpolate
+ * to produce a finer-grained result.
+ */
+ num = v2_code_table[mid].temp - v2_code_table[mid - 1].temp;
+ num *= v2_code_table[mid - 1].code - code;
+ denom = v2_code_table[mid - 1].code - v2_code_table[mid].code;
+ return v2_code_table[mid - 1].temp + (num / denom);
}
/**
diff --git a/drivers/thermal/samsung/Kconfig b/drivers/thermal/samsung/Kconfig
index c43306ecc0ab..c8e35c1a43dc 100644
--- a/drivers/thermal/samsung/Kconfig
+++ b/drivers/thermal/samsung/Kconfig
@@ -7,12 +7,3 @@ config EXYNOS_THERMAL
the TMU, reports temperature and handles cooling action if defined.
This driver uses the Exynos core thermal APIs and TMU configuration
data from the supported SoCs.
-
-config EXYNOS_THERMAL_CORE
- bool "Core thermal framework support for EXYNOS SOCs"
- depends on EXYNOS_THERMAL
- help
- If you say yes here you get support for EXYNOS TMU
- (Thermal Management Unit) common registration/unregistration
- functions to the core thermal layer and also to use the generic
- CPU cooling APIs.
diff --git a/drivers/thermal/samsung/Makefile b/drivers/thermal/samsung/Makefile
index c09d83095dc2..1e47d0d89ce0 100644
--- a/drivers/thermal/samsung/Makefile
+++ b/drivers/thermal/samsung/Makefile
@@ -3,5 +3,3 @@
#
obj-$(CONFIG_EXYNOS_THERMAL) += exynos_thermal.o
exynos_thermal-y := exynos_tmu.o
-exynos_thermal-y += exynos_tmu_data.o
-exynos_thermal-$(CONFIG_EXYNOS_THERMAL_CORE) += exynos_thermal_common.o
diff --git a/drivers/thermal/samsung/exynos_thermal_common.c b/drivers/thermal/samsung/exynos_thermal_common.c
deleted file mode 100644
index 6dc3815cc73f..000000000000
--- a/drivers/thermal/samsung/exynos_thermal_common.c
+++ /dev/null
@@ -1,427 +0,0 @@
-/*
- * exynos_thermal_common.c - Samsung EXYNOS common thermal file
- *
- * Copyright (C) 2013 Samsung Electronics
- * Amit Daniel Kachhap <amit.daniel@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- */
-
-#include <linux/cpu_cooling.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/thermal.h>
-
-#include "exynos_thermal_common.h"
-
-struct exynos_thermal_zone {
- enum thermal_device_mode mode;
- struct thermal_zone_device *therm_dev;
- struct thermal_cooling_device *cool_dev[MAX_COOLING_DEVICE];
- unsigned int cool_dev_size;
- struct platform_device *exynos4_dev;
- struct thermal_sensor_conf *sensor_conf;
- bool bind;
-};
-
-/* Get mode callback functions for thermal zone */
-static int exynos_get_mode(struct thermal_zone_device *thermal,
- enum thermal_device_mode *mode)
-{
- struct exynos_thermal_zone *th_zone = thermal->devdata;
- if (th_zone)
- *mode = th_zone->mode;
- return 0;
-}
-
-/* Set mode callback functions for thermal zone */
-static int exynos_set_mode(struct thermal_zone_device *thermal,
- enum thermal_device_mode mode)
-{
- struct exynos_thermal_zone *th_zone = thermal->devdata;
- if (!th_zone) {
- dev_err(&thermal->device,
- "thermal zone not registered\n");
- return 0;
- }
-
- mutex_lock(&thermal->lock);
-
- if (mode == THERMAL_DEVICE_ENABLED &&
- !th_zone->sensor_conf->trip_data.trigger_falling)
- thermal->polling_delay = IDLE_INTERVAL;
- else
- thermal->polling_delay = 0;
-
- mutex_unlock(&thermal->lock);
-
- th_zone->mode = mode;
- thermal_zone_device_update(thermal);
- dev_dbg(th_zone->sensor_conf->dev,
- "thermal polling set for duration=%d msec\n",
- thermal->polling_delay);
- return 0;
-}
-
-
-/* Get trip type callback functions for thermal zone */
-static int exynos_get_trip_type(struct thermal_zone_device *thermal, int trip,
- enum thermal_trip_type *type)
-{
- struct exynos_thermal_zone *th_zone = thermal->devdata;
- int max_trip = th_zone->sensor_conf->trip_data.trip_count;
- int trip_type;
-
- if (trip < 0 || trip >= max_trip)
- return -EINVAL;
-
- trip_type = th_zone->sensor_conf->trip_data.trip_type[trip];
-
- if (trip_type == SW_TRIP)
- *type = THERMAL_TRIP_CRITICAL;
- else if (trip_type == THROTTLE_ACTIVE)
- *type = THERMAL_TRIP_ACTIVE;
- else if (trip_type == THROTTLE_PASSIVE)
- *type = THERMAL_TRIP_PASSIVE;
- else
- return -EINVAL;
-
- return 0;
-}
-
-/* Get trip temperature callback functions for thermal zone */
-static int exynos_get_trip_temp(struct thermal_zone_device *thermal, int trip,
- unsigned long *temp)
-{
- struct exynos_thermal_zone *th_zone = thermal->devdata;
- int max_trip = th_zone->sensor_conf->trip_data.trip_count;
-
- if (trip < 0 || trip >= max_trip)
- return -EINVAL;
-
- *temp = th_zone->sensor_conf->trip_data.trip_val[trip];
- /* convert the temperature into millicelsius */
- *temp = *temp * MCELSIUS;
-
- return 0;
-}
-
-/* Get critical temperature callback functions for thermal zone */
-static int exynos_get_crit_temp(struct thermal_zone_device *thermal,
- unsigned long *temp)
-{
- struct exynos_thermal_zone *th_zone = thermal->devdata;
- int max_trip = th_zone->sensor_conf->trip_data.trip_count;
- /* Get the temp of highest trip*/
- return exynos_get_trip_temp(thermal, max_trip - 1, temp);
-}
-
-/* Bind callback functions for thermal zone */
-static int exynos_bind(struct thermal_zone_device *thermal,
- struct thermal_cooling_device *cdev)
-{
- int ret = 0, i, tab_size, level;
- struct freq_clip_table *tab_ptr, *clip_data;
- struct exynos_thermal_zone *th_zone = thermal->devdata;
- struct thermal_sensor_conf *data = th_zone->sensor_conf;
-
- tab_ptr = (struct freq_clip_table *)data->cooling_data.freq_data;
- tab_size = data->cooling_data.freq_clip_count;
-
- if (tab_ptr == NULL || tab_size == 0)
- return 0;
-
- /* find the cooling device registered*/
- for (i = 0; i < th_zone->cool_dev_size; i++)
- if (cdev == th_zone->cool_dev[i])
- break;
-
- /* No matching cooling device */
- if (i == th_zone->cool_dev_size)
- return 0;
-
- /* Bind the thermal zone to the cpufreq cooling device */
- for (i = 0; i < tab_size; i++) {
- clip_data = (struct freq_clip_table *)&(tab_ptr[i]);
- level = cpufreq_cooling_get_level(0, clip_data->freq_clip_max);
- if (level == THERMAL_CSTATE_INVALID)
- return 0;
- switch (GET_ZONE(i)) {
- case MONITOR_ZONE:
- case WARN_ZONE:
- if (thermal_zone_bind_cooling_device(thermal, i, cdev,
- level, 0)) {
- dev_err(data->dev,
- "error unbinding cdev inst=%d\n", i);
- ret = -EINVAL;
- }
- th_zone->bind = true;
- break;
- default:
- ret = -EINVAL;
- }
- }
-
- return ret;
-}
-
-/* Unbind callback functions for thermal zone */
-static int exynos_unbind(struct thermal_zone_device *thermal,
- struct thermal_cooling_device *cdev)
-{
- int ret = 0, i, tab_size;
- struct exynos_thermal_zone *th_zone = thermal->devdata;
- struct thermal_sensor_conf *data = th_zone->sensor_conf;
-
- if (th_zone->bind == false)
- return 0;
-
- tab_size = data->cooling_data.freq_clip_count;
-
- if (tab_size == 0)
- return 0;
-
- /* find the cooling device registered*/
- for (i = 0; i < th_zone->cool_dev_size; i++)
- if (cdev == th_zone->cool_dev[i])
- break;
-
- /* No matching cooling device */
- if (i == th_zone->cool_dev_size)
- return 0;
-
- /* Bind the thermal zone to the cpufreq cooling device */
- for (i = 0; i < tab_size; i++) {
- switch (GET_ZONE(i)) {
- case MONITOR_ZONE:
- case WARN_ZONE:
- if (thermal_zone_unbind_cooling_device(thermal, i,
- cdev)) {
- dev_err(data->dev,
- "error unbinding cdev inst=%d\n", i);
- ret = -EINVAL;
- }
- th_zone->bind = false;
- break;
- default:
- ret = -EINVAL;
- }
- }
- return ret;
-}
-
-/* Get temperature callback functions for thermal zone */
-static int exynos_get_temp(struct thermal_zone_device *thermal,
- unsigned long *temp)
-{
- struct exynos_thermal_zone *th_zone = thermal->devdata;
- void *data;
-
- if (!th_zone->sensor_conf) {
- dev_err(&thermal->device,
- "Temperature sensor not initialised\n");
- return -EINVAL;
- }
- data = th_zone->sensor_conf->driver_data;
- *temp = th_zone->sensor_conf->read_temperature(data);
- /* convert the temperature into millicelsius */
- *temp = *temp * MCELSIUS;
- return 0;
-}
-
-/* Get temperature callback functions for thermal zone */
-static int exynos_set_emul_temp(struct thermal_zone_device *thermal,
- unsigned long temp)
-{
- void *data;
- int ret = -EINVAL;
- struct exynos_thermal_zone *th_zone = thermal->devdata;
-
- if (!th_zone->sensor_conf) {
- dev_err(&thermal->device,
- "Temperature sensor not initialised\n");
- return -EINVAL;
- }
- data = th_zone->sensor_conf->driver_data;
- if (th_zone->sensor_conf->write_emul_temp)
- ret = th_zone->sensor_conf->write_emul_temp(data, temp);
- return ret;
-}
-
-/* Get the temperature trend */
-static int exynos_get_trend(struct thermal_zone_device *thermal,
- int trip, enum thermal_trend *trend)
-{
- int ret;
- unsigned long trip_temp;
-
- ret = exynos_get_trip_temp(thermal, trip, &trip_temp);
- if (ret < 0)
- return ret;
-
- if (thermal->temperature >= trip_temp)
- *trend = THERMAL_TREND_RAISE_FULL;
- else
- *trend = THERMAL_TREND_DROP_FULL;
-
- return 0;
-}
-/* Operation callback functions for thermal zone */
-static struct thermal_zone_device_ops exynos_dev_ops = {
- .bind = exynos_bind,
- .unbind = exynos_unbind,
- .get_temp = exynos_get_temp,
- .set_emul_temp = exynos_set_emul_temp,
- .get_trend = exynos_get_trend,
- .get_mode = exynos_get_mode,
- .set_mode = exynos_set_mode,
- .get_trip_type = exynos_get_trip_type,
- .get_trip_temp = exynos_get_trip_temp,
- .get_crit_temp = exynos_get_crit_temp,
-};
-
-/*
- * This function may be called from interrupt based temperature sensor
- * when threshold is changed.
- */
-void exynos_report_trigger(struct thermal_sensor_conf *conf)
-{
- unsigned int i;
- char data[10];
- char *envp[] = { data, NULL };
- struct exynos_thermal_zone *th_zone;
-
- if (!conf || !conf->pzone_data) {
- pr_err("Invalid temperature sensor configuration data\n");
- return;
- }
-
- th_zone = conf->pzone_data;
-
- if (th_zone->bind == false) {
- for (i = 0; i < th_zone->cool_dev_size; i++) {
- if (!th_zone->cool_dev[i])
- continue;
- exynos_bind(th_zone->therm_dev,
- th_zone->cool_dev[i]);
- }
- }
-
- thermal_zone_device_update(th_zone->therm_dev);
-
- mutex_lock(&th_zone->therm_dev->lock);
- /* Find the level for which trip happened */
- for (i = 0; i < th_zone->sensor_conf->trip_data.trip_count; i++) {
- if (th_zone->therm_dev->last_temperature <
- th_zone->sensor_conf->trip_data.trip_val[i] * MCELSIUS)
- break;
- }
-
- if (th_zone->mode == THERMAL_DEVICE_ENABLED &&
- !th_zone->sensor_conf->trip_data.trigger_falling) {
- if (i > 0)
- th_zone->therm_dev->polling_delay = ACTIVE_INTERVAL;
- else
- th_zone->therm_dev->polling_delay = IDLE_INTERVAL;
- }
-
- snprintf(data, sizeof(data), "%u", i);
- kobject_uevent_env(&th_zone->therm_dev->device.kobj, KOBJ_CHANGE, envp);
- mutex_unlock(&th_zone->therm_dev->lock);
-}
-
-/* Register with the in-kernel thermal management */
-int exynos_register_thermal(struct thermal_sensor_conf *sensor_conf)
-{
- int ret;
- struct exynos_thermal_zone *th_zone;
-
- if (!sensor_conf || !sensor_conf->read_temperature) {
- pr_err("Temperature sensor not initialised\n");
- return -EINVAL;
- }
-
- th_zone = devm_kzalloc(sensor_conf->dev,
- sizeof(struct exynos_thermal_zone), GFP_KERNEL);
- if (!th_zone)
- return -ENOMEM;
-
- th_zone->sensor_conf = sensor_conf;
- /*
- * TODO: 1) Handle multiple cooling devices in a thermal zone
- * 2) Add a flag/name in cooling info to map to specific
- * sensor
- */
- if (sensor_conf->cooling_data.freq_clip_count > 0) {
- th_zone->cool_dev[th_zone->cool_dev_size] =
- cpufreq_cooling_register(cpu_present_mask);
- if (IS_ERR(th_zone->cool_dev[th_zone->cool_dev_size])) {
- ret = PTR_ERR(th_zone->cool_dev[th_zone->cool_dev_size]);
- if (ret != -EPROBE_DEFER)
- dev_err(sensor_conf->dev,
- "Failed to register cpufreq cooling device: %d\n",
- ret);
- goto err_unregister;
- }
- th_zone->cool_dev_size++;
- }
-
- th_zone->therm_dev = thermal_zone_device_register(
- sensor_conf->name, sensor_conf->trip_data.trip_count,
- 0, th_zone, &exynos_dev_ops, NULL, 0,
- sensor_conf->trip_data.trigger_falling ? 0 :
- IDLE_INTERVAL);
-
- if (IS_ERR(th_zone->therm_dev)) {
- dev_err(sensor_conf->dev,
- "Failed to register thermal zone device\n");
- ret = PTR_ERR(th_zone->therm_dev);
- goto err_unregister;
- }
- th_zone->mode = THERMAL_DEVICE_ENABLED;
- sensor_conf->pzone_data = th_zone;
-
- dev_info(sensor_conf->dev,
- "Exynos: Thermal zone(%s) registered\n", sensor_conf->name);
-
- return 0;
-
-err_unregister:
- exynos_unregister_thermal(sensor_conf);
- return ret;
-}
-
-/* Un-Register with the in-kernel thermal management */
-void exynos_unregister_thermal(struct thermal_sensor_conf *sensor_conf)
-{
- int i;
- struct exynos_thermal_zone *th_zone;
-
- if (!sensor_conf || !sensor_conf->pzone_data) {
- pr_err("Invalid temperature sensor configuration data\n");
- return;
- }
-
- th_zone = sensor_conf->pzone_data;
-
- thermal_zone_device_unregister(th_zone->therm_dev);
-
- for (i = 0; i < th_zone->cool_dev_size; ++i)
- cpufreq_cooling_unregister(th_zone->cool_dev[i]);
-
- dev_info(sensor_conf->dev,
- "Exynos: Kernel Thermal management unregistered\n");
-}
diff --git a/drivers/thermal/samsung/exynos_thermal_common.h b/drivers/thermal/samsung/exynos_thermal_common.h
deleted file mode 100644
index cd4471925cdd..000000000000
--- a/drivers/thermal/samsung/exynos_thermal_common.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * exynos_thermal_common.h - Samsung EXYNOS common header file
- *
- * Copyright (C) 2013 Samsung Electronics
- * Amit Daniel Kachhap <amit.daniel@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- */
-
-#ifndef _EXYNOS_THERMAL_COMMON_H
-#define _EXYNOS_THERMAL_COMMON_H
-
-/* In-kernel thermal framework related macros & definations */
-#define SENSOR_NAME_LEN 16
-#define MAX_TRIP_COUNT 8
-#define MAX_COOLING_DEVICE 4
-
-#define ACTIVE_INTERVAL 500
-#define IDLE_INTERVAL 10000
-#define MCELSIUS 1000
-
-/* CPU Zone information */
-#define PANIC_ZONE 4
-#define WARN_ZONE 3
-#define MONITOR_ZONE 2
-#define SAFE_ZONE 1
-
-#define GET_ZONE(trip) (trip + 2)
-#define GET_TRIP(zone) (zone - 2)
-
-enum trigger_type {
- THROTTLE_ACTIVE = 1,
- THROTTLE_PASSIVE,
- SW_TRIP,
- HW_TRIP,
-};
-
-/**
- * struct freq_clip_table
- * @freq_clip_max: maximum frequency allowed for this cooling state.
- * @temp_level: Temperature level at which the temperature clipping will
- * happen.
- * @mask_val: cpumask of the allowed cpu's where the clipping will take place.
- *
- * This structure is required to be filled and passed to the
- * cpufreq_cooling_unregister function.
- */
-struct freq_clip_table {
- unsigned int freq_clip_max;
- unsigned int temp_level;
- const struct cpumask *mask_val;
-};
-
-struct thermal_trip_point_conf {
- int trip_val[MAX_TRIP_COUNT];
- int trip_type[MAX_TRIP_COUNT];
- int trip_count;
- unsigned char trigger_falling;
-};
-
-struct thermal_cooling_conf {
- struct freq_clip_table freq_data[MAX_TRIP_COUNT];
- int freq_clip_count;
-};
-
-struct thermal_sensor_conf {
- char name[SENSOR_NAME_LEN];
- int (*read_temperature)(void *data);
- int (*write_emul_temp)(void *drv_data, unsigned long temp);
- struct thermal_trip_point_conf trip_data;
- struct thermal_cooling_conf cooling_data;
- void *driver_data;
- void *pzone_data;
- struct device *dev;
-};
-
-/*Functions used exynos based thermal sensor driver*/
-#ifdef CONFIG_EXYNOS_THERMAL_CORE
-void exynos_unregister_thermal(struct thermal_sensor_conf *sensor_conf);
-int exynos_register_thermal(struct thermal_sensor_conf *sensor_conf);
-void exynos_report_trigger(struct thermal_sensor_conf *sensor_conf);
-#else
-static inline void
-exynos_unregister_thermal(struct thermal_sensor_conf *sensor_conf) { return; }
-
-static inline int
-exynos_register_thermal(struct thermal_sensor_conf *sensor_conf) { return 0; }
-
-static inline void
-exynos_report_trigger(struct thermal_sensor_conf *sensor_conf) { return; }
-
-#endif /* CONFIG_EXYNOS_THERMAL_CORE */
-#endif /* _EXYNOS_THERMAL_COMMON_H */
diff --git a/drivers/thermal/samsung/exynos_tmu.c b/drivers/thermal/samsung/exynos_tmu.c
index d2f1e62a4232..fbeedc072cc2 100644
--- a/drivers/thermal/samsung/exynos_tmu.c
+++ b/drivers/thermal/samsung/exynos_tmu.c
@@ -1,6 +1,10 @@
/*
* exynos_tmu.c - Samsung EXYNOS TMU (Thermal Management Unit)
*
+ * Copyright (C) 2014 Samsung Electronics
+ * Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
+ * Lukasz Majewski <l.majewski@samsung.com>
+ *
* Copyright (C) 2011 Samsung Electronics
* Donggeun Kim <dg77.kim@samsung.com>
* Amit Daniel Kachhap <amit.kachhap@linaro.org>
@@ -31,8 +35,8 @@
#include <linux/platform_device.h>
#include <linux/regulator/consumer.h>
-#include "exynos_thermal_common.h"
#include "exynos_tmu.h"
+#include "../thermal_core.h"
/* Exynos generic registers */
#define EXYNOS_TMU_REG_TRIMINFO 0x0
@@ -115,6 +119,27 @@
#define EXYNOS5440_TMU_TH_RISE4_SHIFT 24
#define EXYNOS5440_EFUSE_SWAP_OFFSET 8
+/* Exynos7 specific registers */
+#define EXYNOS7_THD_TEMP_RISE7_6 0x50
+#define EXYNOS7_THD_TEMP_FALL7_6 0x60
+#define EXYNOS7_TMU_REG_INTEN 0x110
+#define EXYNOS7_TMU_REG_INTPEND 0x118
+#define EXYNOS7_TMU_REG_EMUL_CON 0x160
+
+#define EXYNOS7_TMU_TEMP_MASK 0x1ff
+#define EXYNOS7_PD_DET_EN_SHIFT 23
+#define EXYNOS7_TMU_INTEN_RISE0_SHIFT 0
+#define EXYNOS7_TMU_INTEN_RISE1_SHIFT 1
+#define EXYNOS7_TMU_INTEN_RISE2_SHIFT 2
+#define EXYNOS7_TMU_INTEN_RISE3_SHIFT 3
+#define EXYNOS7_TMU_INTEN_RISE4_SHIFT 4
+#define EXYNOS7_TMU_INTEN_RISE5_SHIFT 5
+#define EXYNOS7_TMU_INTEN_RISE6_SHIFT 6
+#define EXYNOS7_TMU_INTEN_RISE7_SHIFT 7
+#define EXYNOS7_EMUL_DATA_SHIFT 7
+#define EXYNOS7_EMUL_DATA_MASK 0x1ff
+
+#define MCELSIUS 1000
/**
* struct exynos_tmu_data : A structure to hold the private data of the TMU
driver
@@ -128,6 +153,7 @@
* @lock: lock to implement synchronization.
* @clk: pointer to the clock structure.
* @clk_sec: pointer to the clock structure for accessing the base_second.
+ * @sclk: pointer to the clock structure for accessing the tmu special clk.
* @temp_error1: fused value of the first point trim.
* @temp_error2: fused value of the second point trim.
* @regulator: pointer to the TMU regulator structure.
@@ -147,10 +173,11 @@ struct exynos_tmu_data {
enum soc_type soc;
struct work_struct irq_work;
struct mutex lock;
- struct clk *clk, *clk_sec;
- u8 temp_error1, temp_error2;
+ struct clk *clk, *clk_sec, *sclk;
+ u16 temp_error1, temp_error2;
struct regulator *regulator;
- struct thermal_sensor_conf *reg_conf;
+ struct thermal_zone_device *tzd;
+
int (*tmu_initialize)(struct platform_device *pdev);
void (*tmu_control)(struct platform_device *pdev, bool on);
int (*tmu_read)(struct exynos_tmu_data *data);
@@ -159,6 +186,33 @@ struct exynos_tmu_data {
void (*tmu_clear_irqs)(struct exynos_tmu_data *data);
};
+static void exynos_report_trigger(struct exynos_tmu_data *p)
+{
+ char data[10], *envp[] = { data, NULL };
+ struct thermal_zone_device *tz = p->tzd;
+ unsigned long temp;
+ unsigned int i;
+
+ if (!tz) {
+ pr_err("No thermal zone device defined\n");
+ return;
+ }
+
+ thermal_zone_device_update(tz);
+
+ mutex_lock(&tz->lock);
+ /* Find the level for which trip happened */
+ for (i = 0; i < of_thermal_get_ntrips(tz); i++) {
+ tz->ops->get_trip_temp(tz, i, &temp);
+ if (tz->last_temperature < temp)
+ break;
+ }
+
+ snprintf(data, sizeof(data), "%u", i);
+ kobject_uevent_env(&tz->device.kobj, KOBJ_CHANGE, envp);
+ mutex_unlock(&tz->lock);
+}
+
/*
* TMU treats temperature as a mapped temperature code.
* The temperature is converted differently depending on the calibration type.
@@ -190,7 +244,7 @@ static int temp_to_code(struct exynos_tmu_data *data, u8 temp)
* Calculate a temperature value from a temperature code.
* The unit of the temperature is degree Celsius.
*/
-static int code_to_temp(struct exynos_tmu_data *data, u8 temp_code)
+static int code_to_temp(struct exynos_tmu_data *data, u16 temp_code)
{
struct exynos_tmu_platform_data *pdata = data->pdata;
int temp;
@@ -234,14 +288,25 @@ static void sanitize_temp_error(struct exynos_tmu_data *data, u32 trim_info)
static u32 get_th_reg(struct exynos_tmu_data *data, u32 threshold, bool falling)
{
- struct exynos_tmu_platform_data *pdata = data->pdata;
+ struct thermal_zone_device *tz = data->tzd;
+ const struct thermal_trip * const trips =
+ of_thermal_get_trip_points(tz);
+ unsigned long temp;
int i;
- for (i = 0; i < pdata->non_hw_trigger_levels; i++) {
- u8 temp = pdata->trigger_levels[i];
+ if (!trips) {
+ pr_err("%s: Cannot get trip points from of-thermal.c!\n",
+ __func__);
+ return 0;
+ }
+
+ for (i = 0; i < of_thermal_get_ntrips(tz); i++) {
+ if (trips[i].type == THERMAL_TRIP_CRITICAL)
+ continue;
+ temp = trips[i].temperature / MCELSIUS;
if (falling)
- temp -= pdata->threshold_falling;
+ temp -= (trips[i].hysteresis / MCELSIUS);
else
threshold &= ~(0xff << 8 * i);
@@ -305,9 +370,19 @@ static void exynos_tmu_control(struct platform_device *pdev, bool on)
static int exynos4210_tmu_initialize(struct platform_device *pdev)
{
struct exynos_tmu_data *data = platform_get_drvdata(pdev);
- struct exynos_tmu_platform_data *pdata = data->pdata;
- unsigned int status;
+ struct thermal_zone_device *tz = data->tzd;
+ const struct thermal_trip * const trips =
+ of_thermal_get_trip_points(tz);
int ret = 0, threshold_code, i;
+ unsigned long reference, temp;
+ unsigned int status;
+
+ if (!trips) {
+ pr_err("%s: Cannot get trip points from of-thermal.c!\n",
+ __func__);
+ ret = -ENODEV;
+ goto out;
+ }
status = readb(data->base + EXYNOS_TMU_REG_STATUS);
if (!status) {
@@ -318,12 +393,19 @@ static int exynos4210_tmu_initialize(struct platform_device *pdev)
sanitize_temp_error(data, readl(data->base + EXYNOS_TMU_REG_TRIMINFO));
/* Write temperature code for threshold */
- threshold_code = temp_to_code(data, pdata->threshold);
+ reference = trips[0].temperature / MCELSIUS;
+ threshold_code = temp_to_code(data, reference);
+ if (threshold_code < 0) {
+ ret = threshold_code;
+ goto out;
+ }
writeb(threshold_code, data->base + EXYNOS4210_TMU_REG_THRESHOLD_TEMP);
- for (i = 0; i < pdata->non_hw_trigger_levels; i++)
- writeb(pdata->trigger_levels[i], data->base +
+ for (i = 0; i < of_thermal_get_ntrips(tz); i++) {
+ temp = trips[i].temperature / MCELSIUS;
+ writeb(temp - reference, data->base +
EXYNOS4210_TMU_REG_TRIG_LEVEL0 + i * 4);
+ }
data->tmu_clear_irqs(data);
out:
@@ -333,9 +415,11 @@ out:
static int exynos4412_tmu_initialize(struct platform_device *pdev)
{
struct exynos_tmu_data *data = platform_get_drvdata(pdev);
- struct exynos_tmu_platform_data *pdata = data->pdata;
+ const struct thermal_trip * const trips =
+ of_thermal_get_trip_points(data->tzd);
unsigned int status, trim_info, con, ctrl, rising_threshold;
int ret = 0, threshold_code, i;
+ unsigned long crit_temp = 0;
status = readb(data->base + EXYNOS_TMU_REG_STATUS);
if (!status) {
@@ -373,17 +457,29 @@ static int exynos4412_tmu_initialize(struct platform_device *pdev)
data->tmu_clear_irqs(data);
/* if last threshold limit is also present */
- i = pdata->max_trigger_level - 1;
- if (pdata->trigger_levels[i] && pdata->trigger_type[i] == HW_TRIP) {
- threshold_code = temp_to_code(data, pdata->trigger_levels[i]);
- /* 1-4 level to be assigned in th0 reg */
- rising_threshold &= ~(0xff << 8 * i);
- rising_threshold |= threshold_code << 8 * i;
- writel(rising_threshold, data->base + EXYNOS_THD_TEMP_RISE);
- con = readl(data->base + EXYNOS_TMU_REG_CONTROL);
- con |= (1 << EXYNOS_TMU_THERM_TRIP_EN_SHIFT);
- writel(con, data->base + EXYNOS_TMU_REG_CONTROL);
+ for (i = 0; i < of_thermal_get_ntrips(data->tzd); i++) {
+ if (trips[i].type == THERMAL_TRIP_CRITICAL) {
+ crit_temp = trips[i].temperature;
+ break;
+ }
}
+
+ if (i == of_thermal_get_ntrips(data->tzd)) {
+ pr_err("%s: No CRITICAL trip point defined at of-thermal.c!\n",
+ __func__);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ threshold_code = temp_to_code(data, crit_temp / MCELSIUS);
+ /* 1-4 level to be assigned in th0 reg */
+ rising_threshold &= ~(0xff << 8 * i);
+ rising_threshold |= threshold_code << 8 * i;
+ writel(rising_threshold, data->base + EXYNOS_THD_TEMP_RISE);
+ con = readl(data->base + EXYNOS_TMU_REG_CONTROL);
+ con |= (1 << EXYNOS_TMU_THERM_TRIP_EN_SHIFT);
+ writel(con, data->base + EXYNOS_TMU_REG_CONTROL);
+
out:
return ret;
}
@@ -391,9 +487,9 @@ out:
static int exynos5440_tmu_initialize(struct platform_device *pdev)
{
struct exynos_tmu_data *data = platform_get_drvdata(pdev);
- struct exynos_tmu_platform_data *pdata = data->pdata;
unsigned int trim_info = 0, con, rising_threshold;
- int ret = 0, threshold_code, i;
+ int ret = 0, threshold_code;
+ unsigned long crit_temp = 0;
/*
* For exynos5440 soc triminfo value is swapped between TMU0 and
@@ -422,9 +518,8 @@ static int exynos5440_tmu_initialize(struct platform_device *pdev)
data->tmu_clear_irqs(data);
/* if last threshold limit is also present */
- i = pdata->max_trigger_level - 1;
- if (pdata->trigger_levels[i] && pdata->trigger_type[i] == HW_TRIP) {
- threshold_code = temp_to_code(data, pdata->trigger_levels[i]);
+ if (!data->tzd->ops->get_crit_temp(data->tzd, &crit_temp)) {
+ threshold_code = temp_to_code(data, crit_temp / MCELSIUS);
/* 5th level to be assigned in th2 reg */
rising_threshold =
threshold_code << EXYNOS5440_TMU_TH_RISE4_SHIFT;
@@ -439,10 +534,88 @@ static int exynos5440_tmu_initialize(struct platform_device *pdev)
return ret;
}
-static void exynos4210_tmu_control(struct platform_device *pdev, bool on)
+static int exynos7_tmu_initialize(struct platform_device *pdev)
{
struct exynos_tmu_data *data = platform_get_drvdata(pdev);
+ struct thermal_zone_device *tz = data->tzd;
struct exynos_tmu_platform_data *pdata = data->pdata;
+ unsigned int status, trim_info;
+ unsigned int rising_threshold = 0, falling_threshold = 0;
+ int ret = 0, threshold_code, i;
+ unsigned long temp, temp_hist;
+ unsigned int reg_off, bit_off;
+
+ status = readb(data->base + EXYNOS_TMU_REG_STATUS);
+ if (!status) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ trim_info = readl(data->base + EXYNOS_TMU_REG_TRIMINFO);
+
+ data->temp_error1 = trim_info & EXYNOS7_TMU_TEMP_MASK;
+ if (!data->temp_error1 ||
+ (pdata->min_efuse_value > data->temp_error1) ||
+ (data->temp_error1 > pdata->max_efuse_value))
+ data->temp_error1 = pdata->efuse_value & EXYNOS_TMU_TEMP_MASK;
+
+ /* Write temperature code for rising and falling threshold */
+ for (i = (of_thermal_get_ntrips(tz) - 1); i >= 0; i--) {
+ /*
+ * On exynos7 there are 4 rising and 4 falling threshold
+ * registers (0x50-0x5c and 0x60-0x6c respectively). Each
+ * register holds the value of two threshold levels (at bit
+ * offsets 0 and 16). Based on the fact that there are atmost
+ * eight possible trigger levels, calculate the register and
+ * bit offsets where the threshold levels are to be written.
+ *
+ * e.g. EXYNOS7_THD_TEMP_RISE7_6 (0x50)
+ * [24:16] - Threshold level 7
+ * [8:0] - Threshold level 6
+ * e.g. EXYNOS7_THD_TEMP_RISE5_4 (0x54)
+ * [24:16] - Threshold level 5
+ * [8:0] - Threshold level 4
+ *
+ * and similarly for falling thresholds.
+ *
+ * Based on the above, calculate the register and bit offsets
+ * for rising/falling threshold levels and populate them.
+ */
+ reg_off = ((7 - i) / 2) * 4;
+ bit_off = ((8 - i) % 2);
+
+ tz->ops->get_trip_temp(tz, i, &temp);
+ temp /= MCELSIUS;
+
+ tz->ops->get_trip_hyst(tz, i, &temp_hist);
+ temp_hist = temp - (temp_hist / MCELSIUS);
+
+ /* Set 9-bit temperature code for rising threshold levels */
+ threshold_code = temp_to_code(data, temp);
+ rising_threshold = readl(data->base +
+ EXYNOS7_THD_TEMP_RISE7_6 + reg_off);
+ rising_threshold &= ~(EXYNOS7_TMU_TEMP_MASK << (16 * bit_off));
+ rising_threshold |= threshold_code << (16 * bit_off);
+ writel(rising_threshold,
+ data->base + EXYNOS7_THD_TEMP_RISE7_6 + reg_off);
+
+ /* Set 9-bit temperature code for falling threshold levels */
+ threshold_code = temp_to_code(data, temp_hist);
+ falling_threshold &= ~(EXYNOS7_TMU_TEMP_MASK << (16 * bit_off));
+ falling_threshold |= threshold_code << (16 * bit_off);
+ writel(falling_threshold,
+ data->base + EXYNOS7_THD_TEMP_FALL7_6 + reg_off);
+ }
+
+ data->tmu_clear_irqs(data);
+out:
+ return ret;
+}
+
+static void exynos4210_tmu_control(struct platform_device *pdev, bool on)
+{
+ struct exynos_tmu_data *data = platform_get_drvdata(pdev);
+ struct thermal_zone_device *tz = data->tzd;
unsigned int con, interrupt_en;
con = get_con_reg(data, readl(data->base + EXYNOS_TMU_REG_CONTROL));
@@ -450,10 +623,15 @@ static void exynos4210_tmu_control(struct platform_device *pdev, bool on)
if (on) {
con |= (1 << EXYNOS_TMU_CORE_EN_SHIFT);
interrupt_en =
- pdata->trigger_enable[3] << EXYNOS_TMU_INTEN_RISE3_SHIFT |
- pdata->trigger_enable[2] << EXYNOS_TMU_INTEN_RISE2_SHIFT |
- pdata->trigger_enable[1] << EXYNOS_TMU_INTEN_RISE1_SHIFT |
- pdata->trigger_enable[0] << EXYNOS_TMU_INTEN_RISE0_SHIFT;
+ (of_thermal_is_trip_valid(tz, 3)
+ << EXYNOS_TMU_INTEN_RISE3_SHIFT) |
+ (of_thermal_is_trip_valid(tz, 2)
+ << EXYNOS_TMU_INTEN_RISE2_SHIFT) |
+ (of_thermal_is_trip_valid(tz, 1)
+ << EXYNOS_TMU_INTEN_RISE1_SHIFT) |
+ (of_thermal_is_trip_valid(tz, 0)
+ << EXYNOS_TMU_INTEN_RISE0_SHIFT);
+
if (data->soc != SOC_ARCH_EXYNOS4210)
interrupt_en |=
interrupt_en << EXYNOS_TMU_INTEN_FALL0_SHIFT;
@@ -468,7 +646,7 @@ static void exynos4210_tmu_control(struct platform_device *pdev, bool on)
static void exynos5440_tmu_control(struct platform_device *pdev, bool on)
{
struct exynos_tmu_data *data = platform_get_drvdata(pdev);
- struct exynos_tmu_platform_data *pdata = data->pdata;
+ struct thermal_zone_device *tz = data->tzd;
unsigned int con, interrupt_en;
con = get_con_reg(data, readl(data->base + EXYNOS5440_TMU_S0_7_CTRL));
@@ -476,11 +654,16 @@ static void exynos5440_tmu_control(struct platform_device *pdev, bool on)
if (on) {
con |= (1 << EXYNOS_TMU_CORE_EN_SHIFT);
interrupt_en =
- pdata->trigger_enable[3] << EXYNOS5440_TMU_INTEN_RISE3_SHIFT |
- pdata->trigger_enable[2] << EXYNOS5440_TMU_INTEN_RISE2_SHIFT |
- pdata->trigger_enable[1] << EXYNOS5440_TMU_INTEN_RISE1_SHIFT |
- pdata->trigger_enable[0] << EXYNOS5440_TMU_INTEN_RISE0_SHIFT;
- interrupt_en |= interrupt_en << EXYNOS5440_TMU_INTEN_FALL0_SHIFT;
+ (of_thermal_is_trip_valid(tz, 3)
+ << EXYNOS5440_TMU_INTEN_RISE3_SHIFT) |
+ (of_thermal_is_trip_valid(tz, 2)
+ << EXYNOS5440_TMU_INTEN_RISE2_SHIFT) |
+ (of_thermal_is_trip_valid(tz, 1)
+ << EXYNOS5440_TMU_INTEN_RISE1_SHIFT) |
+ (of_thermal_is_trip_valid(tz, 0)
+ << EXYNOS5440_TMU_INTEN_RISE0_SHIFT);
+ interrupt_en |=
+ interrupt_en << EXYNOS5440_TMU_INTEN_FALL0_SHIFT;
} else {
con &= ~(1 << EXYNOS_TMU_CORE_EN_SHIFT);
interrupt_en = 0; /* Disable all interrupts */
@@ -489,19 +672,62 @@ static void exynos5440_tmu_control(struct platform_device *pdev, bool on)
writel(con, data->base + EXYNOS5440_TMU_S0_7_CTRL);
}
-static int exynos_tmu_read(struct exynos_tmu_data *data)
+static void exynos7_tmu_control(struct platform_device *pdev, bool on)
{
- int ret;
+ struct exynos_tmu_data *data = platform_get_drvdata(pdev);
+ struct thermal_zone_device *tz = data->tzd;
+ unsigned int con, interrupt_en;
+
+ con = get_con_reg(data, readl(data->base + EXYNOS_TMU_REG_CONTROL));
+
+ if (on) {
+ con |= (1 << EXYNOS_TMU_CORE_EN_SHIFT);
+ interrupt_en =
+ (of_thermal_is_trip_valid(tz, 7)
+ << EXYNOS7_TMU_INTEN_RISE7_SHIFT) |
+ (of_thermal_is_trip_valid(tz, 6)
+ << EXYNOS7_TMU_INTEN_RISE6_SHIFT) |
+ (of_thermal_is_trip_valid(tz, 5)
+ << EXYNOS7_TMU_INTEN_RISE5_SHIFT) |
+ (of_thermal_is_trip_valid(tz, 4)
+ << EXYNOS7_TMU_INTEN_RISE4_SHIFT) |
+ (of_thermal_is_trip_valid(tz, 3)
+ << EXYNOS7_TMU_INTEN_RISE3_SHIFT) |
+ (of_thermal_is_trip_valid(tz, 2)
+ << EXYNOS7_TMU_INTEN_RISE2_SHIFT) |
+ (of_thermal_is_trip_valid(tz, 1)
+ << EXYNOS7_TMU_INTEN_RISE1_SHIFT) |
+ (of_thermal_is_trip_valid(tz, 0)
+ << EXYNOS7_TMU_INTEN_RISE0_SHIFT);
+
+ interrupt_en |=
+ interrupt_en << EXYNOS_TMU_INTEN_FALL0_SHIFT;
+ } else {
+ con &= ~(1 << EXYNOS_TMU_CORE_EN_SHIFT);
+ interrupt_en = 0; /* Disable all interrupts */
+ }
+ con |= 1 << EXYNOS7_PD_DET_EN_SHIFT;
+
+ writel(interrupt_en, data->base + EXYNOS7_TMU_REG_INTEN);
+ writel(con, data->base + EXYNOS_TMU_REG_CONTROL);
+}
+
+static int exynos_get_temp(void *p, long *temp)
+{
+ struct exynos_tmu_data *data = p;
+
+ if (!data)
+ return -EINVAL;
mutex_lock(&data->lock);
clk_enable(data->clk);
- ret = data->tmu_read(data);
- if (ret >= 0)
- ret = code_to_temp(data, ret);
+
+ *temp = code_to_temp(data, data->tmu_read(data)) * MCELSIUS;
+
clk_disable(data->clk);
mutex_unlock(&data->lock);
- return ret;
+ return 0;
}
#ifdef CONFIG_THERMAL_EMULATION
@@ -515,9 +741,19 @@ static u32 get_emul_con_reg(struct exynos_tmu_data *data, unsigned int val,
val &= ~(EXYNOS_EMUL_TIME_MASK << EXYNOS_EMUL_TIME_SHIFT);
val |= (EXYNOS_EMUL_TIME << EXYNOS_EMUL_TIME_SHIFT);
}
- val &= ~(EXYNOS_EMUL_DATA_MASK << EXYNOS_EMUL_DATA_SHIFT);
- val |= (temp_to_code(data, temp) << EXYNOS_EMUL_DATA_SHIFT) |
- EXYNOS_EMUL_ENABLE;
+ if (data->soc == SOC_ARCH_EXYNOS7) {
+ val &= ~(EXYNOS7_EMUL_DATA_MASK <<
+ EXYNOS7_EMUL_DATA_SHIFT);
+ val |= (temp_to_code(data, temp) <<
+ EXYNOS7_EMUL_DATA_SHIFT) |
+ EXYNOS_EMUL_ENABLE;
+ } else {
+ val &= ~(EXYNOS_EMUL_DATA_MASK <<
+ EXYNOS_EMUL_DATA_SHIFT);
+ val |= (temp_to_code(data, temp) <<
+ EXYNOS_EMUL_DATA_SHIFT) |
+ EXYNOS_EMUL_ENABLE;
+ }
} else {
val &= ~EXYNOS_EMUL_ENABLE;
}
@@ -533,6 +769,8 @@ static void exynos4412_tmu_set_emulation(struct exynos_tmu_data *data,
if (data->soc == SOC_ARCH_EXYNOS5260)
emul_con = EXYNOS5260_EMUL_CON;
+ else if (data->soc == SOC_ARCH_EXYNOS7)
+ emul_con = EXYNOS7_TMU_REG_EMUL_CON;
else
emul_con = EXYNOS_EMUL_CON;
@@ -576,7 +814,7 @@ out:
#define exynos5440_tmu_set_emulation NULL
static int exynos_tmu_set_emulation(void *drv_data, unsigned long temp)
{ return -EINVAL; }
-#endif/*CONFIG_THERMAL_EMULATION*/
+#endif /* CONFIG_THERMAL_EMULATION */
static int exynos4210_tmu_read(struct exynos_tmu_data *data)
{
@@ -596,6 +834,12 @@ static int exynos5440_tmu_read(struct exynos_tmu_data *data)
return readb(data->base + EXYNOS5440_TMU_S0_7_TEMP);
}
+static int exynos7_tmu_read(struct exynos_tmu_data *data)
+{
+ return readw(data->base + EXYNOS_TMU_REG_CURRENT_TEMP) &
+ EXYNOS7_TMU_TEMP_MASK;
+}
+
static void exynos_tmu_work(struct work_struct *work)
{
struct exynos_tmu_data *data = container_of(work,
@@ -613,7 +857,7 @@ static void exynos_tmu_work(struct work_struct *work)
if (!IS_ERR(data->clk_sec))
clk_disable(data->clk_sec);
- exynos_report_trigger(data->reg_conf);
+ exynos_report_trigger(data);
mutex_lock(&data->lock);
clk_enable(data->clk);
@@ -634,6 +878,9 @@ static void exynos4210_tmu_clear_irqs(struct exynos_tmu_data *data)
if (data->soc == SOC_ARCH_EXYNOS5260) {
tmu_intstat = EXYNOS5260_TMU_REG_INTSTAT;
tmu_intclear = EXYNOS5260_TMU_REG_INTCLEAR;
+ } else if (data->soc == SOC_ARCH_EXYNOS7) {
+ tmu_intstat = EXYNOS7_TMU_REG_INTPEND;
+ tmu_intclear = EXYNOS7_TMU_REG_INTPEND;
} else {
tmu_intstat = EXYNOS_TMU_REG_INTSTAT;
tmu_intclear = EXYNOS_TMU_REG_INTCLEAR;
@@ -673,55 +920,94 @@ static irqreturn_t exynos_tmu_irq(int irq, void *id)
static const struct of_device_id exynos_tmu_match[] = {
{
.compatible = "samsung,exynos3250-tmu",
- .data = &exynos3250_default_tmu_data,
},
{
.compatible = "samsung,exynos4210-tmu",
- .data = &exynos4210_default_tmu_data,
},
{
.compatible = "samsung,exynos4412-tmu",
- .data = &exynos4412_default_tmu_data,
},
{
.compatible = "samsung,exynos5250-tmu",
- .data = &exynos5250_default_tmu_data,
},
{
.compatible = "samsung,exynos5260-tmu",
- .data = &exynos5260_default_tmu_data,
},
{
.compatible = "samsung,exynos5420-tmu",
- .data = &exynos5420_default_tmu_data,
},
{
.compatible = "samsung,exynos5420-tmu-ext-triminfo",
- .data = &exynos5420_default_tmu_data,
},
{
.compatible = "samsung,exynos5440-tmu",
- .data = &exynos5440_default_tmu_data,
+ },
+ {
+ .compatible = "samsung,exynos7-tmu",
},
{},
};
MODULE_DEVICE_TABLE(of, exynos_tmu_match);
-static inline struct exynos_tmu_platform_data *exynos_get_driver_data(
- struct platform_device *pdev, int id)
+static int exynos_of_get_soc_type(struct device_node *np)
+{
+ if (of_device_is_compatible(np, "samsung,exynos3250-tmu"))
+ return SOC_ARCH_EXYNOS3250;
+ else if (of_device_is_compatible(np, "samsung,exynos4210-tmu"))
+ return SOC_ARCH_EXYNOS4210;
+ else if (of_device_is_compatible(np, "samsung,exynos4412-tmu"))
+ return SOC_ARCH_EXYNOS4412;
+ else if (of_device_is_compatible(np, "samsung,exynos5250-tmu"))
+ return SOC_ARCH_EXYNOS5250;
+ else if (of_device_is_compatible(np, "samsung,exynos5260-tmu"))
+ return SOC_ARCH_EXYNOS5260;
+ else if (of_device_is_compatible(np, "samsung,exynos5420-tmu"))
+ return SOC_ARCH_EXYNOS5420;
+ else if (of_device_is_compatible(np,
+ "samsung,exynos5420-tmu-ext-triminfo"))
+ return SOC_ARCH_EXYNOS5420_TRIMINFO;
+ else if (of_device_is_compatible(np, "samsung,exynos5440-tmu"))
+ return SOC_ARCH_EXYNOS5440;
+ else if (of_device_is_compatible(np, "samsung,exynos7-tmu"))
+ return SOC_ARCH_EXYNOS7;
+
+ return -EINVAL;
+}
+
+static int exynos_of_sensor_conf(struct device_node *np,
+ struct exynos_tmu_platform_data *pdata)
{
- struct exynos_tmu_init_data *data_table;
- struct exynos_tmu_platform_data *tmu_data;
- const struct of_device_id *match;
+ u32 value;
+ int ret;
- match = of_match_node(exynos_tmu_match, pdev->dev.of_node);
- if (!match)
- return NULL;
- data_table = (struct exynos_tmu_init_data *) match->data;
- if (!data_table || id >= data_table->tmu_count)
- return NULL;
- tmu_data = data_table->tmu_data;
- return (struct exynos_tmu_platform_data *) (tmu_data + id);
+ of_node_get(np);
+
+ ret = of_property_read_u32(np, "samsung,tmu_gain", &value);
+ pdata->gain = (u8)value;
+ of_property_read_u32(np, "samsung,tmu_reference_voltage", &value);
+ pdata->reference_voltage = (u8)value;
+ of_property_read_u32(np, "samsung,tmu_noise_cancel_mode", &value);
+ pdata->noise_cancel_mode = (u8)value;
+
+ of_property_read_u32(np, "samsung,tmu_efuse_value",
+ &pdata->efuse_value);
+ of_property_read_u32(np, "samsung,tmu_min_efuse_value",
+ &pdata->min_efuse_value);
+ of_property_read_u32(np, "samsung,tmu_max_efuse_value",
+ &pdata->max_efuse_value);
+
+ of_property_read_u32(np, "samsung,tmu_first_point_trim", &value);
+ pdata->first_point_trim = (u8)value;
+ of_property_read_u32(np, "samsung,tmu_second_point_trim", &value);
+ pdata->second_point_trim = (u8)value;
+ of_property_read_u32(np, "samsung,tmu_default_temp_offset", &value);
+ pdata->default_temp_offset = (u8)value;
+
+ of_property_read_u32(np, "samsung,tmu_cal_type", &pdata->cal_type);
+ of_property_read_u32(np, "samsung,tmu_cal_mode", &pdata->cal_mode);
+
+ of_node_put(np);
+ return 0;
}
static int exynos_map_dt_data(struct platform_device *pdev)
@@ -771,14 +1057,15 @@ static int exynos_map_dt_data(struct platform_device *pdev)
return -EADDRNOTAVAIL;
}
- pdata = exynos_get_driver_data(pdev, data->id);
- if (!pdata) {
- dev_err(&pdev->dev, "No platform init data supplied.\n");
- return -ENODEV;
- }
+ pdata = devm_kzalloc(&pdev->dev,
+ sizeof(struct exynos_tmu_platform_data),
+ GFP_KERNEL);
+ if (!pdata)
+ return -ENOMEM;
+ exynos_of_sensor_conf(pdev->dev.of_node, pdata);
data->pdata = pdata;
- data->soc = pdata->type;
+ data->soc = exynos_of_get_soc_type(pdev->dev.of_node);
switch (data->soc) {
case SOC_ARCH_EXYNOS4210:
@@ -806,6 +1093,13 @@ static int exynos_map_dt_data(struct platform_device *pdev)
data->tmu_set_emulation = exynos5440_tmu_set_emulation;
data->tmu_clear_irqs = exynos5440_tmu_clear_irqs;
break;
+ case SOC_ARCH_EXYNOS7:
+ data->tmu_initialize = exynos7_tmu_initialize;
+ data->tmu_control = exynos7_tmu_control;
+ data->tmu_read = exynos7_tmu_read;
+ data->tmu_set_emulation = exynos4412_tmu_set_emulation;
+ data->tmu_clear_irqs = exynos4210_tmu_clear_irqs;
+ break;
default:
dev_err(&pdev->dev, "Platform not supported\n");
return -EINVAL;
@@ -834,12 +1128,16 @@ static int exynos_map_dt_data(struct platform_device *pdev)
return 0;
}
+static struct thermal_zone_of_device_ops exynos_sensor_ops = {
+ .get_temp = exynos_get_temp,
+ .set_emul_temp = exynos_tmu_set_emulation,
+};
+
static int exynos_tmu_probe(struct platform_device *pdev)
{
- struct exynos_tmu_data *data;
struct exynos_tmu_platform_data *pdata;
- struct thermal_sensor_conf *sensor_conf;
- int ret, i;
+ struct exynos_tmu_data *data;
+ int ret;
data = devm_kzalloc(&pdev->dev, sizeof(struct exynos_tmu_data),
GFP_KERNEL);
@@ -849,9 +1147,15 @@ static int exynos_tmu_probe(struct platform_device *pdev)
platform_set_drvdata(pdev, data);
mutex_init(&data->lock);
+ data->tzd = thermal_zone_of_sensor_register(&pdev->dev, 0, data,
+ &exynos_sensor_ops);
+ if (IS_ERR(data->tzd)) {
+ pr_err("thermal: tz: %p ERROR\n", data->tzd);
+ return PTR_ERR(data->tzd);
+ }
ret = exynos_map_dt_data(pdev);
if (ret)
- return ret;
+ goto err_sensor;
pdata = data->pdata;
@@ -860,20 +1164,22 @@ static int exynos_tmu_probe(struct platform_device *pdev)
data->clk = devm_clk_get(&pdev->dev, "tmu_apbif");
if (IS_ERR(data->clk)) {
dev_err(&pdev->dev, "Failed to get clock\n");
- return PTR_ERR(data->clk);
+ ret = PTR_ERR(data->clk);
+ goto err_sensor;
}
data->clk_sec = devm_clk_get(&pdev->dev, "tmu_triminfo_apbif");
if (IS_ERR(data->clk_sec)) {
if (data->soc == SOC_ARCH_EXYNOS5420_TRIMINFO) {
dev_err(&pdev->dev, "Failed to get triminfo clock\n");
- return PTR_ERR(data->clk_sec);
+ ret = PTR_ERR(data->clk_sec);
+ goto err_sensor;
}
} else {
ret = clk_prepare(data->clk_sec);
if (ret) {
dev_err(&pdev->dev, "Failed to get clock\n");
- return ret;
+ goto err_sensor;
}
}
@@ -883,82 +1189,57 @@ static int exynos_tmu_probe(struct platform_device *pdev)
goto err_clk_sec;
}
- ret = exynos_tmu_initialize(pdev);
- if (ret) {
- dev_err(&pdev->dev, "Failed to initialize TMU\n");
- goto err_clk;
+ if (data->soc == SOC_ARCH_EXYNOS7) {
+ data->sclk = devm_clk_get(&pdev->dev, "tmu_sclk");
+ if (IS_ERR(data->sclk)) {
+ dev_err(&pdev->dev, "Failed to get sclk\n");
+ goto err_clk;
+ } else {
+ ret = clk_prepare_enable(data->sclk);
+ if (ret) {
+ dev_err(&pdev->dev, "Failed to enable sclk\n");
+ goto err_clk;
+ }
+ }
}
- exynos_tmu_control(pdev, true);
-
- /* Allocate a structure to register with the exynos core thermal */
- sensor_conf = devm_kzalloc(&pdev->dev,
- sizeof(struct thermal_sensor_conf), GFP_KERNEL);
- if (!sensor_conf) {
- ret = -ENOMEM;
- goto err_clk;
- }
- sprintf(sensor_conf->name, "therm_zone%d", data->id);
- sensor_conf->read_temperature = (int (*)(void *))exynos_tmu_read;
- sensor_conf->write_emul_temp =
- (int (*)(void *, unsigned long))exynos_tmu_set_emulation;
- sensor_conf->driver_data = data;
- sensor_conf->trip_data.trip_count = pdata->trigger_enable[0] +
- pdata->trigger_enable[1] + pdata->trigger_enable[2]+
- pdata->trigger_enable[3];
-
- for (i = 0; i < sensor_conf->trip_data.trip_count; i++) {
- sensor_conf->trip_data.trip_val[i] =
- pdata->threshold + pdata->trigger_levels[i];
- sensor_conf->trip_data.trip_type[i] =
- pdata->trigger_type[i];
- }
-
- sensor_conf->trip_data.trigger_falling = pdata->threshold_falling;
-
- sensor_conf->cooling_data.freq_clip_count = pdata->freq_tab_count;
- for (i = 0; i < pdata->freq_tab_count; i++) {
- sensor_conf->cooling_data.freq_data[i].freq_clip_max =
- pdata->freq_tab[i].freq_clip_max;
- sensor_conf->cooling_data.freq_data[i].temp_level =
- pdata->freq_tab[i].temp_level;
- }
- sensor_conf->dev = &pdev->dev;
- /* Register the sensor with thermal management interface */
- ret = exynos_register_thermal(sensor_conf);
+ ret = exynos_tmu_initialize(pdev);
if (ret) {
- if (ret != -EPROBE_DEFER)
- dev_err(&pdev->dev,
- "Failed to register thermal interface: %d\n",
- ret);
- goto err_clk;
+ dev_err(&pdev->dev, "Failed to initialize TMU\n");
+ goto err_sclk;
}
- data->reg_conf = sensor_conf;
ret = devm_request_irq(&pdev->dev, data->irq, exynos_tmu_irq,
IRQF_TRIGGER_RISING | IRQF_SHARED, dev_name(&pdev->dev), data);
if (ret) {
dev_err(&pdev->dev, "Failed to request irq: %d\n", data->irq);
- goto err_clk;
+ goto err_sclk;
}
+ exynos_tmu_control(pdev, true);
return 0;
+err_sclk:
+ clk_disable_unprepare(data->sclk);
err_clk:
clk_unprepare(data->clk);
err_clk_sec:
if (!IS_ERR(data->clk_sec))
clk_unprepare(data->clk_sec);
+err_sensor:
+ thermal_zone_of_sensor_unregister(&pdev->dev, data->tzd);
+
return ret;
}
static int exynos_tmu_remove(struct platform_device *pdev)
{
struct exynos_tmu_data *data = platform_get_drvdata(pdev);
+ struct thermal_zone_device *tzd = data->tzd;
- exynos_unregister_thermal(data->reg_conf);
-
+ thermal_zone_of_sensor_unregister(&pdev->dev, tzd);
exynos_tmu_control(pdev, false);
+ clk_disable_unprepare(data->sclk);
clk_unprepare(data->clk);
if (!IS_ERR(data->clk_sec))
clk_unprepare(data->clk_sec);
diff --git a/drivers/thermal/samsung/exynos_tmu.h b/drivers/thermal/samsung/exynos_tmu.h
index da3009bff6c4..4d71ec6c9aa0 100644
--- a/drivers/thermal/samsung/exynos_tmu.h
+++ b/drivers/thermal/samsung/exynos_tmu.h
@@ -23,16 +23,7 @@
#ifndef _EXYNOS_TMU_H
#define _EXYNOS_TMU_H
#include <linux/cpu_cooling.h>
-
-#include "exynos_thermal_common.h"
-
-enum calibration_type {
- TYPE_ONE_POINT_TRIMMING,
- TYPE_ONE_POINT_TRIMMING_25,
- TYPE_ONE_POINT_TRIMMING_85,
- TYPE_TWO_POINT_TRIMMING,
- TYPE_NONE,
-};
+#include <dt-bindings/thermal/thermal_exynos.h>
enum soc_type {
SOC_ARCH_EXYNOS3250 = 1,
@@ -43,38 +34,11 @@ enum soc_type {
SOC_ARCH_EXYNOS5420,
SOC_ARCH_EXYNOS5420_TRIMINFO,
SOC_ARCH_EXYNOS5440,
+ SOC_ARCH_EXYNOS7,
};
/**
* struct exynos_tmu_platform_data
- * @threshold: basic temperature for generating interrupt
- * 25 <= threshold <= 125 [unit: degree Celsius]
- * @threshold_falling: differntial value for setting threshold
- * of temperature falling interrupt.
- * @trigger_levels: array for each interrupt levels
- * [unit: degree Celsius]
- * 0: temperature for trigger_level0 interrupt
- * condition for trigger_level0 interrupt:
- * current temperature > threshold + trigger_levels[0]
- * 1: temperature for trigger_level1 interrupt
- * condition for trigger_level1 interrupt:
- * current temperature > threshold + trigger_levels[1]
- * 2: temperature for trigger_level2 interrupt
- * condition for trigger_level2 interrupt:
- * current temperature > threshold + trigger_levels[2]
- * 3: temperature for trigger_level3 interrupt
- * condition for trigger_level3 interrupt:
- * current temperature > threshold + trigger_levels[3]
- * @trigger_type: defines the type of trigger. Possible values are,
- * THROTTLE_ACTIVE trigger type
- * THROTTLE_PASSIVE trigger type
- * SW_TRIP trigger type
- * HW_TRIP
- * @trigger_enable[]: array to denote which trigger levels are enabled.
- * 1 = enable trigger_level[] interrupt,
- * 0 = disable trigger_level[] interrupt
- * @max_trigger_level: max trigger level supported by the TMU
- * @non_hw_trigger_levels: number of defined non-hardware trigger levels
* @gain: gain of amplifier in the positive-TC generator block
* 0 < gain <= 15
* @reference_voltage: reference voltage of amplifier
@@ -86,24 +50,12 @@ enum soc_type {
* @efuse_value: platform defined fuse value
* @min_efuse_value: minimum valid trimming data
* @max_efuse_value: maximum valid trimming data
- * @first_point_trim: temp value of the first point trimming
- * @second_point_trim: temp value of the second point trimming
* @default_temp_offset: default temperature offset in case of no trimming
* @cal_type: calibration type for temperature
- * @freq_clip_table: Table representing frequency reduction percentage.
- * @freq_tab_count: Count of the above table as frequency reduction may
- * applicable to only some of the trigger levels.
*
* This structure is required for configuration of exynos_tmu driver.
*/
struct exynos_tmu_platform_data {
- u8 threshold;
- u8 threshold_falling;
- u8 trigger_levels[MAX_TRIP_COUNT];
- enum trigger_type trigger_type[MAX_TRIP_COUNT];
- bool trigger_enable[MAX_TRIP_COUNT];
- u8 max_trigger_level;
- u8 non_hw_trigger_levels;
u8 gain;
u8 reference_voltage;
u8 noise_cancel_mode;
@@ -115,30 +67,9 @@ struct exynos_tmu_platform_data {
u8 second_point_trim;
u8 default_temp_offset;
- enum calibration_type cal_type;
enum soc_type type;
- struct freq_clip_table freq_tab[4];
- unsigned int freq_tab_count;
-};
-
-/**
- * struct exynos_tmu_init_data
- * @tmu_count: number of TMU instances.
- * @tmu_data: platform data of all TMU instances.
- * This structure is required to store data for multi-instance exynos tmu
- * driver.
- */
-struct exynos_tmu_init_data {
- int tmu_count;
- struct exynos_tmu_platform_data tmu_data[];
+ u32 cal_type;
+ u32 cal_mode;
};
-extern struct exynos_tmu_init_data const exynos3250_default_tmu_data;
-extern struct exynos_tmu_init_data const exynos4210_default_tmu_data;
-extern struct exynos_tmu_init_data const exynos4412_default_tmu_data;
-extern struct exynos_tmu_init_data const exynos5250_default_tmu_data;
-extern struct exynos_tmu_init_data const exynos5260_default_tmu_data;
-extern struct exynos_tmu_init_data const exynos5420_default_tmu_data;
-extern struct exynos_tmu_init_data const exynos5440_default_tmu_data;
-
#endif /* _EXYNOS_TMU_H */
diff --git a/drivers/thermal/samsung/exynos_tmu_data.c b/drivers/thermal/samsung/exynos_tmu_data.c
deleted file mode 100644
index b23910069f68..000000000000
--- a/drivers/thermal/samsung/exynos_tmu_data.c
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * exynos_tmu_data.c - Samsung EXYNOS tmu data file
- *
- * Copyright (C) 2013 Samsung Electronics
- * Amit Daniel Kachhap <amit.daniel@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- */
-
-#include "exynos_thermal_common.h"
-#include "exynos_tmu.h"
-
-struct exynos_tmu_init_data const exynos4210_default_tmu_data = {
- .tmu_data = {
- {
- .threshold = 80,
- .trigger_levels[0] = 5,
- .trigger_levels[1] = 20,
- .trigger_levels[2] = 30,
- .trigger_enable[0] = true,
- .trigger_enable[1] = true,
- .trigger_enable[2] = true,
- .trigger_enable[3] = false,
- .trigger_type[0] = THROTTLE_ACTIVE,
- .trigger_type[1] = THROTTLE_ACTIVE,
- .trigger_type[2] = SW_TRIP,
- .max_trigger_level = 4,
- .non_hw_trigger_levels = 3,
- .gain = 15,
- .reference_voltage = 7,
- .cal_type = TYPE_ONE_POINT_TRIMMING,
- .min_efuse_value = 40,
- .max_efuse_value = 100,
- .first_point_trim = 25,
- .second_point_trim = 85,
- .default_temp_offset = 50,
- .freq_tab[0] = {
- .freq_clip_max = 800 * 1000,
- .temp_level = 85,
- },
- .freq_tab[1] = {
- .freq_clip_max = 200 * 1000,
- .temp_level = 100,
- },
- .freq_tab_count = 2,
- .type = SOC_ARCH_EXYNOS4210,
- },
- },
- .tmu_count = 1,
-};
-
-#define EXYNOS3250_TMU_DATA \
- .threshold_falling = 10, \
- .trigger_levels[0] = 70, \
- .trigger_levels[1] = 95, \
- .trigger_levels[2] = 110, \
- .trigger_levels[3] = 120, \
- .trigger_enable[0] = true, \
- .trigger_enable[1] = true, \
- .trigger_enable[2] = true, \
- .trigger_enable[3] = false, \
- .trigger_type[0] = THROTTLE_ACTIVE, \
- .trigger_type[1] = THROTTLE_ACTIVE, \
- .trigger_type[2] = SW_TRIP, \
- .trigger_type[3] = HW_TRIP, \
- .max_trigger_level = 4, \
- .non_hw_trigger_levels = 3, \
- .gain = 8, \
- .reference_voltage = 16, \
- .noise_cancel_mode = 4, \
- .cal_type = TYPE_TWO_POINT_TRIMMING, \
- .efuse_value = 55, \
- .min_efuse_value = 40, \
- .max_efuse_value = 100, \
- .first_point_trim = 25, \
- .second_point_trim = 85, \
- .default_temp_offset = 50, \
- .freq_tab[0] = { \
- .freq_clip_max = 800 * 1000, \
- .temp_level = 70, \
- }, \
- .freq_tab[1] = { \
- .freq_clip_max = 400 * 1000, \
- .temp_level = 95, \
- }, \
- .freq_tab_count = 2
-
-struct exynos_tmu_init_data const exynos3250_default_tmu_data = {
- .tmu_data = {
- {
- EXYNOS3250_TMU_DATA,
- .type = SOC_ARCH_EXYNOS3250,
- },
- },
- .tmu_count = 1,
-};
-
-#define EXYNOS4412_TMU_DATA \
- .threshold_falling = 10, \
- .trigger_levels[0] = 70, \
- .trigger_levels[1] = 95, \
- .trigger_levels[2] = 110, \
- .trigger_levels[3] = 120, \
- .trigger_enable[0] = true, \
- .trigger_enable[1] = true, \
- .trigger_enable[2] = true, \
- .trigger_enable[3] = false, \
- .trigger_type[0] = THROTTLE_ACTIVE, \
- .trigger_type[1] = THROTTLE_ACTIVE, \
- .trigger_type[2] = SW_TRIP, \
- .trigger_type[3] = HW_TRIP, \
- .max_trigger_level = 4, \
- .non_hw_trigger_levels = 3, \
- .gain = 8, \
- .reference_voltage = 16, \
- .noise_cancel_mode = 4, \
- .cal_type = TYPE_ONE_POINT_TRIMMING, \
- .efuse_value = 55, \
- .min_efuse_value = 40, \
- .max_efuse_value = 100, \
- .first_point_trim = 25, \
- .second_point_trim = 85, \
- .default_temp_offset = 50, \
- .freq_tab[0] = { \
- .freq_clip_max = 1400 * 1000, \
- .temp_level = 70, \
- }, \
- .freq_tab[1] = { \
- .freq_clip_max = 400 * 1000, \
- .temp_level = 95, \
- }, \
- .freq_tab_count = 2
-
-struct exynos_tmu_init_data const exynos4412_default_tmu_data = {
- .tmu_data = {
- {
- EXYNOS4412_TMU_DATA,
- .type = SOC_ARCH_EXYNOS4412,
- },
- },
- .tmu_count = 1,
-};
-
-struct exynos_tmu_init_data const exynos5250_default_tmu_data = {
- .tmu_data = {
- {
- EXYNOS4412_TMU_DATA,
- .type = SOC_ARCH_EXYNOS5250,
- },
- },
- .tmu_count = 1,
-};
-
-#define __EXYNOS5260_TMU_DATA \
- .threshold_falling = 10, \
- .trigger_levels[0] = 85, \
- .trigger_levels[1] = 103, \
- .trigger_levels[2] = 110, \
- .trigger_levels[3] = 120, \
- .trigger_enable[0] = true, \
- .trigger_enable[1] = true, \
- .trigger_enable[2] = true, \
- .trigger_enable[3] = false, \
- .trigger_type[0] = THROTTLE_ACTIVE, \
- .trigger_type[1] = THROTTLE_ACTIVE, \
- .trigger_type[2] = SW_TRIP, \
- .trigger_type[3] = HW_TRIP, \
- .max_trigger_level = 4, \
- .non_hw_trigger_levels = 3, \
- .gain = 8, \
- .reference_voltage = 16, \
- .noise_cancel_mode = 4, \
- .cal_type = TYPE_ONE_POINT_TRIMMING, \
- .efuse_value = 55, \
- .min_efuse_value = 40, \
- .max_efuse_value = 100, \
- .first_point_trim = 25, \
- .second_point_trim = 85, \
- .default_temp_offset = 50, \
- .freq_tab[0] = { \
- .freq_clip_max = 800 * 1000, \
- .temp_level = 85, \
- }, \
- .freq_tab[1] = { \
- .freq_clip_max = 200 * 1000, \
- .temp_level = 103, \
- }, \
- .freq_tab_count = 2, \
-
-#define EXYNOS5260_TMU_DATA \
- __EXYNOS5260_TMU_DATA \
- .type = SOC_ARCH_EXYNOS5260
-
-struct exynos_tmu_init_data const exynos5260_default_tmu_data = {
- .tmu_data = {
- { EXYNOS5260_TMU_DATA },
- { EXYNOS5260_TMU_DATA },
- { EXYNOS5260_TMU_DATA },
- { EXYNOS5260_TMU_DATA },
- { EXYNOS5260_TMU_DATA },
- },
- .tmu_count = 5,
-};
-
-#define EXYNOS5420_TMU_DATA \
- __EXYNOS5260_TMU_DATA \
- .type = SOC_ARCH_EXYNOS5420
-
-#define EXYNOS5420_TMU_DATA_SHARED \
- __EXYNOS5260_TMU_DATA \
- .type = SOC_ARCH_EXYNOS5420_TRIMINFO
-
-struct exynos_tmu_init_data const exynos5420_default_tmu_data = {
- .tmu_data = {
- { EXYNOS5420_TMU_DATA },
- { EXYNOS5420_TMU_DATA },
- { EXYNOS5420_TMU_DATA_SHARED },
- { EXYNOS5420_TMU_DATA_SHARED },
- { EXYNOS5420_TMU_DATA_SHARED },
- },
- .tmu_count = 5,
-};
-
-#define EXYNOS5440_TMU_DATA \
- .trigger_levels[0] = 100, \
- .trigger_levels[4] = 105, \
- .trigger_enable[0] = 1, \
- .trigger_type[0] = SW_TRIP, \
- .trigger_type[4] = HW_TRIP, \
- .max_trigger_level = 5, \
- .non_hw_trigger_levels = 1, \
- .gain = 5, \
- .reference_voltage = 16, \
- .noise_cancel_mode = 4, \
- .cal_type = TYPE_ONE_POINT_TRIMMING, \
- .efuse_value = 0x5b2d, \
- .min_efuse_value = 16, \
- .max_efuse_value = 76, \
- .first_point_trim = 25, \
- .second_point_trim = 70, \
- .default_temp_offset = 25, \
- .type = SOC_ARCH_EXYNOS5440
-
-struct exynos_tmu_init_data const exynos5440_default_tmu_data = {
- .tmu_data = {
- { EXYNOS5440_TMU_DATA } ,
- { EXYNOS5440_TMU_DATA } ,
- { EXYNOS5440_TMU_DATA } ,
- },
- .tmu_count = 3,
-};
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 8729cf68d2fe..f55721ff9385 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1246,25 +1246,6 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans,
return ret;
}
-/*
- * this makes the path point to (inum INODE_ITEM ioff)
- */
-int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
- struct btrfs_path *path)
-{
- struct btrfs_key key;
- return btrfs_find_item(fs_root, path, inum, ioff,
- BTRFS_INODE_ITEM_KEY, &key);
-}
-
-static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
- struct btrfs_path *path,
- struct btrfs_key *found_key)
-{
- return btrfs_find_item(fs_root, path, inum, ioff,
- BTRFS_INODE_REF_KEY, found_key);
-}
-
int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
u64 start_off, struct btrfs_path *path,
struct btrfs_inode_extref **ret_extref,
@@ -1374,7 +1355,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
}
- ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
+ ret = btrfs_find_item(fs_root, path, parent, 0,
+ BTRFS_INODE_REF_KEY, &found_key);
if (ret > 0)
ret = -ENOENT;
if (ret)
@@ -1727,8 +1709,10 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
struct btrfs_key found_key;
while (!ret) {
- ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
- &found_key);
+ ret = btrfs_find_item(fs_root, path, inum,
+ parent ? parent + 1 : 0, BTRFS_INODE_REF_KEY,
+ &found_key);
+
if (ret < 0)
break;
if (ret) {
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 2a1ac6bfc724..9c41fbac3009 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -32,9 +32,6 @@ struct inode_fs_paths {
typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
void *ctx);
-int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
- struct btrfs_path *path);
-
int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
struct btrfs_path *path, struct btrfs_key *found_key,
u64 *flags);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 4aadadcfab20..de5e4f2adfea 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -185,6 +185,9 @@ struct btrfs_inode {
struct btrfs_delayed_node *delayed_node;
+ /* File creation time. */
+ struct timespec i_otime;
+
struct inode vfs_inode;
};
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 14a72ed14ef7..993642199326 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -213,11 +213,19 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
*/
static void add_root_to_dirty_list(struct btrfs_root *root)
{
+ if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
+ !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
+ return;
+
spin_lock(&root->fs_info->trans_lock);
- if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) &&
- list_empty(&root->dirty_list)) {
- list_add(&root->dirty_list,
- &root->fs_info->dirty_cowonly_roots);
+ if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
+ /* Want the extent tree to be the last on the list */
+ if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ list_move_tail(&root->dirty_list,
+ &root->fs_info->dirty_cowonly_roots);
+ else
+ list_move(&root->dirty_list,
+ &root->fs_info->dirty_cowonly_roots);
}
spin_unlock(&root->fs_info->trans_lock);
}
@@ -1363,8 +1371,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
BUG_ON(tm->slot != 0);
- eb_rewin = alloc_dummy_extent_buffer(eb->start,
- fs_info->tree_root->nodesize);
+ eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
if (!eb_rewin) {
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
@@ -1444,7 +1451,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
} else if (old_root) {
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
- eb = alloc_dummy_extent_buffer(logical, root->nodesize);
+ eb = alloc_dummy_extent_buffer(root->fs_info, logical);
} else {
btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
eb = btrfs_clone_extent_buffer(eb_root);
@@ -2282,7 +2289,7 @@ static void reada_for_search(struct btrfs_root *root,
if ((search <= target && target - search <= 65536) ||
(search > target && search - target <= 65536)) {
gen = btrfs_node_ptr_generation(node, nr);
- readahead_tree_block(root, search, blocksize);
+ readahead_tree_block(root, search);
nread += blocksize;
}
nscan++;
@@ -2301,7 +2308,6 @@ static noinline void reada_for_balance(struct btrfs_root *root,
u64 gen;
u64 block1 = 0;
u64 block2 = 0;
- int blocksize;
parent = path->nodes[level + 1];
if (!parent)
@@ -2309,7 +2315,6 @@ static noinline void reada_for_balance(struct btrfs_root *root,
nritems = btrfs_header_nritems(parent);
slot = path->slots[level + 1];
- blocksize = root->nodesize;
if (slot > 0) {
block1 = btrfs_node_blockptr(parent, slot - 1);
@@ -2334,9 +2339,9 @@ static noinline void reada_for_balance(struct btrfs_root *root,
}
if (block1)
- readahead_tree_block(root, block1, blocksize);
+ readahead_tree_block(root, block1);
if (block2)
- readahead_tree_block(root, block2, blocksize);
+ readahead_tree_block(root, block2);
}
@@ -2609,32 +2614,24 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key,
return 0;
}
-int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path,
+int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
u64 iobjectid, u64 ioff, u8 key_type,
struct btrfs_key *found_key)
{
int ret;
struct btrfs_key key;
struct extent_buffer *eb;
- struct btrfs_path *path;
+
+ ASSERT(path);
+ ASSERT(found_key);
key.type = key_type;
key.objectid = iobjectid;
key.offset = ioff;
- if (found_path == NULL) {
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- } else
- path = found_path;
-
ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
- if ((ret < 0) || (found_key == NULL)) {
- if (path != found_path)
- btrfs_free_path(path);
+ if (ret < 0)
return ret;
- }
eb = path->nodes[0];
if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
@@ -3383,7 +3380,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
add_root_to_dirty_list(root);
extent_buffer_get(c);
path->nodes[level] = c;
- path->locks[level] = BTRFS_WRITE_LOCK;
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
path->slots[level] = 0;
return 0;
}
@@ -4356,13 +4353,15 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
path->search_for_split = 1;
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
path->search_for_split = 0;
+ if (ret > 0)
+ ret = -EAGAIN;
if (ret < 0)
goto err;
ret = -EAGAIN;
leaf = path->nodes[0];
- /* if our item isn't there or got smaller, return now */
- if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
+ /* if our item's size changed, return now */
+ if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
goto err;
/* the leaf has changed, it now has room. return now */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0b180708bf79..84c3b00f3de8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -198,6 +198,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024)
+#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024)
+
/*
* The key defines the order in the tree, and so it also defines (optimal)
* block layout.
@@ -1020,6 +1022,9 @@ enum btrfs_raid_types {
BTRFS_BLOCK_GROUP_RAID6 | \
BTRFS_BLOCK_GROUP_DUP | \
BTRFS_BLOCK_GROUP_RAID10)
+#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \
+ BTRFS_BLOCK_GROUP_RAID6)
+
/*
* We need a bit for restriper to be able to tell when chunks of type
* SINGLE are available. This "extended" profile format is used in
@@ -1239,7 +1244,6 @@ enum btrfs_disk_cache_state {
BTRFS_DC_ERROR = 1,
BTRFS_DC_CLEAR = 2,
BTRFS_DC_SETUP = 3,
- BTRFS_DC_NEED_WRITE = 4,
};
struct btrfs_caching_control {
@@ -1277,7 +1281,6 @@ struct btrfs_block_group_cache {
unsigned long full_stripe_len;
unsigned int ro:1;
- unsigned int dirty:1;
unsigned int iref:1;
unsigned int has_caching_ctl:1;
unsigned int removed:1;
@@ -1315,6 +1318,9 @@ struct btrfs_block_group_cache {
struct list_head ro_list;
atomic_t trimming;
+
+ /* For dirty block groups */
+ struct list_head dirty_list;
};
/* delayed seq elem */
@@ -1741,6 +1747,7 @@ struct btrfs_fs_info {
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
+ struct mutex unused_bg_unpin_mutex;
/* For btrfs to record security options */
struct security_mnt_opts security_opts;
@@ -1776,6 +1783,7 @@ struct btrfs_subvolume_writers {
#define BTRFS_ROOT_DEFRAG_RUNNING 6
#define BTRFS_ROOT_FORCE_COW 7
#define BTRFS_ROOT_MULTI_LOG_TASKS 8
+#define BTRFS_ROOT_DIRTY 9
/*
* in ram representation of the tree. extent_root is used for all allocations
@@ -1794,8 +1802,6 @@ struct btrfs_root {
struct btrfs_fs_info *fs_info;
struct extent_io_tree dirty_log_pages;
- struct kobject root_kobj;
- struct completion kobj_unregister;
struct mutex objectid_mutex;
spinlock_t accounting_lock;
@@ -2465,31 +2471,6 @@ BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
-
-static inline struct btrfs_timespec *
-btrfs_inode_atime(struct btrfs_inode_item *inode_item)
-{
- unsigned long ptr = (unsigned long)inode_item;
- ptr += offsetof(struct btrfs_inode_item, atime);
- return (struct btrfs_timespec *)ptr;
-}
-
-static inline struct btrfs_timespec *
-btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
-{
- unsigned long ptr = (unsigned long)inode_item;
- ptr += offsetof(struct btrfs_inode_item, mtime);
- return (struct btrfs_timespec *)ptr;
-}
-
-static inline struct btrfs_timespec *
-btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
-{
- unsigned long ptr = (unsigned long)inode_item;
- ptr += offsetof(struct btrfs_inode_item, ctime);
- return (struct btrfs_timespec *)ptr;
-}
-
BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index de4e70fb3cbb..82f0c7c95474 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1755,27 +1755,31 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
btrfs_set_stack_inode_block_group(inode_item, 0);
- btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item),
+ btrfs_set_stack_timespec_sec(&inode_item->atime,
inode->i_atime.tv_sec);
- btrfs_set_stack_timespec_nsec(btrfs_inode_atime(inode_item),
+ btrfs_set_stack_timespec_nsec(&inode_item->atime,
inode->i_atime.tv_nsec);
- btrfs_set_stack_timespec_sec(btrfs_inode_mtime(inode_item),
+ btrfs_set_stack_timespec_sec(&inode_item->mtime,
inode->i_mtime.tv_sec);
- btrfs_set_stack_timespec_nsec(btrfs_inode_mtime(inode_item),
+ btrfs_set_stack_timespec_nsec(&inode_item->mtime,
inode->i_mtime.tv_nsec);
- btrfs_set_stack_timespec_sec(btrfs_inode_ctime(inode_item),
+ btrfs_set_stack_timespec_sec(&inode_item->ctime,
inode->i_ctime.tv_sec);
- btrfs_set_stack_timespec_nsec(btrfs_inode_ctime(inode_item),
+ btrfs_set_stack_timespec_nsec(&inode_item->ctime,
inode->i_ctime.tv_nsec);
+
+ btrfs_set_stack_timespec_sec(&inode_item->otime,
+ BTRFS_I(inode)->i_otime.tv_sec);
+ btrfs_set_stack_timespec_nsec(&inode_item->otime,
+ BTRFS_I(inode)->i_otime.tv_nsec);
}
int btrfs_fill_inode(struct inode *inode, u32 *rdev)
{
struct btrfs_delayed_node *delayed_node;
struct btrfs_inode_item *inode_item;
- struct btrfs_timespec *tspec;
delayed_node = btrfs_get_delayed_node(inode);
if (!delayed_node)
@@ -1802,17 +1806,19 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
*rdev = btrfs_stack_inode_rdev(inode_item);
BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
- tspec = btrfs_inode_atime(inode_item);
- inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec);
- inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
+ inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
+ inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
+
+ inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
+ inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);
- tspec = btrfs_inode_mtime(inode_item);
- inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec);
- inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
+ inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime);
+ inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime);
- tspec = btrfs_inode_ctime(inode_item);
- inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec);
- inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
+ BTRFS_I(inode)->i_otime.tv_sec =
+ btrfs_stack_timespec_sec(&inode_item->otime);
+ BTRFS_I(inode)->i_otime.tv_nsec =
+ btrfs_stack_timespec_nsec(&inode_item->otime);
inode->i_generation = BTRFS_I(inode)->generation;
BTRFS_I(inode)->index_cnt = (u64)-1;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index ca6a3a3b6b6c..5ec03d999c37 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -440,18 +440,9 @@ leave:
*/
static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
{
- s64 writers;
- DEFINE_WAIT(wait);
-
set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
- do {
- prepare_to_wait(&fs_info->replace_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- writers = percpu_counter_sum(&fs_info->bio_counter);
- if (writers)
- schedule();
- finish_wait(&fs_info->replace_wait, &wait);
- } while (writers);
+ wait_event(fs_info->replace_wait, !percpu_counter_sum(
+ &fs_info->bio_counter));
}
/*
@@ -932,15 +923,15 @@ void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
{
- DEFINE_WAIT(wait);
-again:
- percpu_counter_inc(&fs_info->bio_counter);
- if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
+ while (1) {
+ percpu_counter_inc(&fs_info->bio_counter);
+ if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
+ &fs_info->fs_state)))
+ break;
+
btrfs_bio_counter_dec(fs_info);
wait_event(fs_info->replace_wait,
!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
&fs_info->fs_state));
- goto again;
}
-
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1afb18226da8..f79f38542a73 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -318,7 +318,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
memcpy(&found, result, csum_size);
read_extent_buffer(buf, &val, 0, csum_size);
- printk_ratelimited(KERN_INFO
+ printk_ratelimited(KERN_WARNING
"BTRFS: %s checksum verify failed on %llu wanted %X found %X "
"level %d\n",
root->fs_info->sb->s_id, buf->start,
@@ -367,7 +367,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
ret = 0;
goto out;
}
- printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
+ printk_ratelimited(KERN_ERR
+ "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
eb->fs_info->sb->s_id, eb->start,
parent_transid, btrfs_header_generation(eb));
ret = 1;
@@ -633,21 +634,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
- printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start "
+ printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start "
"%llu %llu\n",
eb->fs_info->sb->s_id, found_start, eb->start);
ret = -EIO;
goto err;
}
if (check_tree_block_fsid(root, eb)) {
- printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n",
+ printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
eb->fs_info->sb->s_id, eb->start);
ret = -EIO;
goto err;
}
found_level = btrfs_header_level(eb);
if (found_level >= BTRFS_MAX_LEVEL) {
- btrfs_info(root->fs_info, "bad tree block level %d",
+ btrfs_err(root->fs_info, "bad tree block level %d",
(int)btrfs_header_level(eb));
ret = -EIO;
goto err;
@@ -1073,12 +1074,12 @@ static const struct address_space_operations btree_aops = {
.set_page_dirty = btree_set_page_dirty,
};
-void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
{
struct extent_buffer *buf = NULL;
struct inode *btree_inode = root->fs_info->btree_inode;
- buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ buf = btrfs_find_create_tree_block(root, bytenr);
if (!buf)
return;
read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
@@ -1086,7 +1087,7 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
free_extent_buffer(buf);
}
-int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
int mirror_num, struct extent_buffer **eb)
{
struct extent_buffer *buf = NULL;
@@ -1094,7 +1095,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
int ret;
- buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ buf = btrfs_find_create_tree_block(root, bytenr);
if (!buf)
return 0;
@@ -1125,12 +1126,11 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
}
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
- u64 bytenr, u32 blocksize)
+ u64 bytenr)
{
if (btrfs_test_is_dummy_root(root))
- return alloc_test_extent_buffer(root->fs_info, bytenr,
- blocksize);
- return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
+ return alloc_test_extent_buffer(root->fs_info, bytenr);
+ return alloc_extent_buffer(root->fs_info, bytenr);
}
@@ -1152,7 +1152,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
struct extent_buffer *buf = NULL;
int ret;
- buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize);
+ buf = btrfs_find_create_tree_block(root, bytenr);
if (!buf)
return NULL;
@@ -1275,12 +1275,10 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
- memset(&root->root_kobj, 0, sizeof(root->root_kobj));
if (fs_info)
root->defrag_trans_start = fs_info->generation;
else
root->defrag_trans_start = 0;
- init_completion(&root->kobj_unregister);
root->root_key.objectid = objectid;
root->anon_dev = 0;
@@ -1630,6 +1628,8 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
bool check_ref)
{
struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
int ret;
if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
@@ -1669,8 +1669,17 @@ again:
if (ret)
goto fail;
- ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID,
- location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL);
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = location->objectid;
+
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+ btrfs_free_path(path);
if (ret < 0)
goto fail;
if (ret == 0)
@@ -2232,6 +2241,7 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->qgroup_op_lock);
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
+ mutex_init(&fs_info->unused_bg_unpin_mutex);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
@@ -2496,7 +2506,7 @@ int open_ctree(struct super_block *sb,
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
- printk(KERN_ERR "BTRFS: has skinny extents\n");
+ printk(KERN_INFO "BTRFS: has skinny extents\n");
/*
* flag our filesystem as having big metadata blocks if
@@ -2520,7 +2530,7 @@ int open_ctree(struct super_block *sb,
*/
if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
(sectorsize != nodesize)) {
- printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes "
+ printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes "
"are not allowed for mixed block groups on %s\n",
sb->s_id);
goto fail_alloc;
@@ -2628,12 +2638,12 @@ int open_ctree(struct super_block *sb,
sb->s_blocksize_bits = blksize_bits(sectorsize);
if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
- printk(KERN_INFO "BTRFS: valid FS not found on %s\n", sb->s_id);
+ printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id);
goto fail_sb_buffer;
}
if (sectorsize != PAGE_SIZE) {
- printk(KERN_WARNING "BTRFS: Incompatible sector size(%lu) "
+ printk(KERN_ERR "BTRFS: incompatible sector size (%lu) "
"found on %s\n", (unsigned long)sectorsize, sb->s_id);
goto fail_sb_buffer;
}
@@ -2642,7 +2652,7 @@ int open_ctree(struct super_block *sb,
ret = btrfs_read_sys_array(tree_root);
mutex_unlock(&fs_info->chunk_mutex);
if (ret) {
- printk(KERN_WARNING "BTRFS: failed to read the system "
+ printk(KERN_ERR "BTRFS: failed to read the system "
"array on %s\n", sb->s_id);
goto fail_sb_buffer;
}
@@ -2657,7 +2667,7 @@ int open_ctree(struct super_block *sb,
generation);
if (!chunk_root->node ||
!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
- printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n",
+ printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
sb->s_id);
goto fail_tree_roots;
}
@@ -2669,7 +2679,7 @@ int open_ctree(struct super_block *sb,
ret = btrfs_read_chunk_tree(chunk_root);
if (ret) {
- printk(KERN_WARNING "BTRFS: failed to read chunk tree on %s\n",
+ printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n",
sb->s_id);
goto fail_tree_roots;
}
@@ -2681,7 +2691,7 @@ int open_ctree(struct super_block *sb,
btrfs_close_extra_devices(fs_info, fs_devices, 0);
if (!fs_devices->latest_bdev) {
- printk(KERN_CRIT "BTRFS: failed to read devices on %s\n",
+ printk(KERN_ERR "BTRFS: failed to read devices on %s\n",
sb->s_id);
goto fail_tree_roots;
}
@@ -2765,7 +2775,7 @@ retry_root_backup:
ret = btrfs_recover_balance(fs_info);
if (ret) {
- printk(KERN_WARNING "BTRFS: failed to recover balance\n");
+ printk(KERN_ERR "BTRFS: failed to recover balance\n");
goto fail_block_groups;
}
@@ -3860,6 +3870,21 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
btrfs_super_log_root(sb));
+ /*
+ * Check only the lower bound here; the alignment and other constraints
+ * are checked later.
+ */
+ if (btrfs_super_nodesize(sb) < 4096) {
+ printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n",
+ btrfs_super_nodesize(sb));
+ ret = -EINVAL;
+ }
+ if (btrfs_super_sectorsize(sb) < 4096) {
+ printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n",
+ btrfs_super_sectorsize(sb));
+ ret = -EINVAL;
+ }
+
if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
fs_info->fsid, sb->dev_item.fsid);
@@ -3873,6 +3898,10 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
if (btrfs_super_num_devices(sb) > (1UL << 31))
printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
btrfs_super_num_devices(sb));
+ if (btrfs_super_num_devices(sb) == 0) {
+ printk(KERN_ERR "BTRFS: number of devices is 0\n");
+ ret = -EINVAL;
+ }
if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
@@ -3881,6 +3910,25 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
}
/*
+ * Check for obvious sys_chunk_array corruption: it must hold at least
+ * one key and one chunk.
+ */
+ if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+ printk(KERN_ERR "BTRFS: system chunk array too big %u > %u\n",
+ btrfs_super_sys_array_size(sb),
+ BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
+ ret = -EINVAL;
+ }
+ /* sizeof() yields size_t: print it with %zu so the format is also correct on 32-bit */
+ if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
+ + sizeof(struct btrfs_chunk)) {
+ printk(KERN_ERR "BTRFS: system chunk array too small %u < %zu\n",
+ btrfs_super_sys_array_size(sb),
+ sizeof(struct btrfs_disk_key)
+ + sizeof(struct btrfs_chunk));
+ ret = -EINVAL;
+ }
+
+ /*
* The generation is a global counter, we'll trust it more than the others
* but it's still possible that it's the one that's wrong.
*/
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 414651821fb3..27d44c0fd236 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -46,11 +46,11 @@ struct btrfs_fs_devices;
struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
u64 parent_transid);
-void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize);
-int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr);
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
int mirror_num, struct extent_buffer **eb);
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
- u64 bytenr, u32 blocksize);
+ u64 bytenr);
void clean_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf);
int open_ctree(struct super_block *sb,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a684086c3c81..571f402d3fc4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -74,8 +74,9 @@ enum {
RESERVE_ALLOC_NO_ACCOUNT = 2,
};
-static int update_block_group(struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int alloc);
+static int update_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -1925,7 +1926,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
*/
ret = 0;
}
- kfree(bbio);
+ btrfs_put_bbio(bbio);
}
if (actual_bytes)
@@ -2768,7 +2769,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head;
int ret;
int run_all = count == (unsigned long)-1;
- int run_most = 0;
/* We'll clean this up in btrfs_cleanup_transaction */
if (trans->aborted)
@@ -2778,10 +2778,8 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
root = root->fs_info->tree_root;
delayed_refs = &trans->transaction->delayed_refs;
- if (count == 0) {
+ if (count == 0)
count = atomic_read(&delayed_refs->num_entries) * 2;
- run_most = 1;
- }
again:
#ifdef SCRAMBLE_DELAYED_REFS
@@ -3315,120 +3313,42 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_block_group_cache *cache;
- int err = 0;
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ int ret = 0;
struct btrfs_path *path;
- u64 last = 0;
+
+ if (list_empty(&cur_trans->dirty_bgs))
+ return 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
-again:
- while (1) {
- cache = btrfs_lookup_first_block_group(root->fs_info, last);
- while (cache) {
- if (cache->disk_cache_state == BTRFS_DC_CLEAR)
- break;
- cache = next_block_group(root, cache);
- }
- if (!cache) {
- if (last == 0)
- break;
- last = 0;
- continue;
- }
- err = cache_save_setup(cache, trans, path);
- last = cache->key.objectid + cache->key.offset;
- btrfs_put_block_group(cache);
- }
-
- while (1) {
- if (last == 0) {
- err = btrfs_run_delayed_refs(trans, root,
- (unsigned long)-1);
- if (err) /* File system offline */
- goto out;
- }
-
- cache = btrfs_lookup_first_block_group(root->fs_info, last);
- while (cache) {
- if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
- btrfs_put_block_group(cache);
- goto again;
- }
-
- if (cache->dirty)
- break;
- cache = next_block_group(root, cache);
- }
- if (!cache) {
- if (last == 0)
- break;
- last = 0;
- continue;
- }
-
- if (cache->disk_cache_state == BTRFS_DC_SETUP)
- cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
- cache->dirty = 0;
- last = cache->key.objectid + cache->key.offset;
-
- err = write_one_cache_group(trans, root, path, cache);
- btrfs_put_block_group(cache);
- if (err) /* File system offline */
- goto out;
- }
-
- while (1) {
- /*
- * I don't think this is needed since we're just marking our
- * preallocated extent as written, but just in case it can't
- * hurt.
- */
- if (last == 0) {
- err = btrfs_run_delayed_refs(trans, root,
- (unsigned long)-1);
- if (err) /* File system offline */
- goto out;
- }
-
- cache = btrfs_lookup_first_block_group(root->fs_info, last);
- while (cache) {
- /*
- * Really this shouldn't happen, but it could if we
- * couldn't write the entire preallocated extent and
- * splitting the extent resulted in a new block.
- */
- if (cache->dirty) {
- btrfs_put_block_group(cache);
- goto again;
- }
- if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
- break;
- cache = next_block_group(root, cache);
- }
- if (!cache) {
- if (last == 0)
- break;
- last = 0;
- continue;
- }
-
- err = btrfs_write_out_cache(root, trans, cache, path);
-
- /*
- * If we didn't have an error then the cache state is still
- * NEED_WRITE, so we can set it to WRITTEN.
- */
- if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
- cache->disk_cache_state = BTRFS_DC_WRITTEN;
- last = cache->key.objectid + cache->key.offset;
+ /*
+ * We don't need the lock here since we are protected by the transaction
+ * commit. We want to do the cache_save_setup first and then run the
+ * delayed refs to make sure we have the best chance at doing this all
+ * in one shot.
+ */
+ while (!list_empty(&cur_trans->dirty_bgs)) {
+ cache = list_first_entry(&cur_trans->dirty_bgs,
+ struct btrfs_block_group_cache,
+ dirty_list);
+ list_del_init(&cache->dirty_list);
+ if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+ cache_save_setup(cache, trans, path);
+ if (!ret)
+ ret = btrfs_run_delayed_refs(trans, root,
+ (unsigned long) -1);
+ if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
+ btrfs_write_out_cache(root, trans, cache, path);
+ if (!ret)
+ ret = write_one_cache_group(trans, root, path, cache);
btrfs_put_block_group(cache);
}
-out:
btrfs_free_path(path);
- return err;
+ return ret;
}
int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -5043,19 +4963,25 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
/**
* drop_outstanding_extent - drop an outstanding extent
* @inode: the inode we're dropping the extent for
+ * @num_bytes: the number of bytes we're releasing.
*
* This is called when we are freeing up an outstanding extent, either called
* after an error or after an extent is written. This will return the number of
* reserved extents that need to be freed. This must be called with
* BTRFS_I(inode)->lock held.
*/
-static unsigned drop_outstanding_extent(struct inode *inode)
+static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
{
unsigned drop_inode_space = 0;
unsigned dropped_extents = 0;
+ unsigned num_extents = 0;
- BUG_ON(!BTRFS_I(inode)->outstanding_extents);
- BTRFS_I(inode)->outstanding_extents--;
+ num_extents = (unsigned)div64_u64(num_bytes +
+ BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ ASSERT(num_extents);
+ ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
+ BTRFS_I(inode)->outstanding_extents -= num_extents;
if (BTRFS_I(inode)->outstanding_extents == 0 &&
test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
@@ -5226,7 +5152,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
out_fail:
spin_lock(&BTRFS_I(inode)->lock);
- dropped = drop_outstanding_extent(inode);
+ dropped = drop_outstanding_extent(inode, num_bytes);
/*
* If the inodes csum_bytes is the same as the original
* csum_bytes then we know we haven't raced with any free()ers
@@ -5305,7 +5231,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
num_bytes = ALIGN(num_bytes, root->sectorsize);
spin_lock(&BTRFS_I(inode)->lock);
- dropped = drop_outstanding_extent(inode);
+ dropped = drop_outstanding_extent(inode, num_bytes);
if (num_bytes)
to_free = calc_csum_metadata_size(inode, num_bytes, 0);
@@ -5375,8 +5301,9 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
btrfs_free_reserved_data_space(inode, num_bytes);
}
-static int update_block_group(struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int alloc)
+static int update_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, int alloc)
{
struct btrfs_block_group_cache *cache = NULL;
struct btrfs_fs_info *info = root->fs_info;
@@ -5414,6 +5341,14 @@ static int update_block_group(struct btrfs_root *root,
if (!alloc && cache->cached == BTRFS_CACHE_NO)
cache_block_group(cache, 1);
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (list_empty(&cache->dirty_list)) {
+ list_add_tail(&cache->dirty_list,
+ &trans->transaction->dirty_bgs);
+ btrfs_get_block_group(cache);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
byte_in_group = bytenr - cache->key.objectid;
WARN_ON(byte_in_group > cache->key.offset);
@@ -5424,7 +5359,6 @@ static int update_block_group(struct btrfs_root *root,
cache->disk_cache_state < BTRFS_DC_CLEAR)
cache->disk_cache_state = BTRFS_DC_CLEAR;
- cache->dirty = 1;
old_val = btrfs_block_group_used(&cache->item);
num_bytes = min(total, cache->key.offset - byte_in_group);
if (alloc) {
@@ -5807,10 +5741,13 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
unpin = &fs_info->freed_extents[0];
while (1) {
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY, NULL);
- if (ret)
+ if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
+ }
if (btrfs_test_opt(root, DISCARD))
ret = btrfs_discard_extent(root, start,
@@ -5818,6 +5755,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
clear_extent_dirty(unpin, start, end, GFP_NOFS);
unpin_extent_range(root, start, end, true);
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
cond_resched();
}
@@ -6103,7 +6041,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
- ret = update_block_group(root, bytenr, num_bytes, 0);
+ ret = update_block_group(trans, root, bytenr, num_bytes, 0);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
@@ -6205,7 +6143,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf,
u64 parent, int last_ref)
{
- struct btrfs_block_group_cache *cache = NULL;
int pin = 1;
int ret;
@@ -6221,17 +6158,20 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
if (!last_ref)
return;
- cache = btrfs_lookup_block_group(root->fs_info, buf->start);
-
if (btrfs_header_generation(buf) == trans->transid) {
+ struct btrfs_block_group_cache *cache;
+
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
ret = check_ref_cleanup(trans, root, buf->start);
if (!ret)
goto out;
}
+ cache = btrfs_lookup_block_group(root->fs_info, buf->start);
+
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
pin_down_extent(root, cache, buf->start, buf->len, 1);
+ btrfs_put_block_group(cache);
goto out;
}
@@ -6239,6 +6179,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
btrfs_add_free_space(cache, buf->start, buf->len);
btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
+ btrfs_put_block_group(cache);
trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
pin = 0;
}
@@ -6253,7 +6194,6 @@ out:
* anymore.
*/
clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
- btrfs_put_block_group(cache);
}
/* Can return -ENOMEM */
@@ -7063,7 +7003,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = update_block_group(root, ins->objectid, ins->offset, 1);
+ ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
@@ -7152,7 +7092,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
return ret;
}
- ret = update_block_group(root, ins->objectid, root->nodesize, 1);
+ ret = update_block_group(trans, root, ins->objectid, root->nodesize,
+ 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
@@ -7217,11 +7158,11 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- u64 bytenr, u32 blocksize, int level)
+ u64 bytenr, int level)
{
struct extent_buffer *buf;
- buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ buf = btrfs_find_create_tree_block(root, bytenr);
if (!buf)
return ERR_PTR(-ENOMEM);
btrfs_set_header_generation(buf, trans->transid);
@@ -7340,7 +7281,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
if (btrfs_test_is_dummy_root(root)) {
buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
- blocksize, level);
+ level);
if (!IS_ERR(buf))
root->alloc_bytenr += blocksize;
return buf;
@@ -7357,8 +7298,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
return ERR_PTR(ret);
}
- buf = btrfs_init_new_buffer(trans, root, ins.objectid,
- blocksize, level);
+ buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
BUG_ON(IS_ERR(buf)); /* -ENOMEM */
if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
@@ -7487,7 +7427,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
continue;
}
reada:
- readahead_tree_block(root, bytenr, blocksize);
+ readahead_tree_block(root, bytenr);
nread++;
}
wc->reada_slot = slot;
@@ -7828,7 +7768,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
next = btrfs_find_tree_block(root, bytenr);
if (!next) {
- next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ next = btrfs_find_create_tree_block(root, bytenr);
if (!next)
return -ENOMEM;
btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
@@ -8548,14 +8488,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
if (IS_ERR(trans))
return PTR_ERR(trans);
- alloc_flags = update_block_group_flags(root, cache->flags);
- if (alloc_flags != cache->flags) {
- ret = do_chunk_alloc(trans, root, alloc_flags,
- CHUNK_ALLOC_FORCE);
- if (ret < 0)
- goto out;
- }
-
ret = set_block_group_ro(cache, 0);
if (!ret)
goto out;
@@ -8566,6 +8498,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
goto out;
ret = set_block_group_ro(cache, 0);
out:
+ if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
+ alloc_flags = update_block_group_flags(root, cache->flags);
+ check_system_chunk(trans, root, alloc_flags);
+ }
+
btrfs_end_transaction(trans, root);
return ret;
}
@@ -9005,6 +8942,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->bg_list);
INIT_LIST_HEAD(&cache->ro_list);
+ INIT_LIST_HEAD(&cache->dirty_list);
btrfs_init_free_space_ctl(cache);
atomic_set(&cache->trimming, 0);
@@ -9068,9 +9006,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
* b) Setting 'dirty flag' makes sure that we flush
* the new space cache info onto disk.
*/
- cache->disk_cache_state = BTRFS_DC_CLEAR;
if (btrfs_test_opt(root, SPACE_CACHE))
- cache->dirty = 1;
+ cache->disk_cache_state = BTRFS_DC_CLEAR;
}
read_extent_buffer(leaf, &cache->item,
@@ -9460,6 +9397,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
}
}
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (!list_empty(&block_group->dirty_list)) {
+ list_del_init(&block_group->dirty_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
btrfs_remove_free_space_cache(block_group);
spin_lock(&block_group->space_info->lock);
@@ -9611,7 +9555,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* Want to do this before we do anything else so we can recover
* properly if we fail to join the transaction.
*/
- trans = btrfs_join_transaction(root);
+ /* 1 for btrfs_orphan_reserve_metadata() */
+ trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
btrfs_set_block_group_rw(root, block_group);
ret = PTR_ERR(trans);
@@ -9624,18 +9569,33 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
*/
start = block_group->key.objectid;
end = start + block_group->key.offset - 1;
+ /*
+ * Hold the unused_bg_unpin_mutex lock to avoid racing with
+ * btrfs_finish_extent_commit(). If we are at transaction N,
+ * another task might be running finish_extent_commit() for the
+ * previous transaction N - 1, and have seen a range belonging
+ * to the block group in freed_extents[] before we were able to
+ * clear the whole block group range from freed_extents[]. This
+ * means that task can look up the block group after we
+ * unpinned it from freed_extents[] and removed it, leading to
+ * a BUG_ON() at btrfs_unpin_extent_range().
+ */
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
EXTENT_DIRTY, GFP_NOFS);
if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
btrfs_set_block_group_rw(root, block_group);
goto end_trans;
}
ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
EXTENT_DIRTY, GFP_NOFS);
if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
btrfs_set_block_group_rw(root, block_group);
goto end_trans;
}
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
/* Reset pinned so btrfs_put_block_group doesn't complain */
block_group->pinned = 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c73df6a7c9b6..c7233ff1d533 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -64,7 +64,7 @@ void btrfs_leak_debug_check(void)
while (!list_empty(&states)) {
state = list_entry(states.next, struct extent_state, leak_list);
- pr_err("BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n",
+ pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
state->start, state->end, state->state,
extent_state_in_tree(state),
atomic_read(&state->refs));
@@ -396,21 +396,21 @@ static void merge_state(struct extent_io_tree *tree,
}
static void set_state_cb(struct extent_io_tree *tree,
- struct extent_state *state, unsigned long *bits)
+ struct extent_state *state, unsigned *bits)
{
if (tree->ops && tree->ops->set_bit_hook)
tree->ops->set_bit_hook(tree->mapping->host, state, bits);
}
static void clear_state_cb(struct extent_io_tree *tree,
- struct extent_state *state, unsigned long *bits)
+ struct extent_state *state, unsigned *bits)
{
if (tree->ops && tree->ops->clear_bit_hook)
tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}
static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state, unsigned long *bits);
+ struct extent_state *state, unsigned *bits);
/*
* insert an extent_state struct into the tree. 'bits' are set on the
@@ -426,7 +426,7 @@ static int insert_state(struct extent_io_tree *tree,
struct extent_state *state, u64 start, u64 end,
struct rb_node ***p,
struct rb_node **parent,
- unsigned long *bits)
+ unsigned *bits)
{
struct rb_node *node;
@@ -511,10 +511,10 @@ static struct extent_state *next_state(struct extent_state *state)
*/
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned long *bits, int wake)
+ unsigned *bits, int wake)
{
struct extent_state *next;
- unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS;
+ unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
@@ -570,7 +570,7 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
* This takes the tree lock, and returns 0 on success and < 0 on error.
*/
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, int wake, int delete,
+ unsigned bits, int wake, int delete,
struct extent_state **cached_state,
gfp_t mask)
{
@@ -789,9 +789,9 @@ out:
static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned long *bits)
+ unsigned *bits)
{
- unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS;
+ unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
set_state_cb(tree, state, bits);
if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
@@ -803,7 +803,7 @@ static void set_state_bits(struct extent_io_tree *tree,
static void cache_state_if_flags(struct extent_state *state,
struct extent_state **cached_ptr,
- const u64 flags)
+ unsigned flags)
{
if (cached_ptr && !(*cached_ptr)) {
if (!flags || (state->state & flags)) {
@@ -833,7 +833,7 @@ static void cache_state(struct extent_state *state,
static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, unsigned long exclusive_bits,
+ unsigned bits, unsigned exclusive_bits,
u64 *failed_start, struct extent_state **cached_state,
gfp_t mask)
{
@@ -1034,7 +1034,7 @@ search_again:
}
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, u64 * failed_start,
+ unsigned bits, u64 * failed_start,
struct extent_state **cached_state, gfp_t mask)
{
return __set_extent_bit(tree, start, end, bits, 0, failed_start,
@@ -1060,7 +1060,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
* boundary bits like LOCK.
*/
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, unsigned long clear_bits,
+ unsigned bits, unsigned clear_bits,
struct extent_state **cached_state, gfp_t mask)
{
struct extent_state *state;
@@ -1268,14 +1268,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
}
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, gfp_t mask)
+ unsigned bits, gfp_t mask)
{
return set_extent_bit(tree, start, end, bits, NULL,
NULL, mask);
}
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, gfp_t mask)
+ unsigned bits, gfp_t mask)
{
return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
}
@@ -1330,10 +1330,11 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
* us if waiting is desired.
*/
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, struct extent_state **cached_state)
+ unsigned bits, struct extent_state **cached_state)
{
int err;
u64 failed_start;
+
while (1) {
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
EXTENT_LOCKED, &failed_start,
@@ -1440,7 +1441,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
*/
static struct extent_state *
find_first_extent_bit_state(struct extent_io_tree *tree,
- u64 start, unsigned long bits)
+ u64 start, unsigned bits)
{
struct rb_node *node;
struct extent_state *state;
@@ -1474,7 +1475,7 @@ out:
* If nothing was found, 1 is returned. If found something, return 0.
*/
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned long bits,
+ u64 *start_ret, u64 *end_ret, unsigned bits,
struct extent_state **cached_state)
{
struct extent_state *state;
@@ -1753,7 +1754,7 @@ out_failed:
int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
struct page *locked_page,
- unsigned long clear_bits,
+ unsigned clear_bits,
unsigned long page_ops)
{
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
@@ -1810,7 +1811,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
*/
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end, u64 max_bytes,
- unsigned long bits, int contig)
+ unsigned bits, int contig)
{
struct rb_node *node;
struct extent_state *state;
@@ -1928,7 +1929,7 @@ out:
* range is found set.
*/
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, int filled, struct extent_state *cached)
+ unsigned bits, int filled, struct extent_state *cached)
{
struct extent_state *state = NULL;
struct rb_node *node;
@@ -2057,7 +2058,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
sector = bbio->stripes[mirror_num-1].physical >> 9;
bio->bi_iter.bi_sector = sector;
dev = bbio->stripes[mirror_num-1].dev;
- kfree(bbio);
+ btrfs_put_bbio(bbio);
if (!dev || !dev->bdev || !dev->writeable) {
bio_put(bio);
return -EIO;
@@ -2816,8 +2817,10 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
bio_add_page(bio, page, page_size, offset) < page_size) {
ret = submit_one_bio(rw, bio, mirror_num,
prev_bio_flags);
- if (ret < 0)
+ if (ret < 0) {
+ *bio_ret = NULL;
return ret;
+ }
bio = NULL;
} else {
return 0;
@@ -3239,7 +3242,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
page,
&delalloc_start,
&delalloc_end,
- 128 * 1024 * 1024);
+ BTRFS_MAX_EXTENT_SIZE);
if (nr_delalloc == 0) {
delalloc_start = delalloc_end + 1;
continue;
@@ -4598,11 +4601,11 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
static struct extent_buffer *
__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
- unsigned long len, gfp_t mask)
+ unsigned long len)
{
struct extent_buffer *eb = NULL;
- eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+ eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS);
if (eb == NULL)
return NULL;
eb->start = start;
@@ -4643,7 +4646,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
struct extent_buffer *new;
unsigned long num_pages = num_extent_pages(src->start, src->len);
- new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS);
+ new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
if (new == NULL)
return NULL;
@@ -4666,13 +4669,26 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
return new;
}
-struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
{
struct extent_buffer *eb;
- unsigned long num_pages = num_extent_pages(0, len);
+ unsigned long len;
+ unsigned long num_pages;
unsigned long i;
- eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS);
+ if (!fs_info) {
+ /*
+ * Called only from tests that don't always have a fs_info
+ * available, but we know that nodesize is 4096
+ */
+ len = 4096;
+ } else {
+ len = fs_info->tree_root->nodesize;
+ }
+ num_pages = num_extent_pages(0, len);
+
+ eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return NULL;
@@ -4762,7 +4778,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len)
+ u64 start)
{
struct extent_buffer *eb, *exists = NULL;
int ret;
@@ -4770,7 +4786,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
eb = find_extent_buffer(fs_info, start);
if (eb)
return eb;
- eb = alloc_dummy_extent_buffer(start, len);
+ eb = alloc_dummy_extent_buffer(fs_info, start);
if (!eb)
return NULL;
eb->fs_info = fs_info;
@@ -4808,8 +4824,9 @@ free_eb:
#endif
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len)
+ u64 start)
{
+ unsigned long len = fs_info->tree_root->nodesize;
unsigned long num_pages = num_extent_pages(start, len);
unsigned long i;
unsigned long index = start >> PAGE_CACHE_SHIFT;
@@ -4824,7 +4841,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
if (eb)
return eb;
- eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS);
+ eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return NULL;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index ece9ce87edff..695b0ccfb755 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -4,22 +4,22 @@
#include <linux/rbtree.h>
/* bits for the extent state */
-#define EXTENT_DIRTY 1
-#define EXTENT_WRITEBACK (1 << 1)
-#define EXTENT_UPTODATE (1 << 2)
-#define EXTENT_LOCKED (1 << 3)
-#define EXTENT_NEW (1 << 4)
-#define EXTENT_DELALLOC (1 << 5)
-#define EXTENT_DEFRAG (1 << 6)
-#define EXTENT_BOUNDARY (1 << 9)
-#define EXTENT_NODATASUM (1 << 10)
-#define EXTENT_DO_ACCOUNTING (1 << 11)
-#define EXTENT_FIRST_DELALLOC (1 << 12)
-#define EXTENT_NEED_WAIT (1 << 13)
-#define EXTENT_DAMAGED (1 << 14)
-#define EXTENT_NORESERVE (1 << 15)
-#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
-#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
+#define EXTENT_DIRTY (1U << 0)
+#define EXTENT_WRITEBACK (1U << 1)
+#define EXTENT_UPTODATE (1U << 2)
+#define EXTENT_LOCKED (1U << 3)
+#define EXTENT_NEW (1U << 4)
+#define EXTENT_DELALLOC (1U << 5)
+#define EXTENT_DEFRAG (1U << 6)
+#define EXTENT_BOUNDARY (1U << 9)
+#define EXTENT_NODATASUM (1U << 10)
+#define EXTENT_DO_ACCOUNTING (1U << 11)
+#define EXTENT_FIRST_DELALLOC (1U << 12)
+#define EXTENT_NEED_WAIT (1U << 13)
+#define EXTENT_DAMAGED (1U << 14)
+#define EXTENT_NORESERVE (1U << 15)
+#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
/*
* flags for bio submission. The high bits indicate the compression
@@ -81,9 +81,9 @@ struct extent_io_ops {
int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate);
void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
- unsigned long *bits);
+ unsigned *bits);
void (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
- unsigned long *bits);
+ unsigned *bits);
void (*merge_extent_hook)(struct inode *inode,
struct extent_state *new,
struct extent_state *other);
@@ -108,7 +108,7 @@ struct extent_state {
/* ADD NEW ELEMENTS AFTER THIS */
wait_queue_head_t wq;
atomic_t refs;
- unsigned long state;
+ unsigned state;
/* for use by the FS */
u64 private;
@@ -188,7 +188,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
int try_release_extent_buffer(struct page *page);
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, struct extent_state **cached);
+ unsigned bits, struct extent_state **cached);
int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached, gfp_t mask);
@@ -202,21 +202,21 @@ void extent_io_exit(void);
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
- u64 max_bytes, unsigned long bits, int contig);
+ u64 max_bytes, unsigned bits, int contig);
void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, int filled,
+ unsigned bits, int filled,
struct extent_state *cached_state);
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, gfp_t mask);
+ unsigned bits, gfp_t mask);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, int wake, int delete,
+ unsigned bits, int wake, int delete,
struct extent_state **cached, gfp_t mask);
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, gfp_t mask);
+ unsigned bits, gfp_t mask);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, u64 *failed_start,
+ unsigned bits, u64 *failed_start,
struct extent_state **cached_state, gfp_t mask);
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
@@ -229,14 +229,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, unsigned long clear_bits,
+ unsigned bits, unsigned clear_bits,
struct extent_state **cached_state, gfp_t mask);
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned long bits,
+ u64 *start_ret, u64 *end_ret, unsigned bits,
struct extent_state **cached_state);
int extent_invalidatepage(struct extent_io_tree *tree,
struct page *page, unsigned long offset);
@@ -262,8 +262,9 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len);
-struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
+ u64 start);
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start);
struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
@@ -322,7 +323,7 @@ int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
struct page *locked_page,
- unsigned long bits_to_clear,
+ unsigned bits_to_clear,
unsigned long page_ops);
struct bio *
btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
@@ -377,5 +378,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode,
u64 *end, u64 max_bytes);
#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len);
+ u64 start);
#endif
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d6c03f7f136b..a71978578fa7 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -651,15 +651,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
struct io_ctl io_ctl;
struct btrfs_key key;
struct btrfs_free_space *e, *n;
- struct list_head bitmaps;
+ LIST_HEAD(bitmaps);
u64 num_entries;
u64 num_bitmaps;
u64 generation;
u8 type;
int ret = 0;
- INIT_LIST_HEAD(&bitmaps);
-
/* Nothing in the space cache, goodbye */
if (!i_size_read(inode))
return 0;
@@ -1243,6 +1241,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct inode *inode;
int ret = 0;
+ enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;
root = root->fs_info->tree_root;
@@ -1266,9 +1265,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
path, block_group->key.objectid);
if (ret) {
- spin_lock(&block_group->lock);
- block_group->disk_cache_state = BTRFS_DC_ERROR;
- spin_unlock(&block_group->lock);
+ dcs = BTRFS_DC_ERROR;
ret = 0;
#ifdef DEBUG
btrfs_err(root->fs_info,
@@ -1277,6 +1274,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
#endif
}
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = dcs;
+ spin_unlock(&block_group->lock);
iput(inode);
return ret;
}
@@ -2903,7 +2903,6 @@ int btrfs_find_space_cluster(struct btrfs_root *root,
trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
min_bytes);
- INIT_LIST_HEAD(&bitmaps);
ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
bytes + empty_size,
cont1_bytes, min_bytes);
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 8ffa4783cbf4..265e03c73f4d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -344,6 +344,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
return -ENOMEM;
path->leave_spinning = 1;
+ path->skip_release_on_error = 1;
ret = btrfs_insert_empty_item(trans, root, path, &key,
ins_len);
if (ret == -EEXIST) {
@@ -362,8 +363,12 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)(ref + 1);
ret = 0;
} else if (ret < 0) {
- if (ret == -EOVERFLOW)
- ret = -EMLINK;
+ if (ret == -EOVERFLOW) {
+ if (find_name_in_backref(path, name, name_len, &ref))
+ ret = -EEXIST;
+ else
+ ret = -EMLINK;
+ }
goto out;
} else {
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 54bcf639d1cf..a85c23dfcddb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1530,10 +1530,45 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
static void btrfs_split_extent_hook(struct inode *inode,
struct extent_state *orig, u64 split)
{
+ u64 size;
+
/* not delalloc, ignore it */
if (!(orig->state & EXTENT_DELALLOC))
return;
+ size = orig->end - orig->start + 1;
+ if (size > BTRFS_MAX_EXTENT_SIZE) {
+ u64 num_extents;
+ u64 new_size;
+
+ /*
+ * We need the largest size of the remaining extent to see if we
+ * need to add a new outstanding extent. Think of the following
+ * case
+ *
+ * [MAX_EXTENT_SIZEx2 - 4k][4k]
+ *
+ * The new_size would just be 4k and we'd think we had enough
+ * outstanding extents for this if we only took one side of the
+ * split, same goes for the other direction. We need to see if
+ * the larger size still is the same amount of extents as the
+ * original size, because if it is we need to add a new
+ * outstanding extent. But if we split up and the larger size
+ * is less than the original then we are good to go since we've
+ * already accounted for the extra extent in our original
+ * accounting.
+ */
+ new_size = orig->end - split + 1;
+ if ((split - orig->start) > new_size)
+ new_size = split - orig->start;
+
+ num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE) < num_extents)
+ return;
+ }
+
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
@@ -1549,10 +1584,34 @@ static void btrfs_merge_extent_hook(struct inode *inode,
struct extent_state *new,
struct extent_state *other)
{
+ u64 new_size, old_size;
+ u64 num_extents;
+
/* not delalloc, ignore it */
if (!(other->state & EXTENT_DELALLOC))
return;
+ old_size = other->end - other->start + 1;
+ new_size = old_size + (new->end - new->start + 1);
+
+ /* we're not bigger than the max, unreserve the space and go */
+ if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ return;
+ }
+
+ /*
+ * If we grew by another max_extent, just return, we want to keep that
+ * reserved amount.
+ */
+ num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE) > num_extents)
+ return;
+
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents--;
spin_unlock(&BTRFS_I(inode)->lock);
@@ -1604,7 +1663,7 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
* have pending delalloc work to be done.
*/
static void btrfs_set_bit_hook(struct inode *inode,
- struct extent_state *state, unsigned long *bits)
+ struct extent_state *state, unsigned *bits)
{
if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
@@ -1645,9 +1704,11 @@ static void btrfs_set_bit_hook(struct inode *inode,
*/
static void btrfs_clear_bit_hook(struct inode *inode,
struct extent_state *state,
- unsigned long *bits)
+ unsigned *bits)
{
u64 len = state->end + 1 - state->start;
+ u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1,
+ BTRFS_MAX_EXTENT_SIZE);
spin_lock(&BTRFS_I(inode)->lock);
if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
@@ -1667,7 +1728,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
*bits &= ~EXTENT_FIRST_DELALLOC;
} else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents--;
+ BTRFS_I(inode)->outstanding_extents -= num_extents;
spin_unlock(&BTRFS_I(inode)->lock);
}
@@ -2945,7 +3006,7 @@ static int __readpage_endio_check(struct inode *inode,
return 0;
zeroit:
if (__ratelimit(&_rs))
- btrfs_info(BTRFS_I(inode)->root->fs_info,
+ btrfs_warn(BTRFS_I(inode)->root->fs_info,
"csum failed ino %llu off %llu csum %u expected csum %u",
btrfs_ino(inode), start, csum, csum_expected);
memset(kaddr + pgoff, 1, len);
@@ -3407,7 +3468,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
out:
if (ret)
- btrfs_crit(root->fs_info,
+ btrfs_err(root->fs_info,
"could not do orphan cleanup %d", ret);
btrfs_free_path(path);
return ret;
@@ -3490,7 +3551,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_inode_item *inode_item;
- struct btrfs_timespec *tspec;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key location;
unsigned long ptr;
@@ -3527,17 +3587,19 @@ static void btrfs_read_locked_inode(struct inode *inode)
i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
- tspec = btrfs_inode_atime(inode_item);
- inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
- inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+ inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
+ inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
+
+ inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
+ inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
- tspec = btrfs_inode_mtime(inode_item);
- inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
- inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+ inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
+ inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
- tspec = btrfs_inode_ctime(inode_item);
- inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
- inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+ BTRFS_I(inode)->i_otime.tv_sec =
+ btrfs_timespec_sec(leaf, &inode_item->otime);
+ BTRFS_I(inode)->i_otime.tv_nsec =
+ btrfs_timespec_nsec(leaf, &inode_item->otime);
inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
@@ -3656,21 +3718,26 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+ btrfs_set_token_timespec_sec(leaf, &item->atime,
inode->i_atime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+ btrfs_set_token_timespec_nsec(leaf, &item->atime,
inode->i_atime.tv_nsec, &token);
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+ btrfs_set_token_timespec_sec(leaf, &item->mtime,
inode->i_mtime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+ btrfs_set_token_timespec_nsec(leaf, &item->mtime,
inode->i_mtime.tv_nsec, &token);
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+ btrfs_set_token_timespec_sec(leaf, &item->ctime,
inode->i_ctime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+ btrfs_set_token_timespec_nsec(leaf, &item->ctime,
inode->i_ctime.tv_nsec, &token);
+ btrfs_set_token_timespec_sec(leaf, &item->otime,
+ BTRFS_I(inode)->i_otime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, &item->otime,
+ BTRFS_I(inode)->i_otime.tv_nsec, &token);
+
btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
&token);
btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
@@ -5007,6 +5074,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
struct btrfs_root *new_root;
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
+ struct btrfs_key key;
int ret;
int err = 0;
@@ -5017,9 +5085,12 @@ static int fixup_tree_root_location(struct btrfs_root *root,
}
err = -ENOENT;
- ret = btrfs_find_item(root->fs_info->tree_root, path,
- BTRFS_I(dir)->root->root_key.objectid,
- location->objectid, BTRFS_ROOT_REF_KEY, NULL);
+ key.objectid = BTRFS_I(dir)->root->root_key.objectid;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = location->objectid;
+
+ ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path,
+ 0, 0);
if (ret) {
if (ret < 0)
err = ret;
@@ -5258,7 +5329,10 @@ static struct inode *new_simple_dir(struct super_block *s,
inode->i_op = &btrfs_dir_ro_inode_operations;
inode->i_fop = &simple_dir_operations;
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime;
+ inode->i_ctime = inode->i_mtime;
+ BTRFS_I(inode)->i_otime = inode->i_mtime;
return inode;
}
@@ -5826,7 +5900,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
inode_init_owner(inode, dir, mode);
inode_set_bytes(inode, 0);
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+ inode->i_mtime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime;
+ inode->i_ctime = inode->i_mtime;
+ BTRFS_I(inode)->i_otime = inode->i_mtime;
+
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
@@ -7134,11 +7213,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
u64 start = iblock << inode->i_blkbits;
u64 lockstart, lockend;
u64 len = bh_result->b_size;
+ u64 orig_len = len;
int unlock_bits = EXTENT_LOCKED;
int ret = 0;
if (create)
- unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
+ unlock_bits |= EXTENT_DIRTY;
else
len = min_t(u64, len, root->sectorsize);
@@ -7269,14 +7349,12 @@ unlock:
if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents++;
- spin_unlock(&BTRFS_I(inode)->lock);
-
- ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockstart + len - 1, EXTENT_DELALLOC, NULL,
- &cached_state, GFP_NOFS);
- BUG_ON(ret);
+ if (len < orig_len) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
+ btrfs_free_reserved_data_space(inode, len);
}
/*
@@ -7805,8 +7883,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
}
/* async crcs make it difficult to collect full stripe writes. */
- if (btrfs_get_alloc_profile(root, 1) &
- (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
+ if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK)
async_submit = 0;
else
async_submit = 1;
@@ -8053,8 +8130,6 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
else if (ret >= 0 && (size_t)ret < count)
btrfs_delalloc_release_space(inode,
count - (size_t)ret);
- else
- btrfs_delalloc_release_metadata(inode, 0);
}
out:
if (wakeup)
@@ -8575,6 +8650,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->delayed_node = NULL;
+ ei->i_otime.tv_sec = 0;
+ ei->i_otime.tv_nsec = 0;
+
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
extent_io_tree_init(&ei->io_tree, &inode->i_data);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 48b60dbf807f..97159a8e91d4 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1431,9 +1431,8 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
qgroup = u64_to_ptr(unode->aux);
qgroup->rfer += sign * oper->num_bytes;
qgroup->rfer_cmpr += sign * oper->num_bytes;
+ WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
qgroup->excl += sign * oper->num_bytes;
- if (sign < 0)
- WARN_ON(qgroup->excl < oper->num_bytes);
qgroup->excl_cmpr += sign * oper->num_bytes;
qgroup_dirty(fs_info, qgroup);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 8ab2a17bbba8..5264858ed768 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -58,15 +58,6 @@
*/
#define RBIO_CACHE_READY_BIT 3
-/*
- * bbio and raid_map is managed by the caller, so we shouldn't free
- * them here. And besides that, all rbios with this flag should not
- * be cached, because we need raid_map to check the rbios' stripe
- * is the same or not, but it is very likely that the caller has
- * free raid_map, so don't cache those rbios.
- */
-#define RBIO_HOLD_BBIO_MAP_BIT 4
-
#define RBIO_CACHE_SIZE 1024
enum btrfs_rbio_ops {
@@ -79,13 +70,6 @@ struct btrfs_raid_bio {
struct btrfs_fs_info *fs_info;
struct btrfs_bio *bbio;
- /*
- * logical block numbers for the start of each stripe
- * The last one or two are p/q. These are sorted,
- * so raid_map[0] is the start of our full stripe
- */
- u64 *raid_map;
-
/* while we're doing rmw on a stripe
* we put it into a hash table so we can
* lock the stripe and merge more rbios
@@ -303,7 +287,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
*/
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
- u64 num = rbio->raid_map[0];
+ u64 num = rbio->bbio->raid_map[0];
/*
* we shift down quite a bit. We're using byte
@@ -606,8 +590,8 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
test_bit(RBIO_CACHE_BIT, &cur->flags))
return 0;
- if (last->raid_map[0] !=
- cur->raid_map[0])
+ if (last->bbio->raid_map[0] !=
+ cur->bbio->raid_map[0])
return 0;
/* we can't merge with different operations */
@@ -689,7 +673,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
spin_lock_irqsave(&h->lock, flags);
list_for_each_entry(cur, &h->hash_list, hash_list) {
walk++;
- if (cur->raid_map[0] == rbio->raid_map[0]) {
+ if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
spin_lock(&cur->bio_list_lock);
/* can we steal this cached rbio's pages? */
@@ -841,21 +825,6 @@ done_nolock:
remove_rbio_from_cache(rbio);
}
-static inline void
-__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
-{
- if (need) {
- kfree(raid_map);
- kfree(bbio);
- }
-}
-
-static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
-{
- __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
- !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
-}
-
static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
int i;
@@ -875,8 +844,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
}
}
- free_bbio_and_raid_map(rbio);
-
+ btrfs_put_bbio(rbio->bbio);
kfree(rbio);
}
@@ -985,8 +953,7 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
* this does not allocate any pages for rbio->pages.
*/
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len)
+ struct btrfs_bio *bbio, u64 stripe_len)
{
struct btrfs_raid_bio *rbio;
int nr_data = 0;
@@ -1007,7 +974,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
INIT_LIST_HEAD(&rbio->stripe_cache);
INIT_LIST_HEAD(&rbio->hash_list);
rbio->bbio = bbio;
- rbio->raid_map = raid_map;
rbio->fs_info = root->fs_info;
rbio->stripe_len = stripe_len;
rbio->nr_pages = num_pages;
@@ -1028,10 +994,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
rbio->bio_pages = p + sizeof(struct page *) * num_pages;
rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
- if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
+ if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ nr_data = real_stripes - 1;
+ else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
nr_data = real_stripes - 2;
else
- nr_data = real_stripes - 1;
+ BUG();
rbio->nr_data = nr_data;
return rbio;
@@ -1182,7 +1150,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
spin_lock_irq(&rbio->bio_list_lock);
bio_list_for_each(bio, &rbio->bio_list) {
start = (u64)bio->bi_iter.bi_sector << 9;
- stripe_offset = start - rbio->raid_map[0];
+ stripe_offset = start - rbio->bbio->raid_map[0];
page_index = stripe_offset >> PAGE_CACHE_SHIFT;
for (i = 0; i < bio->bi_vcnt; i++) {
@@ -1402,7 +1370,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
logical <<= 9;
for (i = 0; i < rbio->nr_data; i++) {
- stripe_start = rbio->raid_map[i];
+ stripe_start = rbio->bbio->raid_map[i];
if (logical >= stripe_start &&
logical < stripe_start + rbio->stripe_len) {
return i;
@@ -1776,17 +1744,16 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
* our main entry point for writes from the rest of the FS.
*/
int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len)
+ struct btrfs_bio *bbio, u64 stripe_len)
{
struct btrfs_raid_bio *rbio;
struct btrfs_plug_cb *plug = NULL;
struct blk_plug_cb *cb;
int ret;
- rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+ rbio = alloc_rbio(root, bbio, stripe_len);
if (IS_ERR(rbio)) {
- __free_bbio_and_raid_map(bbio, raid_map, 1);
+ btrfs_put_bbio(bbio);
return PTR_ERR(rbio);
}
bio_list_add(&rbio->bio_list, bio);
@@ -1885,9 +1852,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
}
/* all raid6 handling here */
- if (rbio->raid_map[rbio->real_stripes - 1] ==
- RAID6_Q_STRIPE) {
-
+ if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
/*
* single failure, rebuild from parity raid5
* style
@@ -1922,8 +1887,9 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
* here due to a crc mismatch and we can't give them the
* data they want
*/
- if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
- if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
+ if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
+ if (rbio->bbio->raid_map[faila] ==
+ RAID5_P_STRIPE) {
err = -EIO;
goto cleanup;
}
@@ -1934,7 +1900,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
goto pstripe;
}
- if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
+ if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
raid6_datap_recov(rbio->real_stripes,
PAGE_SIZE, faila, pointers);
} else {
@@ -2001,8 +1967,7 @@ cleanup:
cleanup_io:
if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
- if (err == 0 &&
- !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
+ if (err == 0)
cache_rbio_pages(rbio);
else
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -2156,15 +2121,16 @@ cleanup:
* of the drive.
*/
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len, int mirror_num, int generic_io)
+ struct btrfs_bio *bbio, u64 stripe_len,
+ int mirror_num, int generic_io)
{
struct btrfs_raid_bio *rbio;
int ret;
- rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+ rbio = alloc_rbio(root, bbio, stripe_len);
if (IS_ERR(rbio)) {
- __free_bbio_and_raid_map(bbio, raid_map, generic_io);
+ if (generic_io)
+ btrfs_put_bbio(bbio);
return PTR_ERR(rbio);
}
@@ -2175,7 +2141,8 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
BUG();
- __free_bbio_and_raid_map(bbio, raid_map, generic_io);
+ if (generic_io)
+ btrfs_put_bbio(bbio);
kfree(rbio);
return -EIO;
}
@@ -2184,7 +2151,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
btrfs_bio_counter_inc_noblocked(root->fs_info);
rbio->generic_bio_cnt = 1;
} else {
- set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
+ btrfs_get_bbio(bbio);
}
/*
@@ -2240,14 +2207,14 @@ static void read_rebuild_work(struct btrfs_work *work)
struct btrfs_raid_bio *
raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len, struct btrfs_device *scrub_dev,
+ struct btrfs_bio *bbio, u64 stripe_len,
+ struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors)
{
struct btrfs_raid_bio *rbio;
int i;
- rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+ rbio = alloc_rbio(root, bbio, stripe_len);
if (IS_ERR(rbio))
return NULL;
bio_list_add(&rbio->bio_list, bio);
@@ -2279,10 +2246,10 @@ void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
int stripe_offset;
int index;
- ASSERT(logical >= rbio->raid_map[0]);
- ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] +
+ ASSERT(logical >= rbio->bbio->raid_map[0]);
+ ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
rbio->stripe_len * rbio->nr_data);
- stripe_offset = (int)(logical - rbio->raid_map[0]);
+ stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
index = stripe_offset >> PAGE_CACHE_SHIFT;
rbio->bio_pages[index] = page;
}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 31d4a157b5e3..2b5d7977d83b 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -43,16 +43,15 @@ struct btrfs_raid_bio;
struct btrfs_device;
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len, int mirror_num, int generic_io);
+ struct btrfs_bio *bbio, u64 stripe_len,
+ int mirror_num, int generic_io);
int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len);
+ struct btrfs_bio *bbio, u64 stripe_len);
struct btrfs_raid_bio *
raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len, struct btrfs_device *scrub_dev,
+ struct btrfs_bio *bbio, u64 stripe_len,
+ struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
struct page *page, u64 logical);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index b63ae20618fb..0e7beea92b4c 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -66,7 +66,6 @@ struct reada_extctl {
struct reada_extent {
u64 logical;
struct btrfs_key top;
- u32 blocksize;
int err;
struct list_head extctl;
int refcnt;
@@ -349,7 +348,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
blocksize = root->nodesize;
re->logical = logical;
- re->blocksize = blocksize;
re->top = *top;
INIT_LIST_HEAD(&re->extctl);
spin_lock_init(&re->lock);
@@ -463,7 +461,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
spin_unlock(&fs_info->reada_lock);
btrfs_dev_replace_unlock(&fs_info->dev_replace);
- kfree(bbio);
+ btrfs_put_bbio(bbio);
return re;
error:
@@ -488,7 +486,7 @@ error:
kref_put(&zone->refcnt, reada_zone_release);
spin_unlock(&fs_info->reada_lock);
}
- kfree(bbio);
+ btrfs_put_bbio(bbio);
kfree(re);
return re_exist;
}
@@ -660,7 +658,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
int mirror_num = 0;
struct extent_buffer *eb = NULL;
u64 logical;
- u32 blocksize;
int ret;
int i;
int need_kick = 0;
@@ -694,7 +691,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
spin_unlock(&fs_info->reada_lock);
return 0;
}
- dev->reada_next = re->logical + re->blocksize;
+ dev->reada_next = re->logical + fs_info->tree_root->nodesize;
re->refcnt++;
spin_unlock(&fs_info->reada_lock);
@@ -709,7 +706,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
}
}
logical = re->logical;
- blocksize = re->blocksize;
spin_lock(&re->lock);
if (re->scheduled_for == NULL) {
@@ -724,8 +720,8 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
return 0;
atomic_inc(&dev->reada_in_flight);
- ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
- mirror_num, &eb);
+ ret = reada_tree_block_flagged(fs_info->extent_root, logical,
+ mirror_num, &eb);
if (ret)
__readahead_hook(fs_info->extent_root, NULL, logical, ret);
else if (eb)
@@ -851,7 +847,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
break;
printk(KERN_DEBUG
" re: logical %llu size %u empty %d for %lld",
- re->logical, re->blocksize,
+ re->logical, fs_info->tree_root->nodesize,
list_empty(&re->extctl), re->scheduled_for ?
re->scheduled_for->devid : -1);
@@ -886,7 +882,8 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
}
printk(KERN_DEBUG
"re: logical %llu size %u list empty %d for %lld",
- re->logical, re->blocksize, list_empty(&re->extctl),
+ re->logical, fs_info->tree_root->nodesize,
+ list_empty(&re->extctl),
re->scheduled_for ? re->scheduled_for->devid : -1);
for (i = 0; i < re->nzones; ++i) {
printk(KERN_CONT " zone %llu-%llu devs",
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 74257d6436ad..d83085381bcc 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2855,9 +2855,10 @@ static void update_processed_blocks(struct reloc_control *rc,
}
}
-static int tree_block_processed(u64 bytenr, u32 blocksize,
- struct reloc_control *rc)
+static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
{
+ u32 blocksize = rc->extent_root->nodesize;
+
if (test_range_bit(&rc->processed_blocks, bytenr,
bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
return 1;
@@ -2965,8 +2966,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
while (rb_node) {
block = rb_entry(rb_node, struct tree_block, rb_node);
if (!block->key_ready)
- readahead_tree_block(rc->extent_root, block->bytenr,
- block->key.objectid);
+ readahead_tree_block(rc->extent_root, block->bytenr);
rb_node = rb_next(rb_node);
}
@@ -3353,7 +3353,7 @@ static int __add_tree_block(struct reloc_control *rc,
bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
SKINNY_METADATA);
- if (tree_block_processed(bytenr, blocksize, rc))
+ if (tree_block_processed(bytenr, rc))
return 0;
if (tree_search(blocks, bytenr))
@@ -3611,7 +3611,7 @@ static int find_data_references(struct reloc_control *rc,
if (added)
goto next;
- if (!tree_block_processed(leaf->start, leaf->len, rc)) {
+ if (!tree_block_processed(leaf->start, rc)) {
block = kmalloc(sizeof(*block), GFP_NOFS);
if (!block) {
err = -ENOMEM;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index e427cb7ee12c..ec57687c9a4d 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -66,7 +66,6 @@ struct scrub_ctx;
struct scrub_recover {
atomic_t refs;
struct btrfs_bio *bbio;
- u64 *raid_map;
u64 map_length;
};
@@ -80,7 +79,7 @@ struct scrub_page {
u64 logical;
u64 physical;
u64 physical_for_dev_replace;
- atomic_t ref_count;
+ atomic_t refs;
struct {
unsigned int mirror_num:8;
unsigned int have_csum:1;
@@ -113,7 +112,7 @@ struct scrub_block {
struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
int page_count;
atomic_t outstanding_pages;
- atomic_t ref_count; /* free mem on transition to zero */
+ atomic_t refs; /* free mem on transition to zero */
struct scrub_ctx *sctx;
struct scrub_parity *sparity;
struct {
@@ -142,7 +141,7 @@ struct scrub_parity {
int stripe_len;
- atomic_t ref_count;
+ atomic_t refs;
struct list_head spages;
@@ -194,6 +193,15 @@ struct scrub_ctx {
*/
struct btrfs_scrub_progress stat;
spinlock_t stat_lock;
+
+ /*
+ * Use a ref counter to avoid use-after-free issues. Scrub workers
+ * decrement bios_in_flight and workers_pending and then do a wakeup
+ * on the list_wait wait queue. We must ensure the main scrub task
+ * doesn't free the scrub context before or while the workers are
+ * doing the wakeup() call.
+ */
+ atomic_t refs;
};
struct scrub_fixup_nodatasum {
@@ -236,10 +244,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
-static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
- struct btrfs_fs_info *fs_info,
- struct scrub_block *original_sblock,
- u64 length, u64 logical,
+static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock, int is_metadata,
@@ -251,8 +256,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
const u8 *csum, u64 generation,
u16 csum_size);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
- struct scrub_block *sblock_good,
- int force_write);
+ struct scrub_block *sblock_good);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good,
int page_num, int force_write);
@@ -302,10 +306,12 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
static void copy_nocow_pages_worker(struct btrfs_work *work);
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
+static void scrub_put_ctx(struct scrub_ctx *sctx);
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
+ atomic_inc(&sctx->refs);
atomic_inc(&sctx->bios_in_flight);
}
@@ -313,6 +319,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
{
atomic_dec(&sctx->bios_in_flight);
wake_up(&sctx->list_wait);
+ scrub_put_ctx(sctx);
}
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
@@ -346,6 +353,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
{
struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ atomic_inc(&sctx->refs);
/*
* increment scrubs_running to prevent cancel requests from
* completing as long as a worker is running. we must also
@@ -388,6 +396,7 @@ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
atomic_dec(&sctx->workers_pending);
wake_up(&fs_info->scrub_pause_wait);
wake_up(&sctx->list_wait);
+ scrub_put_ctx(sctx);
}
static void scrub_free_csums(struct scrub_ctx *sctx)
@@ -433,6 +442,12 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
kfree(sctx);
}
+static void scrub_put_ctx(struct scrub_ctx *sctx)
+{
+ if (atomic_dec_and_test(&sctx->refs))
+ scrub_free_ctx(sctx);
+}
+
static noinline_for_stack
struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
{
@@ -457,6 +472,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
if (!sctx)
goto nomem;
+ atomic_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
sctx->pages_per_rd_bio = pages_per_rd_bio;
sctx->curr = -1;
@@ -520,6 +536,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
struct inode_fs_paths *ipath = NULL;
struct btrfs_root *local_root;
struct btrfs_key root_key;
+ struct btrfs_key key;
root_key.objectid = root;
root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -530,7 +547,14 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
goto err;
}
- ret = inode_item_info(inum, 0, local_root, swarn->path);
+ /*
+ * this makes the path point to (inum INODE_ITEM ioff)
+ */
+ key.objectid = inum;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
if (ret) {
btrfs_release_path(swarn->path);
goto err;
@@ -848,8 +872,7 @@ static inline void scrub_get_recover(struct scrub_recover *recover)
static inline void scrub_put_recover(struct scrub_recover *recover)
{
if (atomic_dec_and_test(&recover->refs)) {
- kfree(recover->bbio);
- kfree(recover->raid_map);
+ btrfs_put_bbio(recover->bbio);
kfree(recover);
}
}
@@ -955,8 +978,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
}
/* setup the context, map the logical blocks and alloc the pages */
- ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
- logical, sblocks_for_recheck);
+ ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
if (ret) {
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
@@ -1030,9 +1052,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
if (!is_metadata && !have_csum) {
struct scrub_fixup_nodatasum *fixup_nodatasum;
-nodatasum_case:
WARN_ON(sctx->is_dev_replace);
+nodatasum_case:
+
/*
* !is_metadata and !have_csum, this means that the data
* might not be COW'ed, that it might be modified
@@ -1091,76 +1114,20 @@ nodatasum_case:
sblock_other->no_io_error_seen) {
if (sctx->is_dev_replace) {
scrub_write_block_to_dev_replace(sblock_other);
+ goto corrected_error;
} else {
- int force_write = is_metadata || have_csum;
-
ret = scrub_repair_block_from_good_copy(
- sblock_bad, sblock_other,
- force_write);
+ sblock_bad, sblock_other);
+ if (!ret)
+ goto corrected_error;
}
- if (0 == ret)
- goto corrected_error;
}
}
- /*
- * for dev_replace, pick good pages and write to the target device.
- */
- if (sctx->is_dev_replace) {
- success = 1;
- for (page_num = 0; page_num < sblock_bad->page_count;
- page_num++) {
- int sub_success;
-
- sub_success = 0;
- for (mirror_index = 0;
- mirror_index < BTRFS_MAX_MIRRORS &&
- sblocks_for_recheck[mirror_index].page_count > 0;
- mirror_index++) {
- struct scrub_block *sblock_other =
- sblocks_for_recheck + mirror_index;
- struct scrub_page *page_other =
- sblock_other->pagev[page_num];
-
- if (!page_other->io_error) {
- ret = scrub_write_page_to_dev_replace(
- sblock_other, page_num);
- if (ret == 0) {
- /* succeeded for this page */
- sub_success = 1;
- break;
- } else {
- btrfs_dev_replace_stats_inc(
- &sctx->dev_root->
- fs_info->dev_replace.
- num_write_errors);
- }
- }
- }
-
- if (!sub_success) {
- /*
- * did not find a mirror to fetch the page
- * from. scrub_write_page_to_dev_replace()
- * handles this case (page->io_error), by
- * filling the block with zeros before
- * submitting the write request
- */
- success = 0;
- ret = scrub_write_page_to_dev_replace(
- sblock_bad, page_num);
- if (ret)
- btrfs_dev_replace_stats_inc(
- &sctx->dev_root->fs_info->
- dev_replace.num_write_errors);
- }
- }
-
- goto out;
- }
+ if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
+ goto did_not_correct_error;
/*
- * for regular scrub, repair those pages that are errored.
* In case of I/O errors in the area that is supposed to be
* repaired, continue by picking good copies of those pages.
* Select the good pages from mirrors to rewrite bad pages from
@@ -1184,44 +1151,64 @@ nodatasum_case:
* mirror, even if other 512 byte sectors in the same PAGE_SIZE
* area are unreadable.
*/
-
- /* can only fix I/O errors from here on */
- if (sblock_bad->no_io_error_seen)
- goto did_not_correct_error;
-
success = 1;
- for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+ for (page_num = 0; page_num < sblock_bad->page_count;
+ page_num++) {
struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+ struct scrub_block *sblock_other = NULL;
- if (!page_bad->io_error)
+ /* skip no-io-error page in scrub */
+ if (!page_bad->io_error && !sctx->is_dev_replace)
continue;
- for (mirror_index = 0;
- mirror_index < BTRFS_MAX_MIRRORS &&
- sblocks_for_recheck[mirror_index].page_count > 0;
- mirror_index++) {
- struct scrub_block *sblock_other = sblocks_for_recheck +
- mirror_index;
- struct scrub_page *page_other = sblock_other->pagev[
- page_num];
-
- if (!page_other->io_error) {
- ret = scrub_repair_page_from_good_copy(
- sblock_bad, sblock_other, page_num, 0);
- if (0 == ret) {
- page_bad->io_error = 0;
- break; /* succeeded for this page */
+ /* try to find no-io-error page in mirrors */
+ if (page_bad->io_error) {
+ for (mirror_index = 0;
+ mirror_index < BTRFS_MAX_MIRRORS &&
+ sblocks_for_recheck[mirror_index].page_count > 0;
+ mirror_index++) {
+ if (!sblocks_for_recheck[mirror_index].
+ pagev[page_num]->io_error) {
+ sblock_other = sblocks_for_recheck +
+ mirror_index;
+ break;
}
}
+ if (!sblock_other)
+ success = 0;
}
- if (page_bad->io_error) {
- /* did not find a mirror to copy the page from */
- success = 0;
+ if (sctx->is_dev_replace) {
+ /*
+ * did not find a mirror to fetch the page
+ * from. scrub_write_page_to_dev_replace()
+ * handles this case (page->io_error), by
+ * filling the block with zeros before
+ * submitting the write request
+ */
+ if (!sblock_other)
+ sblock_other = sblock_bad;
+
+ if (scrub_write_page_to_dev_replace(sblock_other,
+ page_num) != 0) {
+ btrfs_dev_replace_stats_inc(
+ &sctx->dev_root->
+ fs_info->dev_replace.
+ num_write_errors);
+ success = 0;
+ }
+ } else if (sblock_other) {
+ ret = scrub_repair_page_from_good_copy(sblock_bad,
+ sblock_other,
+ page_num, 0);
+ if (0 == ret)
+ page_bad->io_error = 0;
+ else
+ success = 0;
}
}
- if (success) {
+ if (success && !sctx->is_dev_replace) {
if (is_metadata || have_csum) {
/*
* need to verify the checksum now that all
@@ -1288,19 +1275,18 @@ out:
return 0;
}
-static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
+static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
{
- if (raid_map) {
- if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
- return 3;
- else
- return 2;
- } else {
+ if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ return 2;
+ else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+ return 3;
+ else
return (int)bbio->num_stripes;
- }
}
-static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
+static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
+ u64 *raid_map,
u64 mapped_length,
int nstripes, int mirror,
int *stripe_index,
@@ -1308,7 +1294,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
{
int i;
- if (raid_map) {
+ if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
/* RAID5/6 */
for (i = 0; i < nstripes; i++) {
if (raid_map[i] == RAID6_Q_STRIPE ||
@@ -1329,72 +1315,65 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
}
}
-static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
- struct btrfs_fs_info *fs_info,
- struct scrub_block *original_sblock,
- u64 length, u64 logical,
+static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck)
{
+ struct scrub_ctx *sctx = original_sblock->sctx;
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ u64 length = original_sblock->page_count * PAGE_SIZE;
+ u64 logical = original_sblock->pagev[0]->logical;
struct scrub_recover *recover;
struct btrfs_bio *bbio;
- u64 *raid_map;
u64 sublen;
u64 mapped_length;
u64 stripe_offset;
int stripe_index;
- int page_index;
+ int page_index = 0;
int mirror_index;
int nmirrors;
int ret;
/*
- * note: the two members ref_count and outstanding_pages
+ * note: the two members refs and outstanding_pages
* are not used (and not set) in the blocks that are used for
* the recheck procedure
*/
- page_index = 0;
while (length > 0) {
sublen = min_t(u64, length, PAGE_SIZE);
mapped_length = sublen;
bbio = NULL;
- raid_map = NULL;
/*
* with a length of PAGE_SIZE, each returned stripe
* represents one mirror
*/
ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
- &mapped_length, &bbio, 0, &raid_map);
+ &mapped_length, &bbio, 0, 1);
if (ret || !bbio || mapped_length < sublen) {
- kfree(bbio);
- kfree(raid_map);
+ btrfs_put_bbio(bbio);
return -EIO;
}
recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
if (!recover) {
- kfree(bbio);
- kfree(raid_map);
+ btrfs_put_bbio(bbio);
return -ENOMEM;
}
atomic_set(&recover->refs, 1);
recover->bbio = bbio;
- recover->raid_map = raid_map;
recover->map_length = mapped_length;
BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
- nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
+ nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
+
for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) {
struct scrub_block *sblock;
struct scrub_page *page;
- if (mirror_index >= BTRFS_MAX_MIRRORS)
- continue;
-
sblock = sblocks_for_recheck + mirror_index;
sblock->sctx = sctx;
page = kzalloc(sizeof(*page), GFP_NOFS);
@@ -1410,9 +1389,12 @@ leave_nomem:
sblock->pagev[page_index] = page;
page->logical = logical;
- scrub_stripe_index_and_offset(logical, raid_map,
+ scrub_stripe_index_and_offset(logical,
+ bbio->map_type,
+ bbio->raid_map,
mapped_length,
- bbio->num_stripes,
+ bbio->num_stripes -
+ bbio->num_tgtdevs,
mirror_index,
&stripe_index,
&stripe_offset);
@@ -1458,7 +1440,8 @@ static void scrub_bio_wait_endio(struct bio *bio, int error)
static inline int scrub_is_page_on_raid56(struct scrub_page *page)
{
- return page->recover && page->recover->raid_map;
+ return page->recover &&
+ (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}
static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
@@ -1475,7 +1458,6 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
bio->bi_end_io = scrub_bio_wait_endio;
ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
- page->recover->raid_map,
page->recover->map_length,
page->mirror_num, 0);
if (ret)
@@ -1615,8 +1597,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
}
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
- struct scrub_block *sblock_good,
- int force_write)
+ struct scrub_block *sblock_good)
{
int page_num;
int ret = 0;
@@ -1626,8 +1607,7 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
sblock_good,
- page_num,
- force_write);
+ page_num, 1);
if (ret_sub)
ret = ret_sub;
}
@@ -2067,12 +2047,12 @@ static int scrub_checksum_super(struct scrub_block *sblock)
static void scrub_block_get(struct scrub_block *sblock)
{
- atomic_inc(&sblock->ref_count);
+ atomic_inc(&sblock->refs);
}
static void scrub_block_put(struct scrub_block *sblock)
{
- if (atomic_dec_and_test(&sblock->ref_count)) {
+ if (atomic_dec_and_test(&sblock->refs)) {
int i;
if (sblock->sparity)
@@ -2086,12 +2066,12 @@ static void scrub_block_put(struct scrub_block *sblock)
static void scrub_page_get(struct scrub_page *spage)
{
- atomic_inc(&spage->ref_count);
+ atomic_inc(&spage->refs);
}
static void scrub_page_put(struct scrub_page *spage)
{
- if (atomic_dec_and_test(&spage->ref_count)) {
+ if (atomic_dec_and_test(&spage->refs)) {
if (spage->page)
__free_page(spage->page);
kfree(spage);
@@ -2217,7 +2197,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
/* one ref inside this function, plus one for each page added to
* a bio later on */
- atomic_set(&sblock->ref_count, 1);
+ atomic_set(&sblock->refs, 1);
sblock->sctx = sctx;
sblock->no_io_error_seen = 1;
@@ -2510,7 +2490,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
/* one ref inside this function, plus one for each page added to
* a bio later on */
- atomic_set(&sblock->ref_count, 1);
+ atomic_set(&sblock->refs, 1);
sblock->sctx = sctx;
sblock->no_io_error_seen = 1;
sblock->sparity = sparity;
@@ -2705,7 +2685,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
struct btrfs_raid_bio *rbio;
struct scrub_page *spage;
struct btrfs_bio *bbio = NULL;
- u64 *raid_map = NULL;
u64 length;
int ret;
@@ -2716,8 +2695,8 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
length = sparity->logic_end - sparity->logic_start + 1;
ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
sparity->logic_start,
- &length, &bbio, 0, &raid_map);
- if (ret || !bbio || !raid_map)
+ &length, &bbio, 0, 1);
+ if (ret || !bbio || !bbio->raid_map)
goto bbio_out;
bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
@@ -2729,8 +2708,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
bio->bi_end_io = scrub_parity_bio_endio;
rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
- raid_map, length,
- sparity->scrub_dev,
+ length, sparity->scrub_dev,
sparity->dbitmap,
sparity->nsectors);
if (!rbio)
@@ -2747,8 +2725,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
rbio_out:
bio_put(bio);
bbio_out:
- kfree(bbio);
- kfree(raid_map);
+ btrfs_put_bbio(bbio);
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
sparity->nsectors);
spin_lock(&sctx->stat_lock);
@@ -2765,12 +2742,12 @@ static inline int scrub_calc_parity_bitmap_len(int nsectors)
static void scrub_parity_get(struct scrub_parity *sparity)
{
- atomic_inc(&sparity->ref_count);
+ atomic_inc(&sparity->refs);
}
static void scrub_parity_put(struct scrub_parity *sparity)
{
- if (!atomic_dec_and_test(&sparity->ref_count))
+ if (!atomic_dec_and_test(&sparity->refs))
return;
scrub_parity_check_and_repair(sparity);
@@ -2820,7 +2797,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
sparity->scrub_dev = sdev;
sparity->logic_start = logic_start;
sparity->logic_end = logic_end;
- atomic_set(&sparity->ref_count, 1);
+ atomic_set(&sparity->refs, 1);
INIT_LIST_HEAD(&sparity->spages);
sparity->dbitmap = sparity->bitmap;
sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
@@ -3037,8 +3014,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
increment = map->stripe_len;
mirror_num = num % map->num_stripes + 1;
- } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
get_raid56_logic_offset(physical, num, map, &offset, NULL);
increment = map->stripe_len * nr_data_stripes(map);
mirror_num = 1;
@@ -3074,8 +3050,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
*/
logical = base + offset;
physical_end = physical + nstripes * map->stripe_len;
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
get_raid56_logic_offset(physical_end, num,
map, &logic_end, NULL);
logic_end += base;
@@ -3121,8 +3096,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
ret = 0;
while (physical < physical_end) {
/* for raid56, we skip parity stripe */
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = get_raid56_logic_offset(physical, num,
map, &logical, &stripe_logical);
logical += base;
@@ -3280,8 +3254,7 @@ again:
scrub_free_csums(sctx);
if (extent_logical + extent_len <
key.objectid + bytes) {
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
/*
* loop until we find next data stripe
* or we have finished all stripes.
@@ -3775,7 +3748,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
scrub_workers_put(fs_info);
mutex_unlock(&fs_info->scrub_lock);
- scrub_free_ctx(sctx);
+ scrub_put_ctx(sctx);
return ret;
}
@@ -3881,14 +3854,14 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
&mapped_length, &bbio, 0);
if (ret || !bbio || mapped_length < extent_len ||
!bbio->stripes[0].dev->bdev) {
- kfree(bbio);
+ btrfs_put_bbio(bbio);
return;
}
*extent_physical = bbio->stripes[0].physical;
*extent_mirror_num = bbio->mirror_num;
*extent_dev = bbio->stripes[0].dev;
- kfree(bbio);
+ btrfs_put_bbio(bbio);
}
static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 804432dbc351..fe5857223515 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2471,12 +2471,9 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
if (ret < 0)
goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
- TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb,
- btrfs_inode_atime(ii));
- TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb,
- btrfs_inode_mtime(ii));
- TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
- btrfs_inode_ctime(ii));
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
/* TODO Add otime support when the otime patches get into upstream */
ret = send_cmd(sctx);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6f49b2872a64..05fef198ff94 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1958,11 +1958,6 @@ static int btrfs_freeze(struct super_block *sb)
return btrfs_commit_transaction(trans, root);
}
-static int btrfs_unfreeze(struct super_block *sb)
-{
- return 0;
-}
-
static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
{
struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
@@ -2011,7 +2006,6 @@ static const struct super_operations btrfs_super_ops = {
.statfs = btrfs_statfs,
.remount_fs = btrfs_remount,
.freeze_fs = btrfs_freeze,
- .unfreeze_fs = btrfs_unfreeze,
};
static const struct file_operations btrfs_ctl_fops = {
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 92db3f648df4..94edb0a2a026 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -733,10 +733,18 @@ int btrfs_init_sysfs(void)
ret = btrfs_init_debugfs();
if (ret)
- return ret;
+ goto out1;
init_feature_attrs();
ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
+ if (ret)
+ goto out2;
+
+ return 0;
+out2:
+ debugfs_remove_recursive(btrfs_debugfs_root_dentry);
+out1:
+ kset_unregister(btrfs_kset);
return ret;
}
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index cc286ce97d1e..f51963a8f929 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -53,7 +53,7 @@ static int test_btrfs_split_item(void)
return -ENOMEM;
}
- path->nodes[0] = eb = alloc_dummy_extent_buffer(0, 4096);
+ path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 4096);
if (!eb) {
test_msg("Could not allocate dummy buffer\n");
ret = -ENOMEM;
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 7e99c2f98dd0..9e9f2368177d 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -258,8 +258,7 @@ static int test_find_delalloc(void)
}
ret = 0;
out_bits:
- clear_extent_bits(&tmp, 0, total_dirty - 1,
- (unsigned long)-1, GFP_NOFS);
+ clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS);
out:
if (locked_page)
page_cache_release(locked_page);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 3ae0f5b8bb80..a116b55ce788 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -255,7 +255,7 @@ static noinline int test_btrfs_get_extent(void)
goto out;
}
- root->node = alloc_dummy_extent_buffer(0, 4096);
+ root->node = alloc_dummy_extent_buffer(NULL, 4096);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
goto out;
@@ -843,7 +843,7 @@ static int test_hole_first(void)
goto out;
}
- root->node = alloc_dummy_extent_buffer(0, 4096);
+ root->node = alloc_dummy_extent_buffer(NULL, 4096);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
goto out;
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index ec3dcb202357..73f299ebdabb 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -404,12 +404,22 @@ int btrfs_test_qgroups(void)
ret = -ENOMEM;
goto out;
}
+ /* We are using this root as our extent root */
+ root->fs_info->extent_root = root;
+
+ /*
+ * Some of the paths we test assume we have a filled out fs_info, so we
+ * just need to add the root in there so we don't panic.
+ */
+ root->fs_info->tree_root = root;
+ root->fs_info->quota_root = root;
+ root->fs_info->quota_enabled = 1;
/*
* Can't use bytenr 0, some things freak out
* *cough*backref walking code*cough*
*/
- root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096);
+ root->node = alloc_test_extent_buffer(root->fs_info, 4096);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
ret = -ENOMEM;
@@ -448,17 +458,6 @@ int btrfs_test_qgroups(void)
goto out;
}
- /* We are using this root as our extent root */
- root->fs_info->extent_root = root;
-
- /*
- * Some of the paths we test assume we have a filled out fs_info, so we
- * just need to addt he root in there so we don't panic.
- */
- root->fs_info->tree_root = root;
- root->fs_info->quota_root = root;
- root->fs_info->quota_enabled = 1;
-
test_msg("Running qgroup tests\n");
ret = test_no_shared_qgroup(root);
if (ret)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e88b59d13439..7e80f32550a6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -220,6 +220,7 @@ loop:
* commit the transaction.
*/
atomic_set(&cur_trans->use_count, 2);
+ cur_trans->have_free_bgs = 0;
cur_trans->start_time = get_seconds();
cur_trans->delayed_refs.href_root = RB_ROOT;
@@ -248,6 +249,8 @@ loop:
INIT_LIST_HEAD(&cur_trans->pending_chunks);
INIT_LIST_HEAD(&cur_trans->switch_commits);
INIT_LIST_HEAD(&cur_trans->pending_ordered);
+ INIT_LIST_HEAD(&cur_trans->dirty_bgs);
+ spin_lock_init(&cur_trans->dirty_bgs_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
fs_info->btree_inode->i_mapping);
@@ -1020,6 +1023,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
u64 old_root_bytenr;
u64 old_root_used;
struct btrfs_root *tree_root = root->fs_info->tree_root;
+ bool extent_root = (root->objectid == BTRFS_EXTENT_TREE_OBJECTID);
old_root_used = btrfs_root_used(&root->root_item);
btrfs_write_dirty_block_groups(trans, root);
@@ -1027,7 +1031,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
while (1) {
old_root_bytenr = btrfs_root_bytenr(&root->root_item);
if (old_root_bytenr == root->node->start &&
- old_root_used == btrfs_root_used(&root->root_item))
+ old_root_used == btrfs_root_used(&root->root_item) &&
+ (!extent_root ||
+ list_empty(&trans->transaction->dirty_bgs)))
break;
btrfs_set_root_node(&root->root_item, root->node);
@@ -1038,7 +1044,15 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
return ret;
old_root_used = btrfs_root_used(&root->root_item);
- ret = btrfs_write_dirty_block_groups(trans, root);
+ if (extent_root) {
+ ret = btrfs_write_dirty_block_groups(trans, root);
+ if (ret)
+ return ret;
+ }
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret)
+ return ret;
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
if (ret)
return ret;
}
@@ -1061,10 +1075,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
struct extent_buffer *eb;
int ret;
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- if (ret)
- return ret;
-
eb = btrfs_lock_root_node(fs_info->tree_root);
ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
0, &eb);
@@ -1097,6 +1107,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
next = fs_info->dirty_cowonly_roots.next;
list_del_init(next);
root = list_entry(next, struct btrfs_root, dirty_list);
+ clear_bit(BTRFS_ROOT_DIRTY, &root->state);
if (root != fs_info->extent_root)
list_add_tail(&root->dirty_list,
@@ -1983,6 +1994,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
switch_commit_roots(cur_trans, root->fs_info);
assert_qgroups_uptodate(trans);
+ ASSERT(list_empty(&cur_trans->dirty_bgs));
update_super_roots(root);
btrfs_set_super_log_root(root->fs_info->super_copy, 0);
@@ -2026,6 +2038,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_finish_extent_commit(trans, root);
+ if (cur_trans->have_free_bgs)
+ btrfs_clear_space_info_full(root->fs_info);
+
root->fs_info->last_trans_committed = cur_trans->transid;
/*
* We needn't acquire the lock here because there is no other task
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 00ed29c4b3f9..937050a2b68e 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -47,6 +47,11 @@ struct btrfs_transaction {
atomic_t num_writers;
atomic_t use_count;
+ /*
+ * true if there is free bgs operations in this transaction
+ */
+ int have_free_bgs;
+
/* Be protected by fs_info->trans_lock when we want to change it. */
enum btrfs_trans_state state;
struct list_head list;
@@ -58,6 +63,8 @@ struct btrfs_transaction {
struct list_head pending_chunks;
struct list_head pending_ordered;
struct list_head switch_commits;
+ struct list_head dirty_bgs;
+ spinlock_t dirty_bgs_lock;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
};
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1a9585d4380a..9a37f8b39bae 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -453,11 +453,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
insert:
btrfs_release_path(path);
/* try to insert the key into the destination tree */
+ path->skip_release_on_error = 1;
ret = btrfs_insert_empty_item(trans, root, path,
key, item_size);
+ path->skip_release_on_error = 0;
/* make sure any existing item is the correct size */
- if (ret == -EEXIST) {
+ if (ret == -EEXIST || ret == -EOVERFLOW) {
u32 found_size;
found_size = btrfs_item_size_nr(path->nodes[0],
path->slots[0]);
@@ -488,8 +490,20 @@ insert:
src_item = (struct btrfs_inode_item *)src_ptr;
dst_item = (struct btrfs_inode_item *)dst_ptr;
- if (btrfs_inode_generation(eb, src_item) == 0)
+ if (btrfs_inode_generation(eb, src_item) == 0) {
+ struct extent_buffer *dst_eb = path->nodes[0];
+
+ if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
+ S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
+ struct btrfs_map_token token;
+ u64 ino_size = btrfs_inode_size(eb, src_item);
+
+ btrfs_init_map_token(&token);
+ btrfs_set_token_inode_size(dst_eb, dst_item,
+ ino_size, &token);
+ }
goto no_copy;
+ }
if (overwrite_root &&
S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
@@ -844,7 +858,7 @@ out:
static noinline int backref_in_log(struct btrfs_root *log,
struct btrfs_key *key,
u64 ref_objectid,
- char *name, int namelen)
+ const char *name, int namelen)
{
struct btrfs_path *path;
struct btrfs_inode_ref *ref;
@@ -1254,13 +1268,14 @@ out:
}
static int insert_orphan_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 offset)
+ struct btrfs_root *root, u64 ino)
{
int ret;
- ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID,
- offset, BTRFS_ORPHAN_ITEM_KEY, NULL);
- if (ret > 0)
- ret = btrfs_insert_orphan_item(trans, root, offset);
+
+ ret = btrfs_insert_orphan_item(trans, root, ino);
+ if (ret == -EEXIST)
+ ret = 0;
+
return ret;
}
@@ -1287,6 +1302,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ cur_offset = 0;
while (cur_offset < item_size) {
extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
@@ -1302,7 +1318,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
}
btrfs_release_path(path);
- if (ret < 0)
+ if (ret < 0 && ret != -ENOENT)
return ret;
return nlink;
}
@@ -1394,9 +1410,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
nlink = ret;
ret = count_inode_extrefs(root, inode, path);
- if (ret == -ENOENT)
- ret = 0;
-
if (ret < 0)
goto out;
@@ -1557,6 +1570,30 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
}
/*
+ * Return true if an inode reference exists in the log for the given name,
+ * inode and parent inode.
+ */
+static bool name_in_log_ref(struct btrfs_root *log_root,
+ const char *name, const int name_len,
+ const u64 dirid, const u64 ino)
+{
+ struct btrfs_key search_key;
+
+ search_key.objectid = ino;
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = dirid;
+ if (backref_in_log(log_root, &search_key, dirid, name, name_len))
+ return true;
+
+ search_key.type = BTRFS_INODE_EXTREF_KEY;
+ search_key.offset = btrfs_extref_hash(dirid, name, name_len);
+ if (backref_in_log(log_root, &search_key, dirid, name, name_len))
+ return true;
+
+ return false;
+}
+
+/*
* take a single entry in a log directory item and replay it into
* the subvolume.
*
@@ -1666,10 +1703,17 @@ out:
return ret;
insert:
+ if (name_in_log_ref(root->log_root, name, name_len,
+ key->objectid, log_key.objectid)) {
+ /* The dentry will be added later. */
+ ret = 0;
+ update_size = false;
+ goto out;
+ }
btrfs_release_path(path);
ret = insert_one_name(trans, root, path, key->objectid, key->offset,
name, name_len, log_type, &log_key);
- if (ret && ret != -ENOENT)
+ if (ret && ret != -ENOENT && ret != -EEXIST)
goto out;
update_size = false;
ret = 0;
@@ -2164,7 +2208,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
parent = path->nodes[*level];
root_owner = btrfs_header_owner(parent);
- next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ next = btrfs_find_create_tree_block(root, bytenr);
if (!next)
return -ENOMEM;
@@ -2416,8 +2460,8 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
mutex_unlock(&root->log_mutex);
if (atomic_read(&root->log_writers))
schedule();
- mutex_lock(&root->log_mutex);
finish_wait(&root->log_writer_wait, &wait);
+ mutex_lock(&root->log_mutex);
}
}
@@ -3219,7 +3263,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
static void fill_inode_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf,
struct btrfs_inode_item *item,
- struct inode *inode, int log_inode_only)
+ struct inode *inode, int log_inode_only,
+ u64 logged_isize)
{
struct btrfs_map_token token;
@@ -3232,7 +3277,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
* to say 'update this inode with these values'
*/
btrfs_set_token_inode_generation(leaf, item, 0, &token);
- btrfs_set_token_inode_size(leaf, item, 0, &token);
+ btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
} else {
btrfs_set_token_inode_generation(leaf, item,
BTRFS_I(inode)->generation,
@@ -3245,19 +3290,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+ btrfs_set_token_timespec_sec(leaf, &item->atime,
inode->i_atime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+ btrfs_set_token_timespec_nsec(leaf, &item->atime,
inode->i_atime.tv_nsec, &token);
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+ btrfs_set_token_timespec_sec(leaf, &item->mtime,
inode->i_mtime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+ btrfs_set_token_timespec_nsec(leaf, &item->mtime,
inode->i_mtime.tv_nsec, &token);
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+ btrfs_set_token_timespec_sec(leaf, &item->ctime,
inode->i_ctime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+ btrfs_set_token_timespec_nsec(leaf, &item->ctime,
inode->i_ctime.tv_nsec, &token);
btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
@@ -3284,7 +3329,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
return ret;
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
- fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
+ fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0);
btrfs_release_path(path);
return 0;
}
@@ -3293,7 +3338,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_path *dst_path,
struct btrfs_path *src_path, u64 *last_extent,
- int start_slot, int nr, int inode_only)
+ int start_slot, int nr, int inode_only,
+ u64 logged_isize)
{
unsigned long src_offset;
unsigned long dst_offset;
@@ -3350,7 +3396,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
dst_path->slots[0],
struct btrfs_inode_item);
fill_inode_item(trans, dst_path->nodes[0], inode_item,
- inode, inode_only == LOG_INODE_EXISTS);
+ inode, inode_only == LOG_INODE_EXISTS,
+ logged_isize);
} else {
copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
src_offset, ins_sizes[i]);
@@ -3902,6 +3949,33 @@ process:
return ret;
}
+static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
+ struct btrfs_path *path, u64 *size_ret)
+{
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ *size_ret = i_size_read(inode);
+ } else {
+ struct btrfs_inode_item *item;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ *size_ret = btrfs_inode_size(path->nodes[0], item);
+ }
+
+ btrfs_release_path(path);
+ return 0;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -3939,6 +4013,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
bool fast_search = false;
u64 ino = btrfs_ino(inode);
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ u64 logged_isize = 0;
path = btrfs_alloc_path();
if (!path)
@@ -3966,15 +4041,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
max_key.type = (u8)-1;
max_key.offset = (u64)-1;
- /* Only run delayed items if we are a dir or a new file */
+ /*
+ * Only run delayed items if we are a dir or a new file.
+ * Otherwise commit the delayed inode only, which is needed in
+ * order for the log replay code to mark inodes for link count
+ * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
+ */
if (S_ISDIR(inode->i_mode) ||
- BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
+ BTRFS_I(inode)->generation > root->fs_info->last_trans_committed)
ret = btrfs_commit_inode_delayed_items(trans, inode);
- if (ret) {
- btrfs_free_path(path);
- btrfs_free_path(dst_path);
- return ret;
- }
+ else
+ ret = btrfs_commit_inode_delayed_inode(inode);
+
+ if (ret) {
+ btrfs_free_path(path);
+ btrfs_free_path(dst_path);
+ return ret;
}
mutex_lock(&BTRFS_I(inode)->log_mutex);
@@ -3988,22 +4070,56 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (S_ISDIR(inode->i_mode)) {
int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
- if (inode_only == LOG_INODE_EXISTS)
- max_key_type = BTRFS_XATTR_ITEM_KEY;
+ if (inode_only == LOG_INODE_EXISTS) {
+ max_key_type = BTRFS_INODE_EXTREF_KEY;
+ max_key.type = max_key_type;
+ }
ret = drop_objectid_items(trans, log, path, ino, max_key_type);
} else {
- if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags)) {
- clear_bit(BTRFS_INODE_COPY_EVERYTHING,
- &BTRFS_I(inode)->runtime_flags);
- ret = btrfs_truncate_inode_items(trans, log,
- inode, 0, 0);
- } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
- &BTRFS_I(inode)->runtime_flags) ||
+ if (inode_only == LOG_INODE_EXISTS) {
+ /*
+ * Make sure the new inode item we write to the log has
+ * the same isize as the current one (if it exists).
+ * This is necessary to prevent data loss after log
+ * replay, and also to prevent doing a wrong expanding
+ * truncate - for e.g. create file, write 4K into offset
+ * 0, fsync, write 4K into offset 4096, add hard link,
+ * fsync some other file (to sync log), power fail - if
+ * we use the inode's current i_size, after log replay
+ * we get a 8Kb file, with the last 4Kb extent as a hole
+ * (zeroes), as if an expanding truncate happened,
+ * instead of getting a file of 4Kb only.
+ */
+ err = logged_inode_size(log, inode, path,
+ &logged_isize);
+ if (err)
+ goto out_unlock;
+ }
+ if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags)) {
+ if (inode_only == LOG_INODE_EXISTS) {
+ max_key.type = BTRFS_INODE_EXTREF_KEY;
+ ret = drop_objectid_items(trans, log, path, ino,
+ max_key.type);
+ } else {
+ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &BTRFS_I(inode)->runtime_flags);
+ ret = btrfs_truncate_inode_items(trans, log,
+ inode, 0, 0);
+ }
+ } else if (test_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &BTRFS_I(inode)->runtime_flags) ||
inode_only == LOG_INODE_EXISTS) {
- if (inode_only == LOG_INODE_ALL)
+ if (inode_only == LOG_INODE_ALL) {
+ clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &BTRFS_I(inode)->runtime_flags);
fast_search = true;
- max_key.type = BTRFS_XATTR_ITEM_KEY;
+ max_key.type = BTRFS_XATTR_ITEM_KEY;
+ } else {
+ max_key.type = BTRFS_INODE_EXTREF_KEY;
+ }
ret = drop_objectid_items(trans, log, path, ino,
max_key.type);
} else {
@@ -4047,7 +4163,8 @@ again:
}
ret = copy_items(trans, inode, dst_path, path, &last_extent,
- ins_start_slot, ins_nr, inode_only);
+ ins_start_slot, ins_nr, inode_only,
+ logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
@@ -4071,7 +4188,7 @@ next_slot:
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path,
&last_extent, ins_start_slot,
- ins_nr, inode_only);
+ ins_nr, inode_only, logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
@@ -4092,7 +4209,8 @@ next_slot:
}
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path, &last_extent,
- ins_start_slot, ins_nr, inode_only);
+ ins_start_slot, ins_nr, inode_only,
+ logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
@@ -4273,6 +4391,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct dentry *old_parent = NULL;
int ret = 0;
u64 last_committed = root->fs_info->last_trans_committed;
+ const struct dentry * const first_parent = parent;
+ const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
+ last_committed);
sb = inode->i_sb;
@@ -4328,7 +4449,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
}
- inode_only = LOG_INODE_EXISTS;
while (1) {
if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
break;
@@ -4337,8 +4457,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (root != BTRFS_I(inode)->root)
break;
+ /*
+ * On unlink we must make sure our immediate parent directory
+ * inode is fully logged. This is to prevent leaving dangling
+ * directory index entries and a wrong directory inode's i_size.
+ * Not doing so can result in a directory being impossible to
+ * delete after log replay (rmdir will always fail with error
+ * -ENOTEMPTY).
+ */
+ if (did_unlink && parent == first_parent)
+ inode_only = LOG_INODE_ALL;
+ else
+ inode_only = LOG_INODE_EXISTS;
+
if (BTRFS_I(inode)->generation >
- root->fs_info->last_trans_committed) {
+ root->fs_info->last_trans_committed ||
+ inode_only == LOG_INODE_ALL) {
ret = btrfs_log_inode(trans, root, inode, inode_only,
0, LLONG_MAX, ctx);
if (ret)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 50c5a8762aed..cd4d1315aaa9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1310,6 +1310,8 @@ again:
if (ret) {
btrfs_error(root->fs_info, ret,
"Failed to remove dev extent item");
+ } else {
+ trans->transaction->have_free_bgs = 1;
}
out:
btrfs_free_path(path);
@@ -4196,7 +4198,7 @@ static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
- if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
+ if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
return;
btrfs_set_fs_incompat(info, RAID56);
@@ -4803,10 +4805,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
BUG_ON(em->start > logical || em->start + em->len < logical);
map = (struct map_lookup *)em->bdev;
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
len = map->stripe_len * nr_data_stripes(map);
- }
free_extent_map(em);
return len;
}
@@ -4826,8 +4826,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
BUG_ON(em->start > logical || em->start + em->len < logical);
map = (struct map_lookup *)em->bdev;
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6))
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
ret = 1;
free_extent_map(em);
return ret;
@@ -4876,32 +4875,24 @@ static inline int parity_smaller(u64 a, u64 b)
}
/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
-static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
+static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
{
struct btrfs_bio_stripe s;
- int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
int i;
u64 l;
int again = 1;
- int m;
while (again) {
again = 0;
- for (i = 0; i < real_stripes - 1; i++) {
- if (parity_smaller(raid_map[i], raid_map[i+1])) {
+ for (i = 0; i < num_stripes - 1; i++) {
+ if (parity_smaller(bbio->raid_map[i],
+ bbio->raid_map[i+1])) {
s = bbio->stripes[i];
- l = raid_map[i];
+ l = bbio->raid_map[i];
bbio->stripes[i] = bbio->stripes[i+1];
- raid_map[i] = raid_map[i+1];
+ bbio->raid_map[i] = bbio->raid_map[i+1];
bbio->stripes[i+1] = s;
- raid_map[i+1] = l;
-
- if (bbio->tgtdev_map) {
- m = bbio->tgtdev_map[i];
- bbio->tgtdev_map[i] =
- bbio->tgtdev_map[i + 1];
- bbio->tgtdev_map[i + 1] = m;
- }
+ bbio->raid_map[i+1] = l;
again = 1;
}
@@ -4909,10 +4900,41 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
}
}
+static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
+{
+ struct btrfs_bio *bbio = kzalloc(
+ sizeof(struct btrfs_bio) +
+ sizeof(struct btrfs_bio_stripe) * (total_stripes) +
+ sizeof(int) * (real_stripes) +
+ sizeof(u64) * (real_stripes),
+ GFP_NOFS);
+ if (!bbio)
+ return NULL;
+
+ atomic_set(&bbio->error, 0);
+ atomic_set(&bbio->refs, 1);
+
+ return bbio;
+}
+
+void btrfs_get_bbio(struct btrfs_bio *bbio)
+{
+ WARN_ON(!atomic_read(&bbio->refs));
+ atomic_inc(&bbio->refs);
+}
+
+void btrfs_put_bbio(struct btrfs_bio *bbio)
+{
+ if (!bbio)
+ return;
+ if (atomic_dec_and_test(&bbio->refs))
+ kfree(bbio);
+}
+
static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret,
- int mirror_num, u64 **raid_map_ret)
+ int mirror_num, int need_raid_map)
{
struct extent_map *em;
struct map_lookup *map;
@@ -4925,7 +4947,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 stripe_nr_orig;
u64 stripe_nr_end;
u64 stripe_len;
- u64 *raid_map = NULL;
int stripe_index;
int i;
int ret = 0;
@@ -4976,7 +4997,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
stripe_offset = offset - stripe_offset;
/* if we're here for raid56, we need to know the stripe aligned start */
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
raid56_full_stripe_start = offset;
@@ -4989,8 +5010,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
if (rw & REQ_DISCARD) {
/* we don't discard raid56 yet */
- if (map->type &
- (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = -EOPNOTSUPP;
goto out;
}
@@ -5000,7 +5020,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
/* For writes to RAID[56], allow a full stripeset across all disks.
For other RAID types and for RAID[56] reads, just allow a single
stripe (on a single disk). */
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
+ if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
(rw & REQ_WRITE)) {
max_len = stripe_len * nr_data_stripes(map) -
(offset - raid56_full_stripe_start);
@@ -5047,7 +5067,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 physical_of_found = 0;
ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
- logical, &tmp_length, &tmp_bbio, 0, NULL);
+ logical, &tmp_length, &tmp_bbio, 0, 0);
if (ret) {
WARN_ON(tmp_bbio != NULL);
goto out;
@@ -5061,7 +5081,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
* is not left of the left cursor
*/
ret = -EIO;
- kfree(tmp_bbio);
+ btrfs_put_bbio(tmp_bbio);
goto out;
}
@@ -5096,11 +5116,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
} else {
WARN_ON(1);
ret = -EIO;
- kfree(tmp_bbio);
+ btrfs_put_bbio(tmp_bbio);
goto out;
}
- kfree(tmp_bbio);
+ btrfs_put_bbio(tmp_bbio);
} else if (mirror_num > map->num_stripes) {
mirror_num = 0;
}
@@ -5166,15 +5186,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
mirror_num = stripe_index - old_stripe_index + 1;
}
- } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
- u64 tmp;
-
- if (raid_map_ret &&
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ if (need_raid_map &&
((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
mirror_num > 1)) {
- int i, rot;
-
/* push stripe_nr back to the start of the full stripe */
stripe_nr = raid56_full_stripe_start;
do_div(stripe_nr, stripe_len * nr_data_stripes(map));
@@ -5183,32 +5198,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
num_stripes = map->num_stripes;
max_errors = nr_parity_stripes(map);
- raid_map = kmalloc_array(num_stripes, sizeof(u64),
- GFP_NOFS);
- if (!raid_map) {
- ret = -ENOMEM;
- goto out;
- }
-
- /* Work out the disk rotation on this stripe-set */
- tmp = stripe_nr;
- rot = do_div(tmp, num_stripes);
-
- /* Fill in the logical address of each stripe */
- tmp = stripe_nr * nr_data_stripes(map);
- for (i = 0; i < nr_data_stripes(map); i++)
- raid_map[(i+rot) % num_stripes] =
- em->start + (tmp + i) * map->stripe_len;
-
- raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
- if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- raid_map[(i+rot+1) % num_stripes] =
- RAID6_Q_STRIPE;
-
*length = map->stripe_len;
stripe_index = 0;
stripe_offset = 0;
} else {
+ u64 tmp;
+
/*
* Mirror #0 or #1 means the original data block.
* Mirror #2 is RAID5 parity block.
@@ -5246,17 +5241,42 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
tgtdev_indexes = num_stripes;
}
- bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes),
- GFP_NOFS);
+ bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
if (!bbio) {
- kfree(raid_map);
ret = -ENOMEM;
goto out;
}
- atomic_set(&bbio->error, 0);
if (dev_replace_is_ongoing)
bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
+ /* build raid_map */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
+ need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
+ mirror_num > 1)) {
+ u64 tmp;
+ int i, rot;
+
+ bbio->raid_map = (u64 *)((void *)bbio->stripes +
+ sizeof(struct btrfs_bio_stripe) *
+ num_alloc_stripes +
+ sizeof(int) * tgtdev_indexes);
+
+ /* Work out the disk rotation on this stripe-set */
+ tmp = stripe_nr;
+ rot = do_div(tmp, num_stripes);
+
+ /* Fill in the logical address of each stripe */
+ tmp = stripe_nr * nr_data_stripes(map);
+ for (i = 0; i < nr_data_stripes(map); i++)
+ bbio->raid_map[(i+rot) % num_stripes] =
+ em->start + (tmp + i) * map->stripe_len;
+
+ bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ bbio->raid_map[(i+rot+1) % num_stripes] =
+ RAID6_Q_STRIPE;
+ }
+
if (rw & REQ_DISCARD) {
int factor = 0;
int sub_stripes = 0;
@@ -5340,6 +5360,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
max_errors = btrfs_chunk_max_errors(map);
+ if (bbio->raid_map)
+ sort_parity_stripes(bbio, num_stripes);
+
tgtdev_indexes = 0;
if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
dev_replace->tgtdev != NULL) {
@@ -5427,6 +5450,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
}
*bbio_ret = bbio;
+ bbio->map_type = map->type;
bbio->num_stripes = num_stripes;
bbio->max_errors = max_errors;
bbio->mirror_num = mirror_num;
@@ -5443,10 +5467,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
bbio->mirror_num = map->num_stripes + 1;
}
- if (raid_map) {
- sort_parity_stripes(bbio, raid_map);
- *raid_map_ret = raid_map;
- }
out:
if (dev_replace_is_ongoing)
btrfs_dev_replace_unlock(dev_replace);
@@ -5459,17 +5479,17 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
struct btrfs_bio **bbio_ret, int mirror_num)
{
return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
- mirror_num, NULL);
+ mirror_num, 0);
}
/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num,
- u64 **raid_map_ret)
+ int need_raid_map)
{
return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
- mirror_num, raid_map_ret);
+ mirror_num, need_raid_map);
}
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -5511,8 +5531,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
do_div(length, map->num_stripes / map->sub_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
do_div(length, map->num_stripes);
- else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
do_div(length, nr_data_stripes(map));
rmap_len = map->stripe_len * nr_data_stripes(map);
}
@@ -5565,7 +5584,7 @@ static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int e
bio_endio_nodec(bio, err);
else
bio_endio(bio, err);
- kfree(bbio);
+ btrfs_put_bbio(bbio);
}
static void btrfs_end_bio(struct bio *bio, int err)
@@ -5808,7 +5827,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
u64 logical = (u64)bio->bi_iter.bi_sector << 9;
u64 length = 0;
u64 map_length;
- u64 *raid_map = NULL;
int ret;
int dev_nr = 0;
int total_devs = 1;
@@ -5819,7 +5837,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
btrfs_bio_counter_inc_blocked(root->fs_info);
ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
- mirror_num, &raid_map);
+ mirror_num, 1);
if (ret) {
btrfs_bio_counter_dec(root->fs_info);
return ret;
@@ -5832,15 +5850,13 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
bbio->fs_info = root->fs_info;
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
- if (raid_map) {
+ if (bbio->raid_map) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
if (rw & WRITE) {
- ret = raid56_parity_write(root, bio, bbio,
- raid_map, map_length);
+ ret = raid56_parity_write(root, bio, bbio, map_length);
} else {
- ret = raid56_parity_recover(root, bio, bbio,
- raid_map, map_length,
+ ret = raid56_parity_recover(root, bio, bbio, map_length,
mirror_num, 1);
}
@@ -6238,17 +6254,22 @@ int btrfs_read_sys_array(struct btrfs_root *root)
struct extent_buffer *sb;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;
- u8 *ptr;
- unsigned long sb_ptr;
+ u8 *array_ptr;
+ unsigned long sb_array_offset;
int ret = 0;
u32 num_stripes;
u32 array_size;
u32 len = 0;
- u32 cur;
+ u32 cur_offset;
struct btrfs_key key;
- sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
- BTRFS_SUPER_INFO_SIZE);
+ ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize);
+ /*
+ * This will create extent buffer of nodesize, superblock size is
+ * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
+ * overallocate but we can keep it as-is, only the first page is used.
+ */
+ sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
if (!sb)
return -ENOMEM;
btrfs_set_buffer_uptodate(sb);
@@ -6271,35 +6292,56 @@ int btrfs_read_sys_array(struct btrfs_root *root)
write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
array_size = btrfs_super_sys_array_size(super_copy);
- ptr = super_copy->sys_chunk_array;
- sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
- cur = 0;
+ array_ptr = super_copy->sys_chunk_array;
+ sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
+ cur_offset = 0;
+
+ while (cur_offset < array_size) {
+ disk_key = (struct btrfs_disk_key *)array_ptr;
+ len = sizeof(*disk_key);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
- while (cur < array_size) {
- disk_key = (struct btrfs_disk_key *)ptr;
btrfs_disk_key_to_cpu(&key, disk_key);
- len = sizeof(*disk_key); ptr += len;
- sb_ptr += len;
- cur += len;
+ array_ptr += len;
+ sb_array_offset += len;
+ cur_offset += len;
if (key.type == BTRFS_CHUNK_ITEM_KEY) {
- chunk = (struct btrfs_chunk *)sb_ptr;
+ chunk = (struct btrfs_chunk *)sb_array_offset;
+ /*
+ * At least one btrfs_chunk with one stripe must be
+ * present, exact stripe count check comes afterwards
+ */
+ len = btrfs_chunk_item_size(1);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
+
+ num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+ len = btrfs_chunk_item_size(num_stripes);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
+
ret = read_one_chunk(root, &key, sb, chunk);
if (ret)
break;
- num_stripes = btrfs_chunk_num_stripes(sb, chunk);
- len = btrfs_chunk_item_size(num_stripes);
} else {
ret = -EIO;
break;
}
- ptr += len;
- sb_ptr += len;
- cur += len;
+ array_ptr += len;
+ sb_array_offset += len;
+ cur_offset += len;
}
free_extent_buffer(sb);
return ret;
+
+out_short_read:
+ printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n",
+ len, cur_offset);
+ free_extent_buffer(sb);
+ return -EIO;
}
int btrfs_read_chunk_tree(struct btrfs_root *root)
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d6fe73c0f4a2..83069dec6898 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -295,8 +295,10 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0)
struct btrfs_bio {
+ atomic_t refs;
atomic_t stripes_pending;
struct btrfs_fs_info *fs_info;
+ u64 map_type; /* get from map_lookup->type */
bio_end_io_t *end_io;
struct bio *orig_bio;
unsigned long flags;
@@ -307,6 +309,12 @@ struct btrfs_bio {
int mirror_num;
int num_tgtdevs;
int *tgtdev_map;
+ /*
+ * logical block numbers for the start of each stripe
+ * The last one or two are p/q. These are sorted,
+ * so raid_map[0] is the start of our full stripe
+ */
+ u64 *raid_map;
struct btrfs_bio_stripe stripes[];
};
@@ -388,19 +396,15 @@ struct btrfs_balance_control {
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
u64 end, u64 *length);
-
-#define btrfs_bio_size(total_stripes, real_stripes) \
- (sizeof(struct btrfs_bio) + \
- (sizeof(struct btrfs_bio_stripe) * (total_stripes)) + \
- (sizeof(int) * (real_stripes)))
-
+void btrfs_get_bbio(struct btrfs_bio *bbio);
+void btrfs_put_bbio(struct btrfs_bio *bbio);
int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num);
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num,
- u64 **raid_map_ret);
+ int need_raid_map);
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len);
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 5bd853ba44ff..64fa248343f6 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -40,20 +40,6 @@ static inline void ceph_set_cached_acl(struct inode *inode,
spin_unlock(&ci->i_ceph_lock);
}
-static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode,
- int type)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct posix_acl *acl = ACL_NOT_CACHED;
-
- spin_lock(&ci->i_ceph_lock);
- if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
- acl = get_cached_acl(inode, type);
- spin_unlock(&ci->i_ceph_lock);
-
- return acl;
-}
-
struct posix_acl *ceph_get_acl(struct inode *inode, int type)
{
int size;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 24be059fd1f8..fd5599d32362 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -196,17 +196,22 @@ static int readpage_nounlock(struct file *filp, struct page *page)
u64 len = PAGE_CACHE_SIZE;
if (off >= i_size_read(inode)) {
- zero_user_segment(page, err, PAGE_CACHE_SIZE);
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
SetPageUptodate(page);
return 0;
}
- /*
- * Uptodate inline data should have been added into page cache
- * while getting Fcr caps.
- */
- if (ci->i_inline_version != CEPH_INLINE_NONE)
- return -EINVAL;
+ if (ci->i_inline_version != CEPH_INLINE_NONE) {
+ /*
+ * Uptodate inline data should have been added
+ * into page cache while getting Fcr caps.
+ */
+ if (off == 0)
+ return -EINVAL;
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ return 0;
+ }
err = ceph_readpage_from_fscache(inode, page);
if (err == 0)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b93c631c6c87..8172775428a0 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -577,7 +577,6 @@ void ceph_add_cap(struct inode *inode,
struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
realmino);
if (realm) {
- ceph_get_snap_realm(mdsc, realm);
spin_lock(&realm->inodes_with_caps_lock);
ci->i_snap_realm = realm;
list_add(&ci->i_snap_realm_item,
@@ -1451,8 +1450,8 @@ static int __mark_caps_flushing(struct inode *inode,
spin_lock(&mdsc->cap_dirty_lock);
list_del_init(&ci->i_dirty_item);
- ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
if (list_empty(&ci->i_flushing_item)) {
+ ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
mdsc->num_cap_flushing++;
dout(" inode %p now flushing seq %lld\n", inode,
@@ -2073,17 +2072,16 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got)
* requested from the MDS.
*/
static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
- loff_t endoff, int *got, struct page **pinned_page,
- int *check_max, int *err)
+ loff_t endoff, int *got, int *check_max, int *err)
{
struct inode *inode = &ci->vfs_inode;
int ret = 0;
- int have, implemented, _got = 0;
+ int have, implemented;
int file_wanted;
dout("get_cap_refs %p need %s want %s\n", inode,
ceph_cap_string(need), ceph_cap_string(want));
-again:
+
spin_lock(&ci->i_ceph_lock);
/* make sure file is actually open */
@@ -2138,50 +2136,34 @@ again:
inode, ceph_cap_string(have), ceph_cap_string(not),
ceph_cap_string(revoking));
if ((revoking & not) == 0) {
- _got = need | (have & want);
- __take_cap_refs(ci, _got);
+ *got = need | (have & want);
+ __take_cap_refs(ci, *got);
ret = 1;
}
} else {
+ int session_readonly = false;
+ if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) {
+ struct ceph_mds_session *s = ci->i_auth_cap->session;
+ spin_lock(&s->s_cap_lock);
+ session_readonly = s->s_readonly;
+ spin_unlock(&s->s_cap_lock);
+ }
+ if (session_readonly) {
+ dout("get_cap_refs %p needed %s but mds%d readonly\n",
+ inode, ceph_cap_string(need), ci->i_auth_cap->mds);
+ *err = -EROFS;
+ ret = 1;
+ goto out_unlock;
+ }
+
dout("get_cap_refs %p have %s needed %s\n", inode,
ceph_cap_string(have), ceph_cap_string(need));
}
out_unlock:
spin_unlock(&ci->i_ceph_lock);
- if (ci->i_inline_version != CEPH_INLINE_NONE &&
- (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
- i_size_read(inode) > 0) {
- int ret1;
- struct page *page = find_get_page(inode->i_mapping, 0);
- if (page) {
- if (PageUptodate(page)) {
- *pinned_page = page;
- goto out;
- }
- page_cache_release(page);
- }
- /*
- * drop cap refs first because getattr while holding
- * caps refs can cause deadlock.
- */
- ceph_put_cap_refs(ci, _got);
- _got = 0;
-
- /* getattr request will bring inline data into page cache */
- ret1 = __ceph_do_getattr(inode, NULL,
- CEPH_STAT_CAP_INLINE_DATA, true);
- if (ret1 >= 0) {
- ret = 0;
- goto again;
- }
- *err = ret1;
- ret = 1;
- }
-out:
dout("get_cap_refs %p ret %d got %s\n", inode,
- ret, ceph_cap_string(_got));
- *got = _got;
+ ret, ceph_cap_string(*got));
return ret;
}
@@ -2221,22 +2203,52 @@ static void check_max_size(struct inode *inode, loff_t endoff)
int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
loff_t endoff, int *got, struct page **pinned_page)
{
- int check_max, ret, err;
+ int _got, check_max, ret, err = 0;
retry:
if (endoff > 0)
check_max_size(&ci->vfs_inode, endoff);
+ _got = 0;
check_max = 0;
- err = 0;
ret = wait_event_interruptible(ci->i_cap_wq,
- try_get_cap_refs(ci, need, want, endoff,
- got, pinned_page,
- &check_max, &err));
+ try_get_cap_refs(ci, need, want, endoff,
+ &_got, &check_max, &err));
if (err)
ret = err;
+ if (ret < 0)
+ return ret;
+
if (check_max)
goto retry;
- return ret;
+
+ if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+ i_size_read(&ci->vfs_inode) > 0) {
+ struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0);
+ if (page) {
+ if (PageUptodate(page)) {
+ *pinned_page = page;
+ goto out;
+ }
+ page_cache_release(page);
+ }
+ /*
+ * drop cap refs first because getattr while holding
+ * caps refs can cause deadlock.
+ */
+ ceph_put_cap_refs(ci, _got);
+ _got = 0;
+
+ /* getattr request will bring inline data into page cache */
+ ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
+ CEPH_STAT_CAP_INLINE_DATA, true);
+ if (ret < 0)
+ return ret;
+ goto retry;
+ }
+out:
+ *got = _got;
+ return 0;
}
/*
@@ -2432,13 +2444,13 @@ static void invalidate_aliases(struct inode *inode)
*/
static void handle_cap_grant(struct ceph_mds_client *mdsc,
struct inode *inode, struct ceph_mds_caps *grant,
- void *snaptrace, int snaptrace_len,
u64 inline_version,
void *inline_data, int inline_len,
struct ceph_buffer *xattr_buf,
struct ceph_mds_session *session,
struct ceph_cap *cap, int issued)
__releases(ci->i_ceph_lock)
+ __releases(mdsc->snap_rwsem)
{
struct ceph_inode_info *ci = ceph_inode(inode);
int mds = session->s_mds;
@@ -2639,10 +2651,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
spin_unlock(&ci->i_ceph_lock);
if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
- down_write(&mdsc->snap_rwsem);
- ceph_update_snap_trace(mdsc, snaptrace,
- snaptrace + snaptrace_len, false);
- downgrade_write(&mdsc->snap_rwsem);
kick_flushing_inode_caps(mdsc, session, inode);
up_read(&mdsc->snap_rwsem);
if (newcaps & ~issued)
@@ -3052,6 +3060,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_cap *cap;
struct ceph_mds_caps *h;
struct ceph_mds_cap_peer *peer = NULL;
+ struct ceph_snap_realm *realm;
int mds = session->s_mds;
int op, issued;
u32 seq, mseq;
@@ -3153,11 +3162,23 @@ void ceph_handle_caps(struct ceph_mds_session *session,
goto done_unlocked;
case CEPH_CAP_OP_IMPORT:
+ realm = NULL;
+ if (snaptrace_len) {
+ down_write(&mdsc->snap_rwsem);
+ ceph_update_snap_trace(mdsc, snaptrace,
+ snaptrace + snaptrace_len,
+ false, &realm);
+ downgrade_write(&mdsc->snap_rwsem);
+ } else {
+ down_read(&mdsc->snap_rwsem);
+ }
handle_cap_import(mdsc, inode, h, peer, session,
&cap, &issued);
- handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len,
+ handle_cap_grant(mdsc, inode, h,
inline_version, inline_data, inline_len,
msg->middle, session, cap, issued);
+ if (realm)
+ ceph_put_snap_realm(mdsc, realm);
goto done_unlocked;
}
@@ -3177,7 +3198,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
case CEPH_CAP_OP_GRANT:
__ceph_caps_issued(ci, &issued);
issued |= __ceph_caps_dirty(ci);
- handle_cap_grant(mdsc, inode, h, NULL, 0,
+ handle_cap_grant(mdsc, inode, h,
inline_version, inline_data, inline_len,
msg->middle, session, cap, issued);
goto done_unlocked;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index c241603764fd..0411dbb15815 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -26,8 +26,6 @@
* point by name.
*/
-const struct inode_operations ceph_dir_iops;
-const struct file_operations ceph_dir_fops;
const struct dentry_operations ceph_dentry_ops;
/*
@@ -672,13 +670,17 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
/*
* We created the item, then did a lookup, and found
* it was already linked to another inode we already
- * had in our cache (and thus got spliced). Link our
- * dentry to that inode, but don't hash it, just in
- * case the VFS wants to dereference it.
+ * had in our cache (and thus got spliced). To not
+ * confuse VFS (especially when inode is a directory),
+ * we don't link our dentry to that inode, return an
+ * error instead.
+ *
+ * This event should be rare and it happens only when
+ * we talk to old MDS. Recent MDS does not send traceless
+ * reply for request that creates new inode.
*/
- BUG_ON(!result->d_inode);
- d_instantiate(dentry, result->d_inode);
- return 0;
+ d_drop(result);
+ return -ESTALE;
}
return PTR_ERR(result);
}
@@ -1335,6 +1337,13 @@ const struct file_operations ceph_dir_fops = {
.fsync = ceph_dir_fsync,
};
+const struct file_operations ceph_snapdir_fops = {
+ .iterate = ceph_readdir,
+ .llseek = ceph_dir_llseek,
+ .open = ceph_open,
+ .release = ceph_release,
+};
+
const struct inode_operations ceph_dir_iops = {
.lookup = ceph_lookup,
.permission = ceph_permission,
@@ -1357,6 +1366,14 @@ const struct inode_operations ceph_dir_iops = {
.atomic_open = ceph_atomic_open,
};
+const struct inode_operations ceph_snapdir_iops = {
+ .lookup = ceph_lookup,
+ .permission = ceph_permission,
+ .getattr = ceph_getattr,
+ .mkdir = ceph_mkdir,
+ .rmdir = ceph_unlink,
+};
+
const struct dentry_operations ceph_dentry_ops = {
.d_revalidate = ceph_d_revalidate,
.d_release = ceph_d_release,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 905986dd4c3c..a3d774b35149 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -275,10 +275,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
err = ceph_mdsc_do_request(mdsc,
(flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
req);
+ err = ceph_handle_snapdir(req, dentry, err);
if (err)
goto out_req;
- err = ceph_handle_snapdir(req, dentry, err);
if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
@@ -392,13 +392,14 @@ more:
if (ret >= 0) {
int didpages;
if (was_short && (pos + ret < inode->i_size)) {
- u64 tmp = min(this_len - ret,
- inode->i_size - pos - ret);
+ int zlen = min(this_len - ret,
+ inode->i_size - pos - ret);
+ int zoff = (o_direct ? buf_align : io_align) +
+ read + ret;
dout(" zero gap %llu to %llu\n",
- pos + ret, pos + ret + tmp);
- ceph_zero_page_vector_range(page_align + read + ret,
- tmp, pages);
- ret += tmp;
+ pos + ret, pos + ret + zlen);
+ ceph_zero_page_vector_range(zoff, zlen, pages);
+ ret += zlen;
}
didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
@@ -878,28 +879,34 @@ again:
i_size = i_size_read(inode);
if (retry_op == READ_INLINE) {
- /* does not support inline data > PAGE_SIZE */
- if (i_size > PAGE_CACHE_SIZE) {
- ret = -EIO;
- } else if (iocb->ki_pos < i_size) {
+ BUG_ON(ret > 0 || read > 0);
+ if (iocb->ki_pos < i_size &&
+ iocb->ki_pos < PAGE_CACHE_SIZE) {
loff_t end = min_t(loff_t, i_size,
iocb->ki_pos + len);
+ end = min_t(loff_t, end, PAGE_CACHE_SIZE);
if (statret < end)
zero_user_segment(page, statret, end);
ret = copy_page_to_iter(page,
iocb->ki_pos & ~PAGE_MASK,
end - iocb->ki_pos, to);
iocb->ki_pos += ret;
- } else {
- ret = 0;
+ read += ret;
+ }
+ if (iocb->ki_pos < i_size && read < len) {
+ size_t zlen = min_t(size_t, len - read,
+ i_size - iocb->ki_pos);
+ ret = iov_iter_zero(zlen, to);
+ iocb->ki_pos += ret;
+ read += ret;
}
__free_pages(page, 0);
- return ret;
+ return read;
}
/* hit EOF or hole? */
if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
- ret < len) {
+ ret < len) {
dout("sync_read hit hole, ppos %lld < size %lld"
", reading more\n", iocb->ki_pos,
inode->i_size);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 6b5173605154..119c43c80638 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -82,8 +82,8 @@ struct inode *ceph_get_snapdir(struct inode *parent)
inode->i_mode = parent->i_mode;
inode->i_uid = parent->i_uid;
inode->i_gid = parent->i_gid;
- inode->i_op = &ceph_dir_iops;
- inode->i_fop = &ceph_dir_fops;
+ inode->i_op = &ceph_snapdir_iops;
+ inode->i_fop = &ceph_snapdir_fops;
ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
ci->i_rbytes = 0;
return inode;
@@ -838,30 +838,31 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
ceph_vinop(inode), inode->i_mode);
}
- /* set dir completion flag? */
- if (S_ISDIR(inode->i_mode) &&
- ci->i_files == 0 && ci->i_subdirs == 0 &&
- ceph_snap(inode) == CEPH_NOSNAP &&
- (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
- (issued & CEPH_CAP_FILE_EXCL) == 0 &&
- !__ceph_dir_is_complete(ci)) {
- dout(" marking %p complete (empty)\n", inode);
- __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count),
- ci->i_ordered_count);
- }
-
/* were we issued a capability? */
if (info->cap.caps) {
if (ceph_snap(inode) == CEPH_NOSNAP) {
+ unsigned caps = le32_to_cpu(info->cap.caps);
ceph_add_cap(inode, session,
le64_to_cpu(info->cap.cap_id),
- cap_fmode,
- le32_to_cpu(info->cap.caps),
+ cap_fmode, caps,
le32_to_cpu(info->cap.wanted),
le32_to_cpu(info->cap.seq),
le32_to_cpu(info->cap.mseq),
le64_to_cpu(info->cap.realm),
info->cap.flags, &new_cap);
+
+ /* set dir completion flag? */
+ if (S_ISDIR(inode->i_mode) &&
+ ci->i_files == 0 && ci->i_subdirs == 0 &&
+ (caps & CEPH_CAP_FILE_SHARED) &&
+ (issued & CEPH_CAP_FILE_EXCL) == 0 &&
+ !__ceph_dir_is_complete(ci)) {
+ dout(" marking %p complete (empty)\n", inode);
+ __ceph_dir_set_complete(ci,
+ atomic_read(&ci->i_release_count),
+ ci->i_ordered_count);
+ }
+
wake = true;
} else {
dout(" %p got snap_caps %s\n", inode,
@@ -1446,12 +1447,14 @@ retry_lookup:
}
if (!dn->d_inode) {
- dn = splice_dentry(dn, in, NULL);
- if (IS_ERR(dn)) {
- err = PTR_ERR(dn);
+ struct dentry *realdn = splice_dentry(dn, in, NULL);
+ if (IS_ERR(realdn)) {
+ err = PTR_ERR(realdn);
+ d_drop(dn);
dn = NULL;
goto next_item;
}
+ dn = realdn;
}
di = dn->d_fsdata;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 5f62fb7a5d0a..71c073f38e54 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -480,6 +480,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
mdsc->max_sessions = newmax;
}
mdsc->sessions[mds] = s;
+ atomic_inc(&mdsc->num_sessions);
atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
@@ -503,6 +504,7 @@ static void __unregister_session(struct ceph_mds_client *mdsc,
mdsc->sessions[s->s_mds] = NULL;
ceph_con_close(&s->s_con);
ceph_put_mds_session(s);
+ atomic_dec(&mdsc->num_sessions);
}
/*
@@ -842,8 +844,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
struct ceph_options *opt = mdsc->fsc->client->options;
void *p;
- const char* metadata[3][2] = {
+ const char* metadata[][2] = {
{"hostname", utsname()->nodename},
+ {"kernel_version", utsname()->release},
{"entity_id", opt->name ? opt->name : ""},
{NULL, NULL}
};
@@ -1464,19 +1467,33 @@ out_unlocked:
return err;
}
+static int check_cap_flush(struct inode *inode, u64 want_flush_seq)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_flushing_caps)
+ ret = ci->i_cap_flush_seq >= want_flush_seq;
+ else
+ ret = 1;
+ spin_unlock(&ci->i_ceph_lock);
+ return ret;
+}
+
/*
* flush all dirty inode data to disk.
*
* returns true if we've flushed through want_flush_seq
*/
-static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
+static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
{
- int mds, ret = 1;
+ int mds;
dout("check_cap_flush want %lld\n", want_flush_seq);
mutex_lock(&mdsc->mutex);
- for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
+ for (mds = 0; mds < mdsc->max_sessions; mds++) {
struct ceph_mds_session *session = mdsc->sessions[mds];
+ struct inode *inode = NULL;
if (!session)
continue;
@@ -1489,29 +1506,29 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
list_entry(session->s_cap_flushing.next,
struct ceph_inode_info,
i_flushing_item);
- struct inode *inode = &ci->vfs_inode;
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_cap_flush_seq <= want_flush_seq) {
+ if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) {
dout("check_cap_flush still flushing %p "
- "seq %lld <= %lld to mds%d\n", inode,
- ci->i_cap_flush_seq, want_flush_seq,
- session->s_mds);
- ret = 0;
+ "seq %lld <= %lld to mds%d\n",
+ &ci->vfs_inode, ci->i_cap_flush_seq,
+ want_flush_seq, session->s_mds);
+ inode = igrab(&ci->vfs_inode);
}
- spin_unlock(&ci->i_ceph_lock);
}
mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
- if (!ret)
- return ret;
+ if (inode) {
+ wait_event(mdsc->cap_flushing_wq,
+ check_cap_flush(inode, want_flush_seq));
+ iput(inode);
+ }
+
mutex_lock(&mdsc->mutex);
}
mutex_unlock(&mdsc->mutex);
dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
- return ret;
}
/*
@@ -1923,7 +1940,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
head->num_releases = cpu_to_le16(releases);
/* time stamp */
- ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp));
+ {
+ struct ceph_timespec ts;
+ ceph_encode_timespec(&ts, &req->r_stamp);
+ ceph_encode_copy(&p, &ts, sizeof(ts));
+ }
BUG_ON(p > end);
msg->front.iov_len = p - msg->front.iov_base;
@@ -2012,7 +2033,11 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
/* time stamp */
p = msg->front.iov_base + req->r_request_release_offset;
- ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp));
+ {
+ struct ceph_timespec ts;
+ ceph_encode_timespec(&ts, &req->r_stamp);
+ ceph_encode_copy(&p, &ts, sizeof(ts));
+ }
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
@@ -2159,6 +2184,8 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
p = rb_next(p);
if (req->r_got_unsafe)
continue;
+ if (req->r_attempts > 0)
+ continue; /* only new requests */
if (req->r_session &&
req->r_session->s_mds == mds) {
dout(" kicking tid %llu\n", req->r_tid);
@@ -2286,6 +2313,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
struct ceph_mds_request *req;
struct ceph_mds_reply_head *head = msg->front.iov_base;
struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
+ struct ceph_snap_realm *realm;
u64 tid;
int err, result;
int mds = session->s_mds;
@@ -2401,11 +2429,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
}
/* snap trace */
+ realm = NULL;
if (rinfo->snapblob_len) {
down_write(&mdsc->snap_rwsem);
ceph_update_snap_trace(mdsc, rinfo->snapblob,
- rinfo->snapblob + rinfo->snapblob_len,
- le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
+ rinfo->snapblob + rinfo->snapblob_len,
+ le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
+ &realm);
downgrade_write(&mdsc->snap_rwsem);
} else {
down_read(&mdsc->snap_rwsem);
@@ -2423,6 +2453,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_unlock(&req->r_fill_mutex);
up_read(&mdsc->snap_rwsem);
+ if (realm)
+ ceph_put_snap_realm(mdsc, realm);
out_err:
mutex_lock(&mdsc->mutex);
if (!req->r_aborted) {
@@ -2487,6 +2519,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
BUG_ON(req->r_err);
BUG_ON(req->r_got_result);
+ req->r_attempts = 0;
req->r_num_fwd = fwd_seq;
req->r_resend_mds = next_mds;
put_request_session(req);
@@ -2580,6 +2613,14 @@ static void handle_session(struct ceph_mds_session *session,
send_flushmsg_ack(mdsc, session, seq);
break;
+ case CEPH_SESSION_FORCE_RO:
+ dout("force_session_readonly %p\n", session);
+ spin_lock(&session->s_cap_lock);
+ session->s_readonly = true;
+ spin_unlock(&session->s_cap_lock);
+ wake_up_session_caps(session, 0);
+ break;
+
default:
pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
WARN_ON(1);
@@ -2610,6 +2651,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_mds_request *req, *nreq;
+ struct rb_node *p;
int err;
dout("replay_unsafe_requests mds%d\n", session->s_mds);
@@ -2622,6 +2664,28 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
ceph_con_send(&session->s_con, req->r_request);
}
}
+
+ /*
+ * Also re-send old requests when the MDS enters the reconnect stage, so
+ * that the MDS can process completed requests in the clientreplay stage.
+ */
+ p = rb_first(&mdsc->request_tree);
+ while (p) {
+ req = rb_entry(p, struct ceph_mds_request, r_node);
+ p = rb_next(p);
+ if (req->r_got_unsafe)
+ continue;
+ if (req->r_attempts == 0)
+ continue; /* only old requests */
+ if (req->r_session &&
+ req->r_session->s_mds == session->s_mds) {
+ err = __prepare_send_request(mdsc, req, session->s_mds);
+ if (!err) {
+ ceph_msg_get(req->r_request);
+ ceph_con_send(&session->s_con, req->r_request);
+ }
+ }
+ }
mutex_unlock(&mdsc->mutex);
}
@@ -2787,6 +2851,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
spin_unlock(&session->s_gen_ttl_lock);
spin_lock(&session->s_cap_lock);
+ /* don't know if session is readonly */
+ session->s_readonly = 0;
/*
* notify __ceph_remove_cap() that we are composing cap reconnect.
* If a cap get released before being added to the cap reconnect,
@@ -2933,9 +2999,6 @@ static void check_new_map(struct ceph_mds_client *mdsc,
mutex_unlock(&s->s_mutex);
s->s_state = CEPH_MDS_SESSION_RESTARTING;
}
-
- /* kick any requests waiting on the recovering mds */
- kick_requests(mdsc, i);
} else if (oldstate == newstate) {
continue; /* nothing new with this mds */
}
@@ -3295,6 +3358,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
init_waitqueue_head(&mdsc->session_close_wq);
INIT_LIST_HEAD(&mdsc->waiting_for_map);
mdsc->sessions = NULL;
+ atomic_set(&mdsc->num_sessions, 0);
mdsc->max_sessions = 0;
mdsc->stopping = 0;
init_rwsem(&mdsc->snap_rwsem);
@@ -3428,14 +3492,17 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
dout("sync\n");
mutex_lock(&mdsc->mutex);
want_tid = mdsc->last_tid;
- want_flush = mdsc->cap_flush_seq;
mutex_unlock(&mdsc->mutex);
- dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
ceph_flush_dirty_caps(mdsc);
+ spin_lock(&mdsc->cap_dirty_lock);
+ want_flush = mdsc->cap_flush_seq;
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
wait_unsafe_requests(mdsc, want_tid);
- wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
+ wait_caps_flush(mdsc, want_flush);
}
/*
@@ -3443,17 +3510,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
*/
static bool done_closing_sessions(struct ceph_mds_client *mdsc)
{
- int i, n = 0;
-
if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
return true;
-
- mutex_lock(&mdsc->mutex);
- for (i = 0; i < mdsc->max_sessions; i++)
- if (mdsc->sessions[i])
- n++;
- mutex_unlock(&mdsc->mutex);
- return n == 0;
+ return atomic_read(&mdsc->num_sessions) == 0;
}
/*
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index e2817d00f7d9..1875b5d985c6 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -137,6 +137,7 @@ struct ceph_mds_session {
int s_nr_caps, s_trim_caps;
int s_num_cap_releases;
int s_cap_reconnect;
+ int s_readonly;
struct list_head s_cap_releases; /* waiting cap_release messages */
struct list_head s_cap_releases_done; /* ready to send */
struct ceph_cap *s_cap_iterator;
@@ -272,6 +273,7 @@ struct ceph_mds_client {
struct list_head waiting_for_map;
struct ceph_mds_session **sessions; /* NULL for mds if no session */
+ atomic_t num_sessions;
int max_sessions; /* len of s_mds_sessions */
int stopping; /* true if shutting down */
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index ce35fbd4ba5d..a97e39f09ba6 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -70,13 +70,11 @@ void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
* safe. we do need to protect against concurrent empty list
* additions, however.
*/
- if (atomic_read(&realm->nref) == 0) {
+ if (atomic_inc_return(&realm->nref) == 1) {
spin_lock(&mdsc->snap_empty_lock);
list_del_init(&realm->empty_item);
spin_unlock(&mdsc->snap_empty_lock);
}
-
- atomic_inc(&realm->nref);
}
static void __insert_snap_realm(struct rb_root *root,
@@ -116,7 +114,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
if (!realm)
return ERR_PTR(-ENOMEM);
- atomic_set(&realm->nref, 0); /* tree does not take a ref */
+ atomic_set(&realm->nref, 1); /* for caller */
realm->ino = ino;
INIT_LIST_HEAD(&realm->children);
INIT_LIST_HEAD(&realm->child_item);
@@ -134,8 +132,8 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
*
* caller must hold snap_rwsem for write.
*/
-struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
- u64 ino)
+static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
+ u64 ino)
{
struct rb_node *n = mdsc->snap_realms.rb_node;
struct ceph_snap_realm *r;
@@ -154,6 +152,16 @@ struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
return NULL;
}
+/*
+ * Find the snap realm for @ino via __lookup_snap_realm() and, when one
+ * exists, take a reference on it for the caller.  Returns NULL when no
+ * realm is found.  The locking rules of __lookup_snap_realm() apply.
+ */
+struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
+					       u64 ino)
+{
+	struct ceph_snap_realm *realm = __lookup_snap_realm(mdsc, ino);
+
+	if (!realm)
+		return NULL;
+
+	ceph_get_snap_realm(mdsc, realm);
+	return realm;
+}
+
static void __put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm);
@@ -273,7 +281,6 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
}
realm->parent_ino = parentino;
realm->parent = parent;
- ceph_get_snap_realm(mdsc, parent);
list_add(&realm->child_item, &parent->children);
return 1;
}
@@ -631,12 +638,14 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
* Caller must hold snap_rwsem for write.
*/
int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
- void *p, void *e, bool deletion)
+ void *p, void *e, bool deletion,
+ struct ceph_snap_realm **realm_ret)
{
struct ceph_mds_snap_realm *ri; /* encoded */
__le64 *snaps; /* encoded */
__le64 *prior_parent_snaps; /* encoded */
- struct ceph_snap_realm *realm;
+ struct ceph_snap_realm *realm = NULL;
+ struct ceph_snap_realm *first_realm = NULL;
int invalidate = 0;
int err = -ENOMEM;
LIST_HEAD(dirty_realms);
@@ -704,13 +713,18 @@ more:
dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
realm, invalidate, p, e);
- if (p < e)
- goto more;
-
/* invalidate when we reach the _end_ (root) of the trace */
- if (invalidate)
+ if (invalidate && p >= e)
rebuild_snap_realms(realm);
+ if (!first_realm)
+ first_realm = realm;
+ else
+ ceph_put_snap_realm(mdsc, realm);
+
+ if (p < e)
+ goto more;
+
/*
* queue cap snaps _after_ we've built the new snap contexts,
* so that i_head_snapc can be set appropriately.
@@ -721,12 +735,21 @@ more:
queue_realm_cap_snaps(realm);
}
+ if (realm_ret)
+ *realm_ret = first_realm;
+ else
+ ceph_put_snap_realm(mdsc, first_realm);
+
__cleanup_empty_realms(mdsc);
return 0;
bad:
err = -EINVAL;
fail:
+ if (realm && !IS_ERR(realm))
+ ceph_put_snap_realm(mdsc, realm);
+ if (first_realm)
+ ceph_put_snap_realm(mdsc, first_realm);
pr_err("update_snap_trace error %d\n", err);
return err;
}
@@ -844,7 +867,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
if (IS_ERR(realm))
goto out;
}
- ceph_get_snap_realm(mdsc, realm);
dout("splitting snap_realm %llx %p\n", realm->ino, realm);
for (i = 0; i < num_split_inos; i++) {
@@ -905,7 +927,7 @@ skip_inode:
/* we may have taken some of the old realm's children. */
for (i = 0; i < num_split_realms; i++) {
struct ceph_snap_realm *child =
- ceph_lookup_snap_realm(mdsc,
+ __lookup_snap_realm(mdsc,
le64_to_cpu(split_realms[i]));
if (!child)
continue;
@@ -918,7 +940,7 @@ skip_inode:
* snap, we can avoid queueing cap_snaps.
*/
ceph_update_snap_trace(mdsc, p, e,
- op == CEPH_SNAP_OP_DESTROY);
+ op == CEPH_SNAP_OP_DESTROY, NULL);
if (op == CEPH_SNAP_OP_SPLIT)
/* we took a reference when we created the realm, above */
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 5ae62587a71d..a63997b8bcff 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -414,6 +414,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",noshare");
if (opt->flags & CEPH_OPT_NOCRC)
seq_puts(m, ",nocrc");
+ if (opt->flags & CEPH_OPT_NOMSGAUTH)
+ seq_puts(m, ",nocephx_require_signatures");
+ if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0)
+ seq_puts(m, ",notcp_nodelay");
if (opt->name)
seq_printf(m, ",name=%s", opt->name);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index e1aa32d0759d..04c8124ed30e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -693,7 +693,8 @@ extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm);
extern int ceph_update_snap_trace(struct ceph_mds_client *m,
- void *p, void *e, bool deletion);
+ void *p, void *e, bool deletion,
+ struct ceph_snap_realm **realm_ret);
extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg);
@@ -892,7 +893,9 @@ extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
int ceph_uninline_data(struct file *filp, struct page *locked_page);
/* dir.c */
extern const struct file_operations ceph_dir_fops;
+extern const struct file_operations ceph_snapdir_fops;
extern const struct inode_operations ceph_dir_iops;
+extern const struct inode_operations ceph_snapdir_iops;
extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
ceph_snapdir_dentry_ops;
diff --git a/include/dt-bindings/thermal/thermal_exynos.h b/include/dt-bindings/thermal/thermal_exynos.h
new file mode 100644
index 000000000000..0646500bca69
--- /dev/null
+++ b/include/dt-bindings/thermal/thermal_exynos.h
@@ -0,0 +1,28 @@
+/*
+ * thermal_exynos.h - Samsung EXYNOS TMU device tree definitions
+ *
+ * Copyright (C) 2014 Samsung Electronics
+ * Lukasz Majewski <l.majewski@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _EXYNOS_THERMAL_TMU_DT_H
+#define _EXYNOS_THERMAL_TMU_DT_H
+
+/*
+ * Trimming type selectors for the EXYNOS TMU device tree definitions.
+ * NOTE(review): as a dt-bindings header these values are presumably
+ * referenced from device tree sources, so renumbering them would change
+ * the meaning of existing .dts files - confirm before editing.
+ */
+#define TYPE_ONE_POINT_TRIMMING 0
+#define TYPE_ONE_POINT_TRIMMING_25 1
+#define TYPE_ONE_POINT_TRIMMING_85 2
+#define TYPE_TWO_POINT_TRIMMING 3
+#define TYPE_NONE 4
+
+#endif /* _EXYNOS_THERMAL_TMU_DT_H */
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index c0dadaac26e3..31eb03d0c766 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -158,17 +158,6 @@ enum {
};
-/* pool operations */
-enum {
- POOL_OP_CREATE = 0x01,
- POOL_OP_DELETE = 0x02,
- POOL_OP_AUID_CHANGE = 0x03,
- POOL_OP_CREATE_SNAP = 0x11,
- POOL_OP_DELETE_SNAP = 0x12,
- POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
- POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
-};
-
struct ceph_mon_request_header {
__le64 have_version;
__le16 session_mon;
@@ -191,31 +180,6 @@ struct ceph_mon_statfs_reply {
struct ceph_statfs st;
} __attribute__ ((packed));
-const char *ceph_pool_op_name(int op);
-
-struct ceph_mon_poolop {
- struct ceph_mon_request_header monhdr;
- struct ceph_fsid fsid;
- __le32 pool;
- __le32 op;
- __le64 auid;
- __le64 snapid;
- __le32 name_len;
-} __attribute__ ((packed));
-
-struct ceph_mon_poolop_reply {
- struct ceph_mon_request_header monhdr;
- struct ceph_fsid fsid;
- __le32 reply_code;
- __le32 epoch;
- char has_data;
- char data[0];
-} __attribute__ ((packed));
-
-struct ceph_mon_unmanaged_snap {
- __le64 snapid;
-} __attribute__ ((packed));
-
struct ceph_osd_getmap {
struct ceph_mon_request_header monhdr;
struct ceph_fsid fsid;
@@ -307,6 +271,7 @@ enum {
CEPH_SESSION_RECALL_STATE,
CEPH_SESSION_FLUSHMSG,
CEPH_SESSION_FLUSHMSG_ACK,
+ CEPH_SESSION_FORCE_RO,
};
extern const char *ceph_session_op_name(int op);
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 8b11a79ca1cb..16fff9608848 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -30,8 +30,9 @@
#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */
#define CEPH_OPT_NOMSGAUTH (1<<4) /* not require cephx message signature */
+#define CEPH_OPT_TCP_NODELAY (1<<5) /* TCP_NODELAY on TCP sockets */
-#define CEPH_OPT_DEFAULT (0)
+#define CEPH_OPT_DEFAULT (CEPH_OPT_TCP_NODELAY)
#define ceph_set_opt(client, opt) \
(client)->options->flags |= CEPH_OPT_##opt;
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index d9d396c16503..e15499422fdc 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -57,6 +57,7 @@ struct ceph_messenger {
atomic_t stopping;
bool nocrc;
+ bool tcp_nodelay;
/*
* the global_seq counts connections i (attempt to) initiate
@@ -264,7 +265,8 @@ extern void ceph_messenger_init(struct ceph_messenger *msgr,
struct ceph_entity_addr *myaddr,
u64 supported_features,
u64 required_features,
- bool nocrc);
+ bool nocrc,
+ bool tcp_nodelay);
extern void ceph_con_init(struct ceph_connection *con, void *private,
const struct ceph_connection_operations *ops,
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index deb47e45ac7c..81810dc21f06 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -40,7 +40,7 @@ struct ceph_mon_request {
};
/*
- * ceph_mon_generic_request is being used for the statfs, poolop and
+ * ceph_mon_generic_request is being used for the statfs and
* mon_get_version requests which are being done a bit differently
* because we need to get data back to the caller
*/
@@ -50,7 +50,6 @@ struct ceph_mon_generic_request {
struct rb_node node;
int result;
void *buf;
- int buf_len;
struct completion completion;
struct ceph_msg *request; /* original request */
struct ceph_msg *reply; /* and reply */
@@ -117,10 +116,4 @@ extern int ceph_monc_open_session(struct ceph_mon_client *monc);
extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
-extern int ceph_monc_create_snapid(struct ceph_mon_client *monc,
- u32 pool, u64 *snapid);
-
-extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
- u32 pool, u64 snapid);
-
#endif
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index 611e1c5893b4..b6dec05c7196 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -495,8 +495,7 @@ struct btrfs_ioctl_send_args {
/* Error codes as returned by the kernel */
enum btrfs_err_code {
- notused,
- BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
+ BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET = 1,
BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 5d5ab67f516d..ec565508e904 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -239,6 +239,8 @@ enum {
Opt_nocrc,
Opt_cephx_require_signatures,
Opt_nocephx_require_signatures,
+ Opt_tcp_nodelay,
+ Opt_notcp_nodelay,
};
static match_table_t opt_tokens = {
@@ -259,6 +261,8 @@ static match_table_t opt_tokens = {
{Opt_nocrc, "nocrc"},
{Opt_cephx_require_signatures, "cephx_require_signatures"},
{Opt_nocephx_require_signatures, "nocephx_require_signatures"},
+ {Opt_tcp_nodelay, "tcp_nodelay"},
+ {Opt_notcp_nodelay, "notcp_nodelay"},
{-1, NULL}
};
@@ -457,6 +461,7 @@ ceph_parse_options(char *options, const char *dev_name,
case Opt_nocrc:
opt->flags |= CEPH_OPT_NOCRC;
break;
+
case Opt_cephx_require_signatures:
opt->flags &= ~CEPH_OPT_NOMSGAUTH;
break;
@@ -464,6 +469,13 @@ ceph_parse_options(char *options, const char *dev_name,
opt->flags |= CEPH_OPT_NOMSGAUTH;
break;
+ case Opt_tcp_nodelay:
+ opt->flags |= CEPH_OPT_TCP_NODELAY;
+ break;
+ case Opt_notcp_nodelay:
+ opt->flags &= ~CEPH_OPT_TCP_NODELAY;
+ break;
+
default:
BUG_ON(token);
}
@@ -518,10 +530,12 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
/* msgr */
if (ceph_test_opt(client, MYIP))
myaddr = &client->options->my_addr;
+
ceph_messenger_init(&client->msgr, myaddr,
client->supported_features,
client->required_features,
- ceph_test_opt(client, NOCRC));
+ ceph_test_opt(client, NOCRC),
+ ceph_test_opt(client, TCP_NODELAY));
/* subsystems */
err = ceph_monc_init(&client->monc, client);
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 30560202f57b..139a9cb19b0c 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -42,17 +42,3 @@ const char *ceph_osd_state_name(int s)
return "???";
}
}
-
-const char *ceph_pool_op_name(int op)
-{
- switch (op) {
- case POOL_OP_CREATE: return "create";
- case POOL_OP_DELETE: return "delete";
- case POOL_OP_AUID_CHANGE: return "auid change";
- case POOL_OP_CREATE_SNAP: return "create snap";
- case POOL_OP_DELETE_SNAP: return "delete snap";
- case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
- case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
- }
- return "???";
-}
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index d2d525529f87..14d9995097cc 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -127,8 +127,6 @@ static int monc_show(struct seq_file *s, void *p)
op = le16_to_cpu(req->request->hdr.type);
if (op == CEPH_MSG_STATFS)
seq_printf(s, "%llu statfs\n", req->tid);
- else if (op == CEPH_MSG_POOLOP)
- seq_printf(s, "%llu poolop\n", req->tid);
else if (op == CEPH_MSG_MON_GET_VERSION)
seq_printf(s, "%llu mon_get_version", req->tid);
else
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 33a2f201e460..6b3f54ed65ba 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -510,6 +510,16 @@ static int ceph_tcp_connect(struct ceph_connection *con)
return ret;
}
+	/*
+	 * Set TCP_NODELAY (disable Nagle's algorithm) on the new socket when
+	 * the messenger was initialised with tcp_nodelay.  Best effort: a
+	 * failure is only logged and socket setup carries on below.
+	 */
+	if (con->msgr->tcp_nodelay) {
+		int optval = 1;
+
+		ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
+					(char *)&optval, sizeof(optval));
+		if (ret)
+			/* terminate the message so it is not run into the next log line */
+			pr_err("kernel_setsockopt(TCP_NODELAY) failed: %d\n",
+			       ret);
+	}
+
sk_set_memalloc(sock->sk);
con->sock = sock;
@@ -2922,7 +2932,8 @@ void ceph_messenger_init(struct ceph_messenger *msgr,
struct ceph_entity_addr *myaddr,
u64 supported_features,
u64 required_features,
- bool nocrc)
+ bool nocrc,
+ bool tcp_nodelay)
{
msgr->supported_features = supported_features;
msgr->required_features = required_features;
@@ -2937,6 +2948,7 @@ void ceph_messenger_init(struct ceph_messenger *msgr,
get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
encode_my_addr(msgr);
msgr->nocrc = nocrc;
+ msgr->tcp_nodelay = tcp_nodelay;
atomic_set(&msgr->stopping, 0);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index f2148e22b148..2b3cf05e87b0 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -410,7 +410,7 @@ out_unlocked:
}
/*
- * generic requests (e.g., statfs, poolop)
+ * generic requests (currently statfs, mon_get_version)
*/
static struct ceph_mon_generic_request *__lookup_generic_req(
struct ceph_mon_client *monc, u64 tid)
@@ -569,7 +569,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
return;
bad:
- pr_err("corrupt generic reply, tid %llu\n", tid);
+ pr_err("corrupt statfs reply, tid %llu\n", tid);
ceph_msg_dump(msg);
}
@@ -588,7 +588,6 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
kref_init(&req->kref);
req->buf = buf;
- req->buf_len = sizeof(*buf);
init_completion(&req->completion);
err = -ENOMEM;
@@ -611,7 +610,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
err = do_generic_request(monc, req);
out:
- kref_put(&req->kref, release_generic_request);
+ put_generic_request(req);
return err;
}
EXPORT_SYMBOL(ceph_monc_do_statfs);
@@ -647,7 +646,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc,
return;
bad:
- pr_err("corrupt mon_get_version reply\n");
+ pr_err("corrupt mon_get_version reply, tid %llu\n", tid);
ceph_msg_dump(msg);
}
@@ -670,7 +669,6 @@ int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
kref_init(&req->kref);
req->buf = newest;
- req->buf_len = sizeof(*newest);
init_completion(&req->completion);
req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
@@ -701,134 +699,12 @@ int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
mutex_unlock(&monc->mutex);
out:
- kref_put(&req->kref, release_generic_request);
+ put_generic_request(req);
return err;
}
EXPORT_SYMBOL(ceph_monc_do_get_version);
/*
- * pool ops
- */
-static int get_poolop_reply_buf(const char *src, size_t src_len,
- char *dst, size_t dst_len)
-{
- u32 buf_len;
-
- if (src_len != sizeof(u32) + dst_len)
- return -EINVAL;
-
- buf_len = le32_to_cpu(*(__le32 *)src);
- if (buf_len != dst_len)
- return -EINVAL;
-
- memcpy(dst, src + sizeof(u32), dst_len);
- return 0;
-}
-
-static void handle_poolop_reply(struct ceph_mon_client *monc,
- struct ceph_msg *msg)
-{
- struct ceph_mon_generic_request *req;
- struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
- u64 tid = le64_to_cpu(msg->hdr.tid);
-
- if (msg->front.iov_len < sizeof(*reply))
- goto bad;
- dout("handle_poolop_reply %p tid %llu\n", msg, tid);
-
- mutex_lock(&monc->mutex);
- req = __lookup_generic_req(monc, tid);
- if (req) {
- if (req->buf_len &&
- get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
- msg->front.iov_len - sizeof(*reply),
- req->buf, req->buf_len) < 0) {
- mutex_unlock(&monc->mutex);
- goto bad;
- }
- req->result = le32_to_cpu(reply->reply_code);
- get_generic_request(req);
- }
- mutex_unlock(&monc->mutex);
- if (req) {
- complete(&req->completion);
- put_generic_request(req);
- }
- return;
-
-bad:
- pr_err("corrupt generic reply, tid %llu\n", tid);
- ceph_msg_dump(msg);
-}
-
-/*
- * Do a synchronous pool op.
- */
-static int do_poolop(struct ceph_mon_client *monc, u32 op,
- u32 pool, u64 snapid,
- char *buf, int len)
-{
- struct ceph_mon_generic_request *req;
- struct ceph_mon_poolop *h;
- int err;
-
- req = kzalloc(sizeof(*req), GFP_NOFS);
- if (!req)
- return -ENOMEM;
-
- kref_init(&req->kref);
- req->buf = buf;
- req->buf_len = len;
- init_completion(&req->completion);
-
- err = -ENOMEM;
- req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS,
- true);
- if (!req->request)
- goto out;
- req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS,
- true);
- if (!req->reply)
- goto out;
-
- /* fill out request */
- req->request->hdr.version = cpu_to_le16(2);
- h = req->request->front.iov_base;
- h->monhdr.have_version = 0;
- h->monhdr.session_mon = cpu_to_le16(-1);
- h->monhdr.session_mon_tid = 0;
- h->fsid = monc->monmap->fsid;
- h->pool = cpu_to_le32(pool);
- h->op = cpu_to_le32(op);
- h->auid = 0;
- h->snapid = cpu_to_le64(snapid);
- h->name_len = 0;
-
- err = do_generic_request(monc, req);
-
-out:
- kref_put(&req->kref, release_generic_request);
- return err;
-}
-
-int ceph_monc_create_snapid(struct ceph_mon_client *monc,
- u32 pool, u64 *snapid)
-{
- return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
- pool, 0, (char *)snapid, sizeof(*snapid));
-
-}
-EXPORT_SYMBOL(ceph_monc_create_snapid);
-
-int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
- u32 pool, u64 snapid)
-{
- return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
- pool, snapid, NULL, 0);
-
-}
-
-/*
* Resend pending generic requests.
*/
static void __resend_generic_request(struct ceph_mon_client *monc)
@@ -1112,10 +988,6 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
handle_get_version_reply(monc, msg);
break;
- case CEPH_MSG_POOLOP_REPLY:
- handle_poolop_reply(monc, msg);
- break;
-
case CEPH_MSG_MON_MAP:
ceph_monc_handle_map(monc, msg);
break;
@@ -1154,7 +1026,6 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
case CEPH_MSG_MON_SUBSCRIBE_ACK:
m = ceph_msg_get(monc->m_subscribe_ack);
break;
- case CEPH_MSG_POOLOP_REPLY:
case CEPH_MSG_STATFS_REPLY:
return get_generic_reply(con, hdr, skip);
case CEPH_MSG_AUTH_REPLY:
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 53299c7b0ca4..41a4abc7e98e 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1035,10 +1035,11 @@ static void put_osd(struct ceph_osd *osd)
{
dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
atomic_read(&osd->o_ref) - 1);
- if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) {
+ if (atomic_dec_and_test(&osd->o_ref)) {
struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
- ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer);
+ if (osd->o_auth.authorizer)
+ ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer);
kfree(osd);
}
}
@@ -1048,14 +1049,24 @@ static void put_osd(struct ceph_osd *osd)
*/
static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
{
- dout("__remove_osd %p\n", osd);
+ dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
WARN_ON(!list_empty(&osd->o_requests));
WARN_ON(!list_empty(&osd->o_linger_requests));
- rb_erase(&osd->o_node, &osdc->osds);
list_del_init(&osd->o_osd_lru);
- ceph_con_close(&osd->o_con);
- put_osd(osd);
+ rb_erase(&osd->o_node, &osdc->osds);
+ RB_CLEAR_NODE(&osd->o_node);
+}
+
+/*
+ * Close the connection to @osd, unlink it with __remove_osd() and drop a
+ * reference via put_osd().  Apart from the debug message this does nothing
+ * when the osd's rb node is already cleared (RB_EMPTY_NODE), i.e. the osd
+ * is no longer in the osdc->osds tree.
+ */
+static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+	dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
+
+	if (RB_EMPTY_NODE(&osd->o_node))
+		return;
+
+	ceph_con_close(&osd->o_con);
+	__remove_osd(osdc, osd);
+	put_osd(osd);
+}
static void remove_all_osds(struct ceph_osd_client *osdc)
@@ -1065,7 +1076,7 @@ static void remove_all_osds(struct ceph_osd_client *osdc)
while (!RB_EMPTY_ROOT(&osdc->osds)) {
struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
struct ceph_osd, o_node);
- __remove_osd(osdc, osd);
+ remove_osd(osdc, osd);
}
mutex_unlock(&osdc->request_mutex);
}
@@ -1106,7 +1117,7 @@ static void remove_old_osds(struct ceph_osd_client *osdc)
list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
if (time_before(jiffies, osd->lru_ttl))
break;
- __remove_osd(osdc, osd);
+ remove_osd(osdc, osd);
}
mutex_unlock(&osdc->request_mutex);
}
@@ -1121,8 +1132,7 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
if (list_empty(&osd->o_requests) &&
list_empty(&osd->o_linger_requests)) {
- __remove_osd(osdc, osd);
-
+ remove_osd(osdc, osd);
return -ENODEV;
}
@@ -1926,6 +1936,7 @@ static void reset_changed_osds(struct ceph_osd_client *osdc)
{
struct rb_node *p, *n;
+ dout("%s %p\n", __func__, osdc);
for (p = rb_first(&osdc->osds); p; p = n) {
struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node);