From ce7791ffee1e1ee9f97193b817c7dd1fa6746aad Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 20 May 2016 01:57:20 +0100 Subject: Btrfs: fix race between readahead and device replace/removal The list of devices is protected by the device_list_mutex and the device replace code, in its finishing phase correctly takes that mutex before removing the source device from that list. However the readahead code was iterating that list without acquiring the respective mutex leading to crashes later on due to invalid memory accesses: [125671.831036] general protection fault: 0000 [#1] PREEMPT SMP [125671.832129] Modules linked in: btrfs dm_flakey dm_mod crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis tpm ppdev evdev parport_pc psmouse sg parport processor ser [125671.834973] CPU: 10 PID: 19603 Comm: kworker/u32:19 Tainted: G W 4.6.0-rc7-btrfs-next-29+ #1 [125671.834973] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014 [125671.834973] Workqueue: btrfs-readahead btrfs_readahead_helper [btrfs] [125671.834973] task: ffff8801ac520540 ti: ffff8801ac918000 task.ti: ffff8801ac918000 [125671.834973] RIP: 0010:[] [] __radix_tree_lookup+0x6a/0x105 [125671.834973] RSP: 0018:ffff8801ac91bc28 EFLAGS: 00010206 [125671.834973] RAX: 0000000000000000 RBX: 6b6b6b6b6b6b6b6a RCX: 0000000000000000 [125671.834973] RDX: 0000000000000000 RSI: 00000000000c1bff RDI: ffff88002ebd62a8 [125671.834973] RBP: ffff8801ac91bc70 R08: 0000000000000001 R09: 0000000000000000 [125671.834973] R10: ffff8801ac91bc70 R11: 0000000000000000 R12: ffff88002ebd62a8 [125671.834973] R13: 0000000000000000 R14: 0000000000000000 R15: 00000000000c1bff [125671.834973] FS: 0000000000000000(0000) GS:ffff88023fd40000(0000) knlGS:0000000000000000 [125671.834973] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [125671.834973] CR2: 000000000073cae4 CR3: 00000000b7723000 CR4: 00000000000006e0 [125671.834973] Stack: [125671.834973] 0000000000000000 ffff8801422d5600 ffff8802286bbc00 0000000000000000 [125671.834973] 0000000000000001 ffff8802286bbc00 00000000000c1bff 0000000000000000 [125671.834973] ffff88002e639eb8 ffff8801ac91bc80 ffffffff81270541 ffff8801ac91bcb0 [125671.834973] Call Trace: [125671.834973] [] radix_tree_lookup+0xd/0xf [125671.834973] [] reada_peer_zones_set_lock+0x3e/0x60 [btrfs] [125671.834973] [] reada_pick_zone+0x29/0x103 [btrfs] [125671.834973] [] reada_start_machine_worker+0x129/0x2d3 [btrfs] [125671.834973] [] btrfs_scrubparity_helper+0x185/0x3aa [btrfs] [125671.834973] [] btrfs_readahead_helper+0xe/0x10 [btrfs] [125671.834973] [] process_one_work+0x271/0x4e9 [125671.834973] [] worker_thread+0x1eb/0x2c9 [125671.834973] [] ? rescuer_thread+0x2b3/0x2b3 [125671.834973] [] kthread+0xd4/0xdc [125671.834973] [] ret_from_fork+0x22/0x40 [125671.834973] [] ? kthread_stop+0x286/0x286 So fix this by taking the device_list_mutex in the readahead code. We can't use here the lighter approach of using a rcu_read_lock() and rcu_read_unlock() pair together with a list_for_each_entry_rcu() call because we end up doing calls to sleeping functions (kzalloc()) in the respective code path. Signed-off-by: Filipe Manana Reviewed-by: Josef Bacik --- fs/btrfs/reada.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 298631eaee78..8428db7cd88f 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -761,12 +761,14 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) do { enqueued = 0; + mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (atomic_read(&device->reada_in_flight) < MAX_IN_FLIGHT) enqueued += reada_start_machine_dev(fs_info, device); } + mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; } while (enqueued && total < 10000); -- cgit v1.2.3 From 57ba4cb85bffc0c7c6567c89d23713721fea9655 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 20 May 2016 04:34:23 +0100 Subject: Btrfs: fix race between device replace and block group removal When it's finishing, the device replace code iterates all extent maps representing block group and for each one that has a stripe that refers to the source device, it replaces its device with the target device. However when it replaces the source device with the target device it, the target device still has an ID of 0ULL (BTRFS_DEV_REPLACE_DEVID), only after its ID is changed to match the one from the source device. This leads to races with the chunk removal code that can temporarly see a device with an ID of 0ULL and then attempt to use that ID to remove items from the device tree and fail, causing a transaction abort: [ 9238.594364] BTRFS info (device sdf): dev_replace from /dev/sdf (devid 3) to /dev/sde finished [ 9238.594377] ------------[ cut here ]------------ [ 9238.594402] WARNING: CPU: 14 PID: 21566 at fs/btrfs/volumes.c:2771 btrfs_remove_chunk+0x2e5/0x793 [btrfs] [ 9238.594403] BTRFS: Transaction aborted (error 1) [ 9238.594416] Modules linked in: btrfs crc32c_generic acpi_cpufreq xor tpm_tis tpm raid6_pq ppdev parport_pc processor psmouse parport i2c_piix4 evdev sg i2c_core se rio_raw pcspkr button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix virtio_pci libata virtio_ring virtio e1000 scsi_mod fl oppy [last unloaded: btrfs] [ 9238.594418] CPU: 14 PID: 21566 Comm: btrfs-cleaner Not tainted 4.6.0-rc7-btrfs-next-29+ #1 [ 9238.594419] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014 [ 9238.594421] 0000000000000000 ffff88017f1dbc60 ffffffff8126b42c ffff88017f1dbcb0 [ 9238.594422] 0000000000000000 ffff88017f1dbca0 ffffffff81052b14 00000ad37f1dbd18 [ 9238.594423] 0000000000000001 ffff88018068a558 ffff88005c4b9c00 ffff880233f60db0 [ 9238.594424] Call Trace: [ 9238.594428] [] dump_stack+0x67/0x90 [ 9238.594430] [] __warn+0xc2/0xdd [ 9238.594432] [] warn_slowpath_fmt+0x4b/0x53 [ 9238.594434] [] ? kmem_cache_free+0x128/0x188 [ 9238.594450] [] btrfs_remove_chunk+0x2e5/0x793 [btrfs] [ 9238.594452] [] ? arch_local_irq_save+0x9/0xc [ 9238.594464] [] btrfs_delete_unused_bgs+0x317/0x382 [btrfs] [ 9238.594476] [] cleaner_kthread+0x1ad/0x1c7 [btrfs] [ 9238.594489] [] ? btree_invalidatepage+0x8e/0x8e [btrfs] [ 9238.594490] [] kthread+0xd4/0xdc [ 9238.594494] [] ret_from_fork+0x22/0x40 [ 9238.594495] [] ? kthread_stop+0x286/0x286 [ 9238.594496] ---[ end trace 183efbe50275f059 ]--- The sequence of steps leading to this is like the following: CPU 1 CPU 2 btrfs_dev_replace_finishing() at this point dev_replace->tgtdev->devid == BTRFS_DEV_REPLACE_DEVID (0ULL) ... btrfs_start_transaction() btrfs_commit_transaction() btrfs_delete_unused_bgs() btrfs_remove_chunk() looks up for the extent map corresponding to the chunk lock_chunks() (chunk_mutex) check_system_chunk() unlock_chunks() (chunk_mutex) locks fs_info->chunk_mutex btrfs_dev_replace_update_device_in_mapping_tree() --> iterates fs_info->mapping_tree and replaces the device in every extent map's map->stripes[] with dev_replace->tgtdev, which still has an id of 0ULL (BTRFS_DEV_REPLACE_DEVID) iterates over all stripes from the extent map --> calls btrfs_free_dev_extent() passing it the target device that still has an ID of 0ULL --> btrfs_free_dev_extent() fails --> aborts current transaction finishes setting up the target device, namely it sets tgtdev->devid to the value of srcdev->devid (which is necessarily > 0) frees the srcdev unlocks fs_info->chunk_mutex So fix this by taking the device list mutex while processing the stripes for the chunk's extent map. This is similar to the race between device replace and block group creation that was fixed by commit 50460e37186a ("Btrfs: fix race when finishing dev replace leading to transaction abort"). Signed-off-by: Filipe Manana Reviewed-by: Josef Bacik --- fs/btrfs/volumes.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9c01824eef08..04ca48362ef1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2761,6 +2761,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 dev_extent_len = 0; u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; int i, ret = 0; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; /* Just in case */ root = root->fs_info->chunk_root; @@ -2787,12 +2788,19 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, check_system_chunk(trans, extent_root, map->type); unlock_chunks(root->fs_info->chunk_root); + /* + * Take the device list mutex to prevent races with the final phase of + * a device replace operation that replaces the device object associated + * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()). + */ + mutex_lock(&fs_devices->device_list_mutex); for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; ret = btrfs_free_dev_extent(trans, device, map->stripes[i].physical, &dev_extent_len); if (ret) { + mutex_unlock(&fs_devices->device_list_mutex); btrfs_abort_transaction(trans, root, ret); goto out; } @@ -2811,11 +2819,14 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, if (map->stripes[i].dev) { ret = btrfs_update_device(trans, map->stripes[i].dev); if (ret) { + mutex_unlock(&fs_devices->device_list_mutex); btrfs_abort_transaction(trans, root, ret); goto out; } } } + mutex_unlock(&fs_devices->device_list_mutex); + ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset); if (ret) { btrfs_abort_transaction(trans, root, ret); -- cgit v1.2.3 From f0e9b7d6401959816599191d1d9db90b6fd750db Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 14 May 2016 09:12:53 +0100 Subject: Btrfs: fix race setting block group readonly during device replace When we do a device replace, for each device extent we find from the source device, we set the corresponding block group to readonly mode to prevent writes into it from happening while we are copying the device extent from the source to the target device. However just before we set the block group to readonly mode some concurrent task might have already allocated an extent from it or decided it could perform a nocow write into one of its extents, which can make the device replace process to miss copying an extent since it uses the extent tree's commit root to search for extents and only once it finishes searching for all extents belonging to the block group it does set the left cursor to the logical end address of the block group - this is a problem if the respective ordered extents finish while we are searching for extents using the extent tree's commit root and no transaction commit happens while we are iterating the tree, since it's the delayed references created by the ordered extents (when they complete) that insert the extent items into the extent tree (using the non-commit root of course). Example: CPU 1 CPU 2 btrfs_dev_replace_start() btrfs_scrub_dev() scrub_enumerate_chunks() --> finds device extent belonging to block group X starts buffered write against some inode writepages is run against that inode forcing dellaloc to run btrfs_writepages() extent_writepages() extent_write_cache_pages() __extent_writepage() writepage_delalloc() run_delalloc_range() cow_file_range() btrfs_reserve_extent() --> allocates an extent from block group X (which is not yet in RO mode) btrfs_add_ordered_extent() --> creates ordered extent Y flush_epd_write_bio() --> bio against the extent from block group X is submitted btrfs_inc_block_group_ro(bg X) --> sets block group X to readonly scrub_chunk(bg X) scrub_stripe(device extent from srcdev) --> keeps searching for extent items belonging to the block group using the extent tree's commit root --> it never blocks due to fs_info->scrub_pause_req as no one tries to commit transaction N --> copies all extents found from the source device into the target device --> finishes search loop bio completes ordered extent Y completes and creates delayed data reference which will add an extent item to the extent tree when run (typically at transaction commit time) --> so the task doing the scrub/device replace at CPU 1 misses this and does not copy this extent into the new/target device btrfs_dec_block_group_ro(bg X) --> turns block group X back to RW mode dev_replace->cursor_left is set to the logical end offset of block group X So fix this by waiting for all cow and nocow writes after setting a block group to readonly mode. Signed-off-by: Filipe Manana Reviewed-by: Josef Bacik --- fs/btrfs/ordered-data.c | 6 +++++- fs/btrfs/ordered-data.h | 2 +- fs/btrfs/scrub.c | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 559170464d7c..e96634a725c3 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -718,12 +718,13 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, return count; } -void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, +int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, const u64 range_start, const u64 range_len) { struct btrfs_root *root; struct list_head splice; int done; + int total_done = 0; INIT_LIST_HEAD(&splice); @@ -742,6 +743,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, done = btrfs_wait_ordered_extents(root, nr, range_start, range_len); btrfs_put_fs_root(root); + total_done += done; spin_lock(&fs_info->ordered_root_lock); if (nr != -1) { @@ -752,6 +754,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, list_splice_tail(&splice, &fs_info->ordered_roots); spin_unlock(&fs_info->ordered_root_lock); mutex_unlock(&fs_info->ordered_operations_mutex); + + return total_done; } /* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 2049c9be85ee..451507776ff5 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -199,7 +199,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum, int len); int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, const u64 range_start, const u64 range_len); -void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, +int btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr, const u64 range_start, const u64 range_len); void btrfs_get_logged_extents(struct inode *inode, struct list_head *logged_list, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 46d847f66e4b..1611572d47bd 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3582,6 +3582,46 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ scrub_pause_on(fs_info); ret = btrfs_inc_block_group_ro(root, cache); + if (!ret && is_dev_replace) { + /* + * If we are doing a device replace wait for any tasks + * that started dellaloc right before we set the block + * group to RO mode, as they might have just allocated + * an extent from it or decided they could do a nocow + * write. And if any such tasks did that, wait for their + * ordered extents to complete and then commit the + * current transaction, so that we can later see the new + * extent items in the extent tree - the ordered extents + * create delayed data references (for cow writes) when + * they complete, which will be run and insert the + * corresponding extent items into the extent tree when + * we commit the transaction they used when running + * inode.c:btrfs_finish_ordered_io(). We later use + * the commit root of the extent tree to find extents + * to copy from the srcdev into the tgtdev, and we don't + * want to miss any new extents. + */ + btrfs_wait_block_group_reservations(cache); + btrfs_wait_nocow_writers(cache); + ret = btrfs_wait_ordered_roots(fs_info, -1, + cache->key.objectid, + cache->key.offset); + if (ret > 0) { + struct btrfs_trans_handle *trans; + + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + ret = PTR_ERR(trans); + else + ret = btrfs_commit_transaction(trans, + root); + if (ret) { + scrub_pause_off(fs_info); + btrfs_put_block_group(cache); + break; + } + } + } scrub_pause_off(fs_info); if (ret == 0) { -- cgit v1.2.3 From 81e87a736c5581e83f52dc054b368993ec7f16d7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 14 May 2016 16:32:35 +0100 Subject: Btrfs: fix unprotected assignment of the left cursor for device replace We were assigning new values to fields of the device replace object without holding the respective lock after processing each device extent. This is important for the left cursor field which can be accessed by a concurrent task running __btrfs_map_block (which, correctly, takes the device replace lock). So change these fields while holding the device replace lock. Signed-off-by: Filipe Manana Reviewed-by: Josef Bacik --- fs/btrfs/scrub.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 1611572d47bd..19a1eda70361 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3642,9 +3642,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, break; } + btrfs_dev_replace_lock(&fs_info->dev_replace, 1); dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; dev_replace->item_needs_writeback = 1; + btrfs_dev_replace_unlock(&fs_info->dev_replace, 1); ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length, found_key.offset, cache, is_dev_replace); @@ -3718,8 +3720,10 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, break; } + btrfs_dev_replace_lock(&fs_info->dev_replace, 1); dev_replace->cursor_left = dev_replace->cursor_right; dev_replace->item_needs_writeback = 1; + btrfs_dev_replace_unlock(&fs_info->dev_replace, 1); skip: key.offset = found_key.offset + length; btrfs_release_path(path); -- cgit v1.2.3 From 1a1a8b732c7e958e6eba0680439e814efde2362d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 14 May 2016 19:44:40 +0100 Subject: Btrfs: fix race setting block group back to RW mode during device replace After it finishes processing a device extent, the device replace code sets back the block group to RW mode and then after that it sets the left cursor to match the logical end address of the block group, so that future writes into extents belonging to the block group go both the source (old) and target (new) devices. However from the moment we turn the block group back to RW mode we have a short time window, that lasts until we update the left cursor's value, where extents can be allocated from the block group and written to, in which case they will not be copied/written to the target (new) device. Fix this by updating the left cursor's value before turning the block group back to RW mode. Signed-off-by: Filipe Manana Reviewed-by: Josef Bacik --- fs/btrfs/scrub.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 19a1eda70361..70427ef66b04 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3682,6 +3682,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, scrub_pause_off(fs_info); + btrfs_dev_replace_lock(&fs_info->dev_replace, 1); + dev_replace->cursor_left = dev_replace->cursor_right; + dev_replace->item_needs_writeback = 1; + btrfs_dev_replace_unlock(&fs_info->dev_replace, 1); + if (ro_set) btrfs_dec_block_group_ro(root, cache); @@ -3719,11 +3724,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, ret = -ENOMEM; break; } - - btrfs_dev_replace_lock(&fs_info->dev_replace, 1); - dev_replace->cursor_left = dev_replace->cursor_right; - dev_replace->item_needs_writeback = 1; - btrfs_dev_replace_unlock(&fs_info->dev_replace, 1); skip: key.offset = found_key.offset + length; btrfs_release_path(path); -- cgit v1.2.3 From 22ab04e814f4fe2ce72a13d291491f98ef6ac757 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 May 2016 20:29:44 +0100 Subject: Btrfs: fix race between device replace and chunk allocation While iterating and copying extents from the source device, the device replace code keeps adjusting a left cursor that is used to make sure that once we finish processing a device extent, any future writes to extents from the corresponding block group will get into both the source and target devices. This left cursor is also used for resuming the device replace operation at mount time. However using this left cursor to decide whether writes go into both devices or only the source device is not enough to guarantee we don't miss copying extents into the target device. There are two cases where the current approach fails. The first one is related to when there are holes in the device and they get allocated for new block groups while the device replace operation is iterating the device extents (more on this explained below). The second one is that when that loop over the device extents finishes, we start dellaloc, wait for all ordered extents and then commit the current transaction, we might have got new block groups allocated that are now using a device extent that has an offset greater then or equals to the value of the left cursor, in which case writes to extents belonging to these new block groups will get issued only to the source device. For the first case where the current approach of using a left cursor fails, consider the source device currently has the following layout: [ extent bg A ] [ hole, unallocated space ] [extent bg B ] 3Gb 4Gb 5Gb While we are iterating the device extents from the source device using the commit root of the device tree, the following happens: CPU 1 CPU 2 scrub_enumerate_chunks() --> searches the device tree for extents belonging to the source device using the device tree's commit root --> 1st iteration finds extent belonging to block group A --> sets block group A to RO mode (btrfs_inc_block_group_ro) --> sets cursor left to found_key.offset which is 3Gb --> scrub_chunk() starts copies all allocated extents from block group's A stripe at source device into target device btrfs_alloc_chunk() --> allocates device extent in the range [4Gb, 5Gb[ from the source device for a new block group C extent allocated from block group C for a direct IO, buffered write or btree node/leaf extent is written to, perhaps in response to a writepages() call from the VM or directly through direct IO the write is made only against the source device and not against the target device because the extent's offset is in the interval [4Gb, 5Gb[ which is larger then the value of cursor_left (3Gb) --> scrub_chunks() finishes --> updates left cursor from 3Gb to 4Gb --> btrfs_dec_block_group_ro() sets block group A back to RW mode --> 2nd iteration finds extent belonging to block group B - it did not find the new extent in the range [4Gb, 5Gb[ for block group C because we are using the device tree's commit root or even because the block group's items are not all yet inserted in the respective btrees, that is, the block group is still attached to some transaction handle's new_bgs list and btrfs_create_pending_block_groups() was not called yet against that transaction handle, so the device extent items were not yet inserted into the devices tree --> so we end not copying anything from the newly allocated device extent from the source device to the target device So fix this by making __btrfs_map_block() always redirect writes to the target device as well, independently of the left cursor's value. With this change the left cursor is now used only for the purpose of tracking progress and allow a mount operation to resume a device replace. Signed-off-by: Filipe Manana Reviewed-by: Josef Bacik --- fs/btrfs/volumes.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 04ca48362ef1..765aabd9145f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5773,20 +5773,17 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } if (found) { - if (physical_of_found + map->stripe_len <= - dev_replace->cursor_left) { - struct btrfs_bio_stripe *tgtdev_stripe = - bbio->stripes + num_stripes; + struct btrfs_bio_stripe *tgtdev_stripe = + bbio->stripes + num_stripes; - tgtdev_stripe->physical = physical_of_found; - tgtdev_stripe->length = - bbio->stripes[index_srcdev].length; - tgtdev_stripe->dev = dev_replace->tgtdev; - bbio->tgtdev_map[index_srcdev] = num_stripes; + tgtdev_stripe->physical = physical_of_found; + tgtdev_stripe->length = + bbio->stripes[index_srcdev].length; + tgtdev_stripe->dev = dev_replace->tgtdev; + bbio->tgtdev_map[index_srcdev] = num_stripes; - tgtdev_indexes++; - num_stripes++; - } + tgtdev_indexes++; + num_stripes++; } } -- cgit v1.2.3 From b7ec35b304b64af2830027350cc99d31e6e537c2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 28 Apr 2016 16:07:25 +0200 Subject: libceph: change ceph_osdmap_flag() to take osdc For the benefit of every single caller, take osdc instead of map. Also, now that osdc->osdmap can't ever be NULL, drop the check. Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 8 ++++---- include/linux/ceph/osd_client.h | 5 +++++ include/linux/ceph/osdmap.h | 5 ----- net/ceph/osd_client.c | 39 +++++++++++++++++++-------------------- 4 files changed, 28 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index a888df6f2d71..8eeb9f579db5 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1349,7 +1349,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) } retry_snap: - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) { err = -ENOSPC; goto out; } @@ -1440,7 +1440,7 @@ retry_snap: ceph_put_cap_refs(ci, got); if (written >= 0) { - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL)) + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_NEARFULL)) iocb->ki_flags |= IOCB_DSYNC; written = generic_write_sync(iocb, written); @@ -1672,8 +1672,8 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; } - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) && - !(mode & FALLOC_FL_PUNCH_HOLE)) { + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && + !(mode & FALLOC_FL_PUNCH_HOLE)) { ret = -ENOSPC; goto unlock; } diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 19b14862d3e0..1b3b6e155392 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -279,6 +279,11 @@ struct ceph_osd_client { struct workqueue_struct *notify_wq; }; +static inline bool ceph_osdmap_flag(struct ceph_osd_client *osdc, int flag) +{ + return osdc->osdmap->flags & flag; +} + extern int ceph_osdc_setup(void); extern void ceph_osdc_cleanup(void); diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index ddc426b22d81..9ccf4dbe55f8 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -189,11 +189,6 @@ static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd) return !ceph_osd_is_up(map, osd); } -static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag) -{ - return map && (map->flags & flag); -} - extern char *ceph_osdmap_state_str(char *str, int len, int state); extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 0160d7d09a1e..79c3bad87e62 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1276,9 +1276,9 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, const struct ceph_osd_request_target *t, struct ceph_pg_pool_info *pi) { - bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); - bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + bool pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD); + bool pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || __pool_full(pi); WARN_ON(pi->id != t->base_oloc.pool); @@ -1303,8 +1303,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, bool force_resend = false; bool need_check_tiering = false; bool need_resend = false; - bool sort_bitwise = ceph_osdmap_flag(osdc->osdmap, - CEPH_OSDMAP_SORTBITWISE); + bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE); enum calc_target_result ct_res; int ret; @@ -1590,9 +1589,9 @@ static void maybe_request_map(struct ceph_osd_client *osdc) verify_osdc_locked(osdc); WARN_ON(!osdc->osdmap->epoch); - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) { + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) || + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { dout("%s osdc %p continuous\n", __func__, osdc); continuous = true; } else { @@ -1629,19 +1628,19 @@ again: } if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) { + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { dout("req %p pausewr\n", req); req->r_t.paused = true; maybe_request_map(osdc); } else if ((req->r_flags & CEPH_OSD_FLAG_READ) && - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) { + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) { dout("req %p pauserd\n", req); req->r_t.paused = true; maybe_request_map(osdc); } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY | CEPH_OSD_FLAG_FULL_FORCE)) && - (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || pool_full(osdc, req->r_t.base_oloc.pool))) { dout("req %p full/pool_full\n", req); pr_warn_ratelimited("FULL or reached pool quota\n"); @@ -2280,7 +2279,7 @@ static void send_linger_ping(struct ceph_osd_linger_request *lreq) struct ceph_osd_request *req = lreq->ping_req; struct ceph_osd_req_op *op = &req->r_ops[0]; - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD)) { + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) { dout("%s PAUSERD\n", __func__); return; } @@ -3050,7 +3049,7 @@ static int handle_one_map(struct ceph_osd_client *osdc, bool skipped_map = false; bool was_full; - was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + was_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL); set_pool_was_full(osdc); if (incremental) @@ -3088,7 +3087,7 @@ static int handle_one_map(struct ceph_osd_client *osdc, osdc->osdmap = newmap; } - was_full &= !ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + was_full &= !ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL); scan_requests(&osdc->homeless_osd, skipped_map, was_full, true, need_resend, need_resend_linger); @@ -3174,9 +3173,9 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) if (ceph_check_fsid(osdc->client, &fsid) < 0) goto bad; - was_pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); - was_pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + was_pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD); + was_pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc); /* incremental maps */ @@ -3238,9 +3237,9 @@ done: * we find out when we are no longer full and stop returning * ENOSPC. */ - pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); - pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD); + pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc); if (was_pauserd || was_pausewr || pauserd || pausewr) maybe_request_map(osdc); -- cgit v1.2.3 From 2999241daa8d77947f108dfbde35c268cd7bd709 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 27 May 2016 17:42:05 +0100 Subject: Btrfs: fix race between device replace and discard While we are finishing a device replace operation, we can make a discard operation (fs mounted with -o discard) do an invalid memory access like the one reported by the following trace: [ 3206.384654] general protection fault: 0000 [#1] PREEMPT SMP [ 3206.387520] Modules linked in: dm_mod btrfs crc32c_generic xor raid6_pq acpi_cpufreq tpm_tis psmouse tpm ppdev sg parport_pc evdev i2c_piix4 parport processor serio_raw i2c_core pcspkr button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom ata_generic sd_mod virtio_scsi ata_piix libata virtio_pci virtio_ring scsi_mod e1000 virtio floppy [last unloaded: btrfs] [ 3206.388595] CPU: 14 PID: 29194 Comm: fsstress Not tainted 4.6.0-rc7-btrfs-next-29+ #1 [ 3206.388595] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014 [ 3206.388595] task: ffff88017ace0100 ti: ffff880171b98000 task.ti: ffff880171b98000 [ 3206.388595] RIP: 0010:[] [] blkdev_issue_discard+0x5c/0x2a7 [ 3206.388595] RSP: 0018:ffff880171b9bb80 EFLAGS: 00010246 [ 3206.388595] RAX: ffff880171b9bc28 RBX: 000000000090d000 RCX: 0000000000000000 [ 3206.388595] RDX: ffffffff82fa1b48 RSI: ffffffff8179f46c RDI: ffffffff82fa1b48 [ 3206.388595] RBP: ffff880171b9bcc0 R08: 0000000000000000 R09: 0000000000000001 [ 3206.388595] R10: ffff880171b9bce0 R11: 000000000090f000 R12: ffff880171b9bbe8 [ 3206.388595] R13: 0000000000000010 R14: 0000000000004868 R15: 6b6b6b6b6b6b6b6b [ 3206.388595] FS: 00007f6182e4e700(0000) GS:ffff88023fdc0000(0000) knlGS:0000000000000000 [ 3206.388595] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3206.388595] CR2: 00007f617c2bbb18 CR3: 000000017ad9c000 CR4: 00000000000006e0 [ 3206.388595] Stack: [ 3206.388595] 0000000000004878 0000000000000000 0000000002400040 0000000000000000 [ 3206.388595] 0000000000000000 ffff880171b9bbe8 ffff880171b9bbb0 ffff880171b9bbb0 [ 3206.388595] ffff880171b9bbc0 ffff880171b9bbc0 ffff880171b9bbd0 ffff880171b9bbd0 [ 3206.388595] Call Trace: [ 3206.388595] [] btrfs_issue_discard+0x12f/0x143 [btrfs] [ 3206.388595] [] ? btrfs_issue_discard+0x12f/0x143 [btrfs] [ 3206.388595] [] btrfs_discard_extent+0x87/0xde [btrfs] [ 3206.388595] [] btrfs_finish_extent_commit+0xb2/0x1df [btrfs] [ 3206.388595] [] ? __mutex_unlock_slowpath+0x150/0x15b [ 3206.388595] [] btrfs_commit_transaction+0x7fc/0x980 [btrfs] [ 3206.388595] [] ? __mutex_unlock_slowpath+0x150/0x15b [ 3206.388595] [] btrfs_sync_file+0x38f/0x428 [btrfs] [ 3206.388595] [] vfs_fsync_range+0x8c/0x9e [ 3206.388595] [] vfs_fsync+0x1c/0x1e [ 3206.388595] [] do_fsync+0x31/0x4a [ 3206.388595] [] SyS_fsync+0x10/0x14 [ 3206.388595] [] entry_SYSCALL_64_fastpath+0x18/0xa8 [ 3206.388595] [] ? time_hardirqs_off+0x9/0x14 [ 3206.388595] [] ? trace_hardirqs_off_caller+0x1f/0xaa This happens because when we call btrfs_map_block() from btrfs_discard_extent() to get a btrfs_bio structure, the device replace operation has not finished yet, but before we use the device of one of the stripes from the returned btrfs_bio structure, the device object is freed. This is illustrated by the following diagram. CPU 1 CPU 2 btrfs_dev_replace_start() (...) btrfs_dev_replace_finishing() btrfs_start_transaction() btrfs_commit_transaction() (...) btrfs_sync_file() btrfs_start_transaction() (...) btrfs_commit_transaction() btrfs_finish_extent_commit() btrfs_discard_extent() btrfs_map_block() --> returns a struct btrfs_bio with a stripe that has a device field pointing to source device of the replace operation (the device that is being replaced) mutex_lock(&uuid_mutex) mutex_lock(&fs_info->fs_devices->device_list_mutex) mutex_lock(&fs_info->chunk_mutex) btrfs_dev_replace_update_device_in_mapping_tree() --> iterates the mapping tree and for each extent map that has a stripe pointing to the source device, it updates the stripe to point to the target device instead btrfs_rm_dev_replace_blocked() --> waits for fs_info->bio_counter to go down to 0 btrfs_rm_dev_replace_remove_srcdev() --> removes source device from the list of devices mutex_unlock(&fs_info->chunk_mutex) mutex_unlock(&fs_info->fs_devices->device_list_mutex) mutex_unlock(&uuid_mutex) btrfs_rm_dev_replace_free_srcdev() --> frees the source device --> iterates over all stripes of the returned struct btrfs_bio --> for each stripe it dereferences its device pointer --> it ends up finding a pointer to the device used as the source device for the replace operation and that was already freed So fix this by surrounding the call to btrfs_map_block(), and the code that uses the returned struct btrfs_bio, with calls to btrfs_bio_counter_inc_blocked() and btrfs_bio_counter_dec(), so that the finishing phase of the device replace operation blocks until the the bio counter decreases to zero before it frees the source device. This is the same approach we do at btrfs_map_bio() for example. Signed-off-by: Filipe Manana Reviewed-by: Josef Bacik --- fs/btrfs/extent-tree.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a400951e8678..689d25ac6a68 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2042,6 +2042,11 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, struct btrfs_bio *bbio = NULL; + /* + * Avoid races with device replace and make sure our bbio has devices + * associated to its stripes that don't go away while we are discarding. + */ + btrfs_bio_counter_inc_blocked(root->fs_info); /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(root->fs_info, REQ_DISCARD, bytenr, &num_bytes, &bbio, 0); @@ -2074,6 +2079,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, } btrfs_put_bbio(bbio); } + btrfs_bio_counter_dec(root->fs_info); if (actual_bytes) *actual_bytes = discarded_bytes; -- cgit v1.2.3 From b5de8d0df80fa87f1f97fbcc4bbc8cad0a018802 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 27 May 2016 22:21:27 +0100 Subject: Btrfs: fix race between device replace and read repair While we are finishing a device replace operation we can have a concurrent task trying to do a read repair operation, in which case it will call btrfs_map_block() to get a struct btrfs_bio which can have a stripe that points to the source device of the device replace operation. This allows for the read repair task to dereference the stripe's device pointer after the device replace operation has freed the source device, resulting in an invalid memory access. This is similar to the problem solved by my previous patch in the same series and named "Btrfs: fix race between device replace and discard". So fix this by surrounding the call to btrfs_map_block() and the code that uses the returned struct btrfs_bio with calls to btrfs_bio_counter_inc_blocked() and btrfs_bio_counter_dec(), giving the proper serialization with the finishing phase of the device replace operation. Signed-off-by: Filipe Manana Reviewed-by: Josef Bacik --- fs/btrfs/extent_io.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3cd57825c75f..6e953de83f08 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2025,9 +2025,16 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, bio->bi_iter.bi_size = 0; map_length = length; + /* + * Avoid races with device replace and make sure our bbio has devices + * associated to its stripes that don't go away while we are doing the + * read repair operation. + */ + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_block(fs_info, WRITE, logical, &map_length, &bbio, mirror_num); if (ret) { + btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; } @@ -2037,6 +2044,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, dev = bbio->stripes[mirror_num-1].dev; btrfs_put_bbio(bbio); if (!dev || !dev->bdev || !dev->writeable) { + btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; } @@ -2045,6 +2053,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) { /* try to remap that extent elsewhere? */ + btrfs_bio_counter_dec(fs_info); bio_put(bio); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); return -EIO; @@ -2054,6 +2063,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, "read error corrected: ino %llu off %llu (dev %s sector %llu)", btrfs_ino(inode), start, rcu_str_deref(dev->name), sector); + btrfs_bio_counter_dec(fs_info); bio_put(bio); return 0; } -- cgit v1.2.3 From d2138455286f9db8fbff9b50b53b51766f6c1f92 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 17 May 2016 11:52:48 +0800 Subject: FS-Cache: wake write waiter after invalidating writes Signed-off-by: Yan, Zheng Acked-by: David Howells --- fs/fscache/page.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 3078b679fcd1..c8c4f79c7ce1 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -887,6 +887,8 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie) put_page(results[i]); } + wake_up_bit(&cookie->flags, 0); + _leave(""); } -- cgit v1.2.3 From 480ce08a70e4179f34808a3bdbfe6627f624cf54 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 20 May 2016 18:32:31 +0800 Subject: FS-Cache: make check_consistency callback return int __fscache_check_consistency() calls check_consistency() callback and return the callback's return value. But the return type of check_consistency() is bool. So __fscache_check_consistency() return 1 if the cache is inconsistent. This is inconsistent with the document. Signed-off-by: Yan, Zheng Acked-by: David Howells --- fs/cachefiles/interface.c | 2 +- include/linux/fscache-cache.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 861d611b8c05..ce5f345d70f5 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -380,7 +380,7 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache) * check if the backing cache is updated to FS-Cache * - called by FS-Cache when evaluates if need to invalidate the cache */ -static bool cachefiles_check_consistency(struct fscache_operation *op) +static int cachefiles_check_consistency(struct fscache_operation *op) { struct cachefiles_object *object; struct cachefiles_cache *cache; diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 604e1526cd00..13ba552e6c09 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -241,7 +241,7 @@ struct fscache_cache_ops { /* check the consistency between the backing cache and the FS-Cache * cookie */ - bool (*check_consistency)(struct fscache_operation *op); + int (*check_consistency)(struct fscache_operation *op); /* store the updated auxiliary data on an object */ void (*update_object)(struct fscache_object *object); -- cgit v1.2.3 From 368e35857dfab264f512d040c4486c9b13297988 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 17 May 2016 11:58:02 +0800 Subject: ceph: call __fscache_uncache_page() if readpages fails If readpages fails, fscache needs to cleanup its internal state. Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index eeb71e5de27a..4ff62fc98fb5 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -276,8 +276,10 @@ static void finish_read(struct ceph_osd_request *req) for (i = 0; i < num_pages; i++) { struct page *page = osd_data->pages[i]; - if (rc < 0 && rc != -ENOENT) + if (rc < 0 && rc != -ENOENT) { + ceph_fscache_readpage_cancel(inode, page); goto unlock; + } if (bytes < (int)PAGE_SIZE) { /* zero (remainder of) page */ int s = bytes < 0 ? 0 : bytes; -- cgit v1.2.3 From 1464975816c79a7cd28dc314384f060a122a9d55 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 20 May 2016 15:41:20 +0800 Subject: ceph: avoid unnecessary fscache invalidation/revlidation ceph_fill_file_size() has already called ceph_fscache_invalidate() if it return true. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index c17b5d76d75e..7bdf7d59a36d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2993,10 +2993,9 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, if (fill_inline) ceph_fill_inline_data(inode, NULL, inline_data, inline_len); - if (queue_trunc) { + if (queue_trunc) ceph_queue_vmtruncate(inode); - ceph_queue_revalidate(inode); - } else if (queue_revalidate) + else if (queue_revalidate) ceph_queue_revalidate(inode); if (writeback) @@ -3199,10 +3198,8 @@ static void handle_cap_trunc(struct inode *inode, truncate_seq, truncate_size, size); spin_unlock(&ci->i_ceph_lock); - if (queue_trunc) { + if (queue_trunc) ceph_queue_vmtruncate(inode); - ceph_fscache_invalidate(inode); - } } /* -- cgit v1.2.3 From 46b59b2be05a71d80d76883d2f495f182d768f47 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 18 May 2016 15:25:03 +0800 Subject: ceph: disable fscache when inode is opened for write All other filesystems do not add dirty pages to fscache. They all disable fscache when inode is opened for write. Only ceph adds dirty pages to fscache, but the code is buggy. Signed-off-by: Yan, Zheng --- fs/ceph/addr.c | 2 -- fs/ceph/cache.c | 54 ++++++++++++++++++++++++++++++++++++++---------------- fs/ceph/cache.h | 30 ++++++++++++------------------ fs/ceph/file.c | 19 ++----------------- 4 files changed, 52 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 4ff62fc98fb5..26a9d10d75e9 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -537,8 +537,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); - ceph_readpage_to_fscache(inode, page); - set_page_writeback(page); err = ceph_osdc_writepages(osdc, ceph_vino(inode), &ci->i_layout, snapc, diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index c052b5bf219b..c19db6afd0cc 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -181,32 +181,26 @@ static const struct fscache_cookie_def ceph_fscache_inode_object_def = { .now_uncached = ceph_fscache_inode_now_uncached, }; -void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, - struct ceph_inode_info* ci) +void ceph_fscache_register_inode_cookie(struct inode *inode) { - struct inode* inode = &ci->vfs_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); /* No caching for filesystem */ if (fsc->fscache == NULL) return; /* Only cache for regular files that are read only */ - if ((ci->vfs_inode.i_mode & S_IFREG) == 0) + if (!S_ISREG(inode->i_mode)) return; - /* Avoid multiple racing open requests */ - inode_lock(inode); - - if (ci->fscache) - goto done; - - ci->fscache = fscache_acquire_cookie(fsc->fscache, - &ceph_fscache_inode_object_def, - ci, true); - fscache_check_consistency(ci->fscache); -done: + inode_lock_nested(inode, I_MUTEX_CHILD); + if (!ci->fscache) { + ci->fscache = fscache_acquire_cookie(fsc->fscache, + &ceph_fscache_inode_object_def, + ci, false); + } inode_unlock(inode); - } void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) @@ -222,6 +216,34 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) fscache_relinquish_cookie(cookie, 0); } +static bool ceph_fscache_can_enable(void *data) +{ + struct inode *inode = data; + return !inode_is_open_for_write(inode); +} + +void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + if (!fscache_cookie_valid(ci->fscache)) + return; + + if (inode_is_open_for_write(inode)) { + dout("fscache_file_set_cookie %p %p disabling cache\n", + inode, filp); + fscache_disable_cookie(ci->fscache, false); + fscache_uncache_all_inode_pages(ci->fscache, inode); + } else { + fscache_enable_cookie(ci->fscache, ceph_fscache_can_enable, + inode); + if (fscache_cookie_enabled(ci->fscache)) { + dout("fscache_file_set_cookie %p %p enabing cache\n", + inode, filp); + } + } +} + static void ceph_vfs_readpage_complete(struct page *page, void *data, int error) { if (!error) diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 5ac591bd012b..dfe9b5823bd3 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -35,9 +35,9 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc); void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); void ceph_fscache_inode_init(struct ceph_inode_info *ci); -void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, - struct ceph_inode_info* ci); +void ceph_fscache_register_inode_cookie(struct inode *inode); void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci); +void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp); int ceph_readpage_from_fscache(struct inode *inode, struct page *page); int ceph_readpages_from_fscache(struct inode *inode, @@ -48,12 +48,6 @@ void ceph_readpage_to_fscache(struct inode *inode, struct page *page); void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); void ceph_queue_revalidate(struct inode *inode); -static inline void ceph_fscache_update_objectsize(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - fscache_attr_changed(ci->fscache); -} - static inline void ceph_fscache_invalidate(struct inode *inode) { fscache_invalidate(ceph_inode(inode)->fscache); @@ -112,8 +106,16 @@ static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) { } -static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc, - struct ceph_inode_info* ci) +static inline void ceph_fscache_register_inode_cookie(struct inode *inode) +{ +} + +static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) +{ +} + +static inline void ceph_fscache_file_set_cookie(struct inode *inode, + struct file *filp) { } @@ -141,10 +143,6 @@ static inline void ceph_readpage_to_fscache(struct inode *inode, { } -static inline void ceph_fscache_update_objectsize(struct inode *inode) -{ -} - static inline void ceph_fscache_invalidate(struct inode *inode) { } @@ -154,10 +152,6 @@ static inline void ceph_invalidate_fscache_page(struct inode *inode, { } -static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) -{ -} - static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) { return 1; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 8eeb9f579db5..ce2f5795e44b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -137,23 +137,11 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) { struct ceph_file_info *cf; int ret = 0; - struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; switch (inode->i_mode & S_IFMT) { case S_IFREG: - /* First file open request creates the cookie, we want to keep - * this cookie around for the filetime of the inode as not to - * have to worry about fscache register / revoke / operation - * races. - * - * Also, if we know the operation is going to invalidate data - * (non readonly) just nuke the cache right away. - */ - ceph_fscache_register_inode_cookie(mdsc->fsc, ci); - if ((fmode & CEPH_FILE_MODE_WR)) - ceph_fscache_invalidate(inode); + ceph_fscache_register_inode_cookie(inode); + ceph_fscache_file_set_cookie(inode, file); case S_IFDIR: dout("init_file %p %p 0%o (regular)\n", inode, file, inode->i_mode); @@ -1407,7 +1395,6 @@ retry_snap: iov_iter_advance(from, written); ceph_put_snap_context(snapc); } else { - loff_t old_size = i_size_read(inode); /* * No need to acquire the i_truncate_mutex. Because * the MDS revokes Fwb caps before sending truncate @@ -1418,8 +1405,6 @@ retry_snap: written = generic_perform_write(file, from, pos); if (likely(written >= 0)) iocb->ki_pos = pos + written; - if (i_size_read(inode) > old_size) - ceph_fscache_update_objectsize(inode); inode_unlock(inode); } -- cgit v1.2.3 From f7f7e7a0635dedd5064fba255cb3facfa87b06d6 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 18 May 2016 20:31:55 +0800 Subject: ceph: improve fscache revalidation There are several issues in fscache revalidation code. - In ceph_revalidate_work(), fscache_invalidate() is called when fscache_check_consistency() return 0. This is complete wrong because 0 means cache is valid. - Handle_cap_grant() calls ceph_queue_revalidate() if client already has CAP_FILE_CACHE. This code is confusing. Client should revalidate the cache each time it got CAP_FILE_CACHE anew. - In Handle_cap_grant(), fscache_invalidate() is called if MDS revokes CAP_FILE_CACHE. This is inconsistency with the case that inode get evicted. In the later case, the cache is not discarded. Client may use the cache when inode is reloaded. This patch moves the fscache revalidation into ceph_get_caps(). Client revalidates the cache after it gets CAP_FILE_CACHE. i_rdcache_gen should keep constance while CAP_FILE_CACHE is used. If i_fscache_gen is not equal to i_rdcache_gen, client needs to check cache's consistency. Signed-off-by: Yan, Zheng --- fs/ceph/cache.c | 84 ++++++++++++--------------------------------------------- fs/ceph/cache.h | 20 +++++++++++--- fs/ceph/caps.c | 16 +++++------ fs/ceph/super.h | 4 +-- 4 files changed, 41 insertions(+), 83 deletions(-) (limited to 'fs') diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index c19db6afd0cc..5b3f4828f214 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -69,15 +69,8 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc) fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, &ceph_fscache_fsid_object_def, fsc, true); - - if (fsc->fscache == NULL) { + if (!fsc->fscache) pr_err("Unable to resgister fsid: %p fscache cookie", fsc); - return 0; - } - - fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1); - if (fsc->revalidate_wq == NULL) - return -ENOMEM; return 0; } @@ -260,8 +253,7 @@ static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int static inline bool cache_valid(struct ceph_inode_info *ci) { - return ((ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE) && - (ci->i_fscache_gen == ci->i_rdcache_gen)); + return ci->i_fscache_gen == ci->i_rdcache_gen; } @@ -354,69 +346,27 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) { - if (fsc->revalidate_wq) - destroy_workqueue(fsc->revalidate_wq); - fscache_relinquish_cookie(fsc->fscache, 0); fsc->fscache = NULL; } -static void ceph_revalidate_work(struct work_struct *work) -{ - int issued; - u32 orig_gen; - struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info, - i_revalidate_work); - struct inode *inode = &ci->vfs_inode; - - spin_lock(&ci->i_ceph_lock); - issued = __ceph_caps_issued(ci, NULL); - orig_gen = ci->i_rdcache_gen; - spin_unlock(&ci->i_ceph_lock); - - if (!(issued & CEPH_CAP_FILE_CACHE)) { - dout("revalidate_work lost cache before validation %p\n", - inode); - goto out; - } - - if (!fscache_check_consistency(ci->fscache)) - fscache_invalidate(ci->fscache); - - spin_lock(&ci->i_ceph_lock); - /* Update the new valid generation (backwards sanity check too) */ - if (orig_gen > ci->i_fscache_gen) { - ci->i_fscache_gen = orig_gen; - } - spin_unlock(&ci->i_ceph_lock); - -out: - iput(&ci->vfs_inode); -} - -void ceph_queue_revalidate(struct inode *inode) +/* + * caller should hold CEPH_CAP_FILE_{RD,CACHE} + */ +void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) { - struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); - struct ceph_inode_info *ci = ceph_inode(inode); - - if (fsc->revalidate_wq == NULL || ci->fscache == NULL) + if (cache_valid(ci)) return; - ihold(inode); - - if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq, - &ci->i_revalidate_work)) { - dout("ceph_queue_revalidate %p\n", inode); - } else { - dout("ceph_queue_revalidate %p failed\n)", inode); - iput(inode); + /* resue i_truncate_mutex. There should be no pending + * truncate while the caller holds CEPH_CAP_FILE_RD */ + mutex_lock(&ci->i_truncate_mutex); + if (!cache_valid(ci)) { + if (fscache_check_consistency(ci->fscache)) + fscache_invalidate(ci->fscache); + spin_lock(&ci->i_ceph_lock); + ci->i_fscache_gen = ci->i_rdcache_gen; + spin_unlock(&ci->i_ceph_lock); } -} - -void ceph_fscache_inode_init(struct ceph_inode_info *ci) -{ - ci->fscache = NULL; - /* The first load is verifed cookie open time */ - ci->i_fscache_gen = 1; - INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work); + mutex_unlock(&ci->i_truncate_mutex); } diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index dfe9b5823bd3..7e72c7594f0c 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -34,10 +34,10 @@ void ceph_fscache_unregister(void); int ceph_fscache_register_fs(struct ceph_fs_client* fsc); void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); -void ceph_fscache_inode_init(struct ceph_inode_info *ci); void ceph_fscache_register_inode_cookie(struct inode *inode); void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci); void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp); +void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci); int ceph_readpage_from_fscache(struct inode *inode, struct page *page); int ceph_readpages_from_fscache(struct inode *inode, @@ -46,7 +46,12 @@ int ceph_readpages_from_fscache(struct inode *inode, unsigned *nr_pages); void ceph_readpage_to_fscache(struct inode *inode, struct page *page); void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); -void ceph_queue_revalidate(struct inode *inode); + +static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) +{ + ci->fscache = NULL; + ci->i_fscache_gen = 0; +} static inline void ceph_fscache_invalidate(struct inode *inode) { @@ -82,6 +87,11 @@ static inline void ceph_fscache_readpages_cancel(struct inode *inode, return fscache_readpages_cancel(ci->fscache, pages); } +static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci) +{ + ci->i_fscache_gen = ci->i_rdcache_gen - 1; +} + #else static inline int ceph_fscache_register(void) @@ -119,6 +129,10 @@ static inline void ceph_fscache_file_set_cookie(struct inode *inode, { } +static inline void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) +{ +} + static inline void ceph_fscache_uncache_page(struct inode *inode, struct page *pages) { @@ -167,7 +181,7 @@ static inline void ceph_fscache_readpages_cancel(struct inode *inode, { } -static inline void ceph_queue_revalidate(struct inode *inode) +static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci) { } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7bdf7d59a36d..6f60d0a3d0f9 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2393,6 +2393,9 @@ again: snap_rwsem_locked = true; } *got = need | (have & want); + if ((need & CEPH_CAP_FILE_RD) && + !(*got & CEPH_CAP_FILE_CACHE)) + ceph_disable_fscache_readpage(ci); __take_cap_refs(ci, *got, true); ret = 1; } @@ -2554,6 +2557,9 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, break; } + if ((_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE)) + ceph_fscache_revalidate_cookie(ci); + *got = _got; return 0; } @@ -2795,7 +2801,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, bool writeback = false; bool queue_trunc = false; bool queue_invalidate = false; - bool queue_revalidate = false; bool deleted_inode = false; bool fill_inline = false; @@ -2837,8 +2842,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, ci->i_rdcache_revoking = ci->i_rdcache_gen; } } - - ceph_fscache_invalidate(inode); } /* side effects now are allowed */ @@ -2880,11 +2883,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, } } - /* Do we need to revalidate our fscache cookie. Don't bother on the - * first cache cap as we already validate at cookie creation time. */ - if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1) - queue_revalidate = true; - if (newcaps & CEPH_CAP_ANY_RD) { /* ctime/mtime/atime? */ ceph_decode_timespec(&mtime, &grant->mtime); @@ -2995,8 +2993,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, if (queue_trunc) ceph_queue_vmtruncate(inode); - else if (queue_revalidate) - ceph_queue_revalidate(inode); if (writeback) /* diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0130a8592191..0168b49fb6ad 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -103,7 +103,6 @@ struct ceph_fs_client { #ifdef CONFIG_CEPH_FSCACHE struct fscache_cookie *fscache; - struct workqueue_struct *revalidate_wq; #endif }; @@ -360,8 +359,7 @@ struct ceph_inode_info { #ifdef CONFIG_CEPH_FSCACHE struct fscache_cookie *fscache; - u32 i_fscache_gen; /* sequence, for delayed fscache validate */ - struct work_struct i_revalidate_work; + u32 i_fscache_gen; #endif struct inode vfs_inode; /* at end */ }; -- cgit v1.2.3 From f6973c09490c919398fc87d9c6bec9c0b670c4c4 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 20 May 2016 16:57:29 +0800 Subject: ceph: use i_version to check validity of fscache Signed-off-by: Yan, Zheng --- fs/ceph/cache.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 5b3f4828f214..238c55b01723 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -25,6 +25,7 @@ #include "cache.h" struct ceph_aux_inode { + u64 version; struct timespec mtime; loff_t size; }; @@ -98,6 +99,7 @@ static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data, const struct inode* inode = &ci->vfs_inode; memset(&aux, 0, sizeof(aux)); + aux.version = ci->i_version; aux.mtime = inode->i_mtime; aux.size = i_size_read(inode); @@ -124,6 +126,7 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( return FSCACHE_CHECKAUX_OBSOLETE; memset(&aux, 0, sizeof(aux)); + aux.version = ci->i_version; aux.mtime = inode->i_mtime; aux.size = i_size_read(inode); -- cgit v1.2.3 From 8dff9c85341032767d7b519217a79ea04cd676b0 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Sat, 19 Sep 2015 11:28:25 -0700 Subject: Btrfs: deal with duplciates during extent_map insertion in btrfs_get_extent When dealing with inline extents, btrfs_get_extent will incorrectly try to insert a duplicate extent_map. The dup hits -EEXIST from add_extent_map, but then we try to merge with the existing one and end up trying to insert a zero length extent_map. This actually works most of the time, except when there are extent maps past the end of the inline extent. rocksdb will trigger this sometimes because it preallocates an extent and then truncates down. Josef made a script to trigger with xfs_io: #!/bin/bash xfs_io -f -c "pwrite 0 1000" inline xfs_io -c "falloc -k 4k 1M" inline xfs_io -c "pread 0 1000" -c "fadvise -d 0 1000" -c "pread 0 1000" inline xfs_io -c "fadvise -d 0 1000" inline cat inline You'll get EIOs trying to read inline after this because add_extent_map is returning EEXIST Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2debd42c4660..e5558d9e396e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6979,7 +6979,18 @@ insert: * existing will always be non-NULL, since there must be * extent causing the -EEXIST. */ - if (start >= extent_map_end(existing) || + if (existing->start == em->start && + extent_map_end(existing) == extent_map_end(em) && + em->block_start == existing->block_start) { + /* + * these two extents are the same, it happens + * with inlines especially + */ + free_extent_map(em); + em = existing; + err = 0; + + } else if (start >= extent_map_end(existing) || start <= existing->start) { /* * The existing extent map is the one nearest to -- cgit v1.2.3 From eedf265aa003b4781de24cfed40a655a664457e6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 2 Jun 2016 10:29:47 -0500 Subject: devpts: Make each mount of devpts an independent filesystem. The /dev/ptmx device node is changed to lookup the directory entry "pts" in the same directory as the /dev/ptmx device node was opened in. If there is a "pts" entry and that entry is a devpts filesystem /dev/ptmx uses that filesystem. Otherwise the open of /dev/ptmx fails. The DEVPTS_MULTIPLE_INSTANCES configuration option is removed, so that userspace can now safely depend on each mount of devpts creating a new instance of the filesystem. Each mount of devpts is now a separate and equal filesystem. Reserved ttys are now available to all instances of devpts where the mounter is in the initial mount namespace. A new vfs helper path_pts is introduced that finds a directory entry named "pts" in the directory of the passed in path, and changes the passed in path to point to it. The helper path_pts uses a function path_parent_directory that was factored out of follow_dotdot. In the implementation of devpts: - devpts_mnt is killed as it is no longer meaningful if all mounts of devpts are equal. - pts_sb_from_inode is replaced by just inode->i_sb as all cached inodes in the tty layer are now from the devpts filesystem. - devpts_add_ref is rolled into the new function devpts_ptmx. And the unnecessary inode hold is removed. - devpts_del_ref is renamed devpts_release and reduced to just a deacrivate_super. - The newinstance mount option continues to be accepted but is now ignored. In devpts_fs.h definitions for when !CONFIG_UNIX98_PTYS are removed as they are never used. Documentation/filesystems/devices.txt is updated to describe the current situation. This has been verified to work properly on openwrt-15.05, centos5, centos6, centos7, debian-6.0.2, debian-7.9, debian-8.2, ubuntu-14.04.3, ubuntu-15.10, fedora23, magia-5, mint-17.3, opensuse-42.1, slackware-14.1, gentoo-20151225 (13.0?), archlinux-2015-12-01. With the caveat that on centos6 and on slackware-14.1 that there wind up being two instances of the devpts filesystem mounted on /dev/pts, the lower copy does not end up getting used. Signed-off-by: "Eric W. Biederman" Cc: Greg KH Cc: Peter Hurley Cc: Peter Anvin Cc: Andy Lutomirski Cc: Al Viro Cc: Serge Hallyn Cc: Willy Tarreau Cc: Aurelien Jarno Cc: One Thousand Gnomes Cc: Jann Horn Cc: Jiri Slaby Cc: Florian Weimer Cc: Konstantin Khlebnikov Signed-off-by: Linus Torvalds --- Documentation/filesystems/devpts.txt | 145 +++----------------------- drivers/tty/Kconfig | 11 -- drivers/tty/pty.c | 15 +-- fs/devpts/inode.c | 191 ++++++++++------------------------- fs/namei.c | 49 +++++++-- include/linux/devpts_fs.h | 9 +- include/linux/namei.h | 2 + 7 files changed, 126 insertions(+), 296 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/devpts.txt b/Documentation/filesystems/devpts.txt index 30d2fcb32f72..9f94fe276dea 100644 --- a/Documentation/filesystems/devpts.txt +++ b/Documentation/filesystems/devpts.txt @@ -1,141 +1,26 @@ +Each mount of the devpts filesystem is now distinct such that ptys +and their indicies allocated in one mount are independent from ptys +and their indicies in all other mounts. -To support containers, we now allow multiple instances of devpts filesystem, -such that indices of ptys allocated in one instance are independent of indices -allocated in other instances of devpts. +All mounts of the devpts filesystem now create a /dev/pts/ptmx node +with permissions 0000. -To preserve backward compatibility, this support for multiple instances is -enabled only if: +To retain backwards compatibility the a ptmx device node (aka any node +created with "mknod name c 5 2") when opened will look for an instance +of devpts under the name "pts" in the same directory as the ptmx device +node. - - CONFIG_DEVPTS_MULTIPLE_INSTANCES=y, and - - '-o newinstance' mount option is specified while mounting devpts - -IOW, devpts now supports both single-instance and multi-instance semantics. - -If CONFIG_DEVPTS_MULTIPLE_INSTANCES=n, there is no change in behavior and -this referred to as the "legacy" mode. In this mode, the new mount options -(-o newinstance and -o ptmxmode) will be ignored with a 'bogus option' message -on console. - -If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and devpts is mounted without the -'newinstance' option (as in current start-up scripts) the new mount binds -to the initial kernel mount of devpts. This mode is referred to as the -'single-instance' mode and the current, single-instance semantics are -preserved, i.e PTYs are common across the system. - -The only difference between this single-instance mode and the legacy mode -is the presence of new, '/dev/pts/ptmx' node with permissions 0000, which -can safely be ignored. - -If CONFIG_DEVPTS_MULTIPLE_INSTANCES=y and 'newinstance' option is specified, -the mount is considered to be in the multi-instance mode and a new instance -of the devpts fs is created. Any ptys created in this instance are independent -of ptys in other instances of devpts. Like in the single-instance mode, the -/dev/pts/ptmx node is present. To effectively use the multi-instance mode, -open of /dev/ptmx must be a redirected to '/dev/pts/ptmx' using a symlink or -bind-mount. - -Eg: A container startup script could do the following: - - $ chmod 0666 /dev/pts/ptmx - $ rm /dev/ptmx - $ ln -s pts/ptmx /dev/ptmx - $ ns_exec -cm /bin/bash - - # We are now in new container - - $ umount /dev/pts - $ mount -t devpts -o newinstance lxcpts /dev/pts - $ sshd -p 1234 - -where 'ns_exec -cm /bin/bash' calls clone() with CLONE_NEWNS flag and execs -/bin/bash in the child process. A pty created by the sshd is not visible in -the original mount of /dev/pts. +As an option instead of placing a /dev/ptmx device node at /dev/ptmx +it is possible to place a symlink to /dev/pts/ptmx at /dev/ptmx or +to bind mount /dev/ptx/ptmx to /dev/ptmx. If you opt for using +the devpts filesystem in this manner devpts should be mounted with +the ptmxmode=0666, or chmod 0666 /dev/pts/ptmx should be called. Total count of pty pairs in all instances is limited by sysctls: kernel.pty.max = 4096 - global limit -kernel.pty.reserve = 1024 - reserve for initial instance +kernel.pty.reserve = 1024 - reserved for filesystems mounted from the initial mount namespace kernel.pty.nr - current count of ptys Per-instance limit could be set by adding mount option "max=". This feature was added in kernel 3.4 together with sysctl kernel.pty.reserve. In kernels older than 3.4 sysctl kernel.pty.max works as per-instance limit. - -User-space changes ------------------- - -In multi-instance mode (i.e '-o newinstance' mount option is specified at least -once), following user-space issues should be noted. - -1. If -o newinstance mount option is never used, /dev/pts/ptmx can be ignored - and no change is needed to system-startup scripts. - -2. To effectively use multi-instance mode (i.e -o newinstance is specified) - administrators or startup scripts should "redirect" open of /dev/ptmx to - /dev/pts/ptmx using either a bind mount or symlink. - - $ mount -t devpts -o newinstance devpts /dev/pts - - followed by either - - $ rm /dev/ptmx - $ ln -s pts/ptmx /dev/ptmx - $ chmod 666 /dev/pts/ptmx - or - $ mount -o bind /dev/pts/ptmx /dev/ptmx - -3. The '/dev/ptmx -> pts/ptmx' symlink is the preferred method since it - enables better error-reporting and treats both single-instance and - multi-instance mounts similarly. - - But this method requires that system-startup scripts set the mode of - /dev/pts/ptmx correctly (default mode is 0000). The scripts can set the - mode by, either - - - adding ptmxmode mount option to devpts entry in /etc/fstab, or - - using 'chmod 0666 /dev/pts/ptmx' - -4. If multi-instance mode mount is needed for containers, but the system - startup scripts have not yet been updated, container-startup scripts - should bind mount /dev/ptmx to /dev/pts/ptmx to avoid breaking single- - instance mounts. - - Or, in general, container-startup scripts should use: - - mount -t devpts -o newinstance -o ptmxmode=0666 devpts /dev/pts - if [ ! -L /dev/ptmx ]; then - mount -o bind /dev/pts/ptmx /dev/ptmx - fi - - When all devpts mounts are multi-instance, /dev/ptmx can permanently be - a symlink to pts/ptmx and the bind mount can be ignored. - -5. A multi-instance mount that is not accompanied by the /dev/ptmx to - /dev/pts/ptmx redirection would result in an unusable/unreachable pty. - - mount -t devpts -o newinstance lxcpts /dev/pts - - immediately followed by: - - open("/dev/ptmx") - - would create a pty, say /dev/pts/7, in the initial kernel mount. - But /dev/pts/7 would be invisible in the new mount. - -6. The permissions for /dev/pts/ptmx node should be specified when mounting - /dev/pts, using the '-o ptmxmode=%o' mount option (default is 0000). - - mount -t devpts -o newinstance -o ptmxmode=0644 devpts /dev/pts - - The permissions can be later be changed as usual with 'chmod'. - - chmod 666 /dev/pts/ptmx - -7. A mount of devpts without the 'newinstance' option results in binding to - initial kernel mount. This behavior while preserving legacy semantics, - does not provide strict isolation in a container environment. i.e by - mounting devpts without the 'newinstance' option, a container could - get visibility into the 'host' or root container's devpts. - - To workaround this and have strict isolation, all mounts of devpts, - including the mount in the root container, should use the newinstance - option. diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig index 82c4d2e45319..95103054c0e4 100644 --- a/drivers/tty/Kconfig +++ b/drivers/tty/Kconfig @@ -120,17 +120,6 @@ config UNIX98_PTYS All modern Linux systems use the Unix98 ptys. Say Y unless you're on an embedded system and want to conserve memory. -config DEVPTS_MULTIPLE_INSTANCES - bool "Support multiple instances of devpts" - depends on UNIX98_PTYS - default n - ---help--- - Enable support for multiple instances of devpts filesystem. - If you want to have isolated PTY namespaces (eg: in containers), - say Y here. Otherwise, say N. If enabled, each mount of devpts - filesystem with the '-o newinstance' option will create an - independent PTY namespace. - config LEGACY_PTYS bool "Legacy (BSD) PTY support" default y diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index dd4b8417e7f4..f856c4544eea 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -668,7 +668,7 @@ static void pty_unix98_remove(struct tty_driver *driver, struct tty_struct *tty) else fsi = tty->link->driver_data; devpts_kill_index(fsi, tty->index); - devpts_put_ref(fsi); + devpts_release(fsi); } static const struct tty_operations ptm_unix98_ops = { @@ -733,10 +733,11 @@ static int ptmx_open(struct inode *inode, struct file *filp) if (retval) return retval; - fsi = devpts_get_ref(inode, filp); - retval = -ENODEV; - if (!fsi) + fsi = devpts_acquire(filp); + if (IS_ERR(fsi)) { + retval = PTR_ERR(fsi); goto out_free_file; + } /* find a device that is not in use. */ mutex_lock(&devpts_mutex); @@ -745,7 +746,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) retval = index; if (index < 0) - goto out_put_ref; + goto out_put_fsi; mutex_lock(&tty_mutex); @@ -789,8 +790,8 @@ err_release: return retval; out: devpts_kill_index(fsi, index); -out_put_ref: - devpts_put_ref(fsi); +out_put_fsi: + devpts_release(fsi); out_free_file: tty_free_file(filp); return retval; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 0b2954d7172d..37c134a132c7 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -95,8 +95,6 @@ static struct ctl_table pty_root_table[] = { static DEFINE_MUTEX(allocated_ptys_lock); -static struct vfsmount *devpts_mnt; - struct pts_mount_opts { int setuid; int setgid; @@ -104,7 +102,7 @@ struct pts_mount_opts { kgid_t gid; umode_t mode; umode_t ptmxmode; - int newinstance; + int reserve; int max; }; @@ -117,11 +115,9 @@ static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_mode, "mode=%o"}, -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES {Opt_ptmxmode, "ptmxmode=%o"}, {Opt_newinstance, "newinstance"}, {Opt_max, "max=%d"}, -#endif {Opt_err, NULL} }; @@ -137,15 +133,48 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) return sb->s_fs_info; } -static inline struct super_block *pts_sb_from_inode(struct inode *inode) +struct pts_fs_info *devpts_acquire(struct file *filp) { -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES - if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) - return inode->i_sb; -#endif - if (!devpts_mnt) - return NULL; - return devpts_mnt->mnt_sb; + struct pts_fs_info *result; + struct path path; + struct super_block *sb; + int err; + + path = filp->f_path; + path_get(&path); + + /* Has the devpts filesystem already been found? */ + sb = path.mnt->mnt_sb; + if (sb->s_magic != DEVPTS_SUPER_MAGIC) { + /* Is a devpts filesystem at "pts" in the same directory? */ + err = path_pts(&path); + if (err) { + result = ERR_PTR(err); + goto out; + } + + /* Is the path the root of a devpts filesystem? */ + result = ERR_PTR(-ENODEV); + sb = path.mnt->mnt_sb; + if ((sb->s_magic != DEVPTS_SUPER_MAGIC) || + (path.mnt->mnt_root != sb->s_root)) + goto out; + } + + /* + * pty code needs to hold extra references in case of last /dev/tty close + */ + atomic_inc(&sb->s_active); + result = DEVPTS_SB(sb); + +out: + path_put(&path); + return result; +} + +void devpts_release(struct pts_fs_info *fsi) +{ + deactivate_super(fsi->sb); } #define PARSE_MOUNT 0 @@ -154,9 +183,7 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode) /* * parse_mount_options(): * Set @opts to mount options specified in @data. If an option is not - * specified in @data, set it to its default value. The exception is - * 'newinstance' option which can only be set/cleared on a mount (i.e. - * cannot be changed during remount). + * specified in @data, set it to its default value. * * Note: @data may be NULL (in which case all options are set to default). */ @@ -174,9 +201,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; opts->max = NR_UNIX98_PTY_MAX; - /* newinstance makes sense only on initial mount */ + /* Only allow instances mounted from the initial mount + * namespace to tap the reserve pool of ptys. + */ if (op == PARSE_MOUNT) - opts->newinstance = 0; + opts->reserve = + (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns); while ((p = strsep(&data, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; @@ -211,16 +241,12 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) return -EINVAL; opts->mode = option & S_IALLUGO; break; -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES case Opt_ptmxmode: if (match_octal(&args[0], &option)) return -EINVAL; opts->ptmxmode = option & S_IALLUGO; break; case Opt_newinstance: - /* newinstance makes sense only on initial mount */ - if (op == PARSE_MOUNT) - opts->newinstance = 1; break; case Opt_max: if (match_int(&args[0], &option) || @@ -228,7 +254,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) return -EINVAL; opts->max = option; break; -#endif default: pr_err("called with bogus options\n"); return -EINVAL; @@ -238,7 +263,6 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) return 0; } -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES static int mknod_ptmx(struct super_block *sb) { int mode; @@ -305,12 +329,6 @@ static void update_ptmx_mode(struct pts_fs_info *fsi) inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode; } } -#else -static inline void update_ptmx_mode(struct pts_fs_info *fsi) -{ - return; -} -#endif static int devpts_remount(struct super_block *sb, int *flags, char *data) { @@ -344,11 +362,9 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, opts->gid)); seq_printf(seq, ",mode=%03o", opts->mode); -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode); if (opts->max < NR_UNIX98_PTY_MAX) seq_printf(seq, ",max=%d", opts->max); -#endif return 0; } @@ -410,40 +426,11 @@ fail: return -ENOMEM; } -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES -static int compare_init_pts_sb(struct super_block *s, void *p) -{ - if (devpts_mnt) - return devpts_mnt->mnt_sb == s; - return 0; -} - /* * devpts_mount() * - * If the '-o newinstance' mount option was specified, mount a new - * (private) instance of devpts. PTYs created in this instance are - * independent of the PTYs in other devpts instances. - * - * If the '-o newinstance' option was not specified, mount/remount the - * initial kernel mount of devpts. This type of mount gives the - * legacy, single-instance semantics. - * - * The 'newinstance' option is needed to support multiple namespace - * semantics in devpts while preserving backward compatibility of the - * current 'single-namespace' semantics. i.e all mounts of devpts - * without the 'newinstance' mount option should bind to the initial - * kernel mount, like mount_single(). - * - * Mounts with 'newinstance' option create a new, private namespace. - * - * NOTE: - * - * For single-mount semantics, devpts cannot use mount_single(), - * because mount_single()/sget() find and use the super-block from - * the most recent mount of devpts. But that recent mount may be a - * 'newinstance' mount and mount_single() would pick the newinstance - * super-block instead of the initial super-block. + * Mount a new (private) instance of devpts. PTYs created in this + * instance are independent of the PTYs in other devpts instances. */ static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) @@ -456,18 +443,7 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type, if (error) return ERR_PTR(error); - /* Require newinstance for all user namespace mounts to ensure - * the mount options are not changed. - */ - if ((current_user_ns() != &init_user_ns) && !opts.newinstance) - return ERR_PTR(-EINVAL); - - if (opts.newinstance) - s = sget(fs_type, NULL, set_anon_super, flags, NULL); - else - s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags, - NULL); - + s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); @@ -491,18 +467,6 @@ out_undo_sget: return ERR_PTR(error); } -#else -/* - * This supports only the legacy single-instance semantics (no - * multiple-instance semantics) - */ -static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) -{ - return mount_single(fs_type, flags, data, devpts_fill_super); -} -#endif - static void devpts_kill_sb(struct super_block *sb) { struct pts_fs_info *fsi = DEVPTS_SB(sb); @@ -516,9 +480,7 @@ static struct file_system_type devpts_fs_type = { .name = "devpts", .mount = devpts_mount, .kill_sb = devpts_kill_sb, -#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES .fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT, -#endif }; /* @@ -531,16 +493,13 @@ int devpts_new_index(struct pts_fs_info *fsi) int index; int ida_ret; - if (!fsi) - return -ENODEV; - retry: if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) return -ENOMEM; mutex_lock(&allocated_ptys_lock); - if (pty_count >= pty_limit - - (fsi->mount_opts.newinstance ? pty_reserve : 0)) { + if (pty_count >= (pty_limit - + (fsi->mount_opts.reserve ? 0 : pty_reserve))) { mutex_unlock(&allocated_ptys_lock); return -ENOSPC; } @@ -571,30 +530,6 @@ void devpts_kill_index(struct pts_fs_info *fsi, int idx) mutex_unlock(&allocated_ptys_lock); } -/* - * pty code needs to hold extra references in case of last /dev/tty close - */ -struct pts_fs_info *devpts_get_ref(struct inode *ptmx_inode, struct file *file) -{ - struct super_block *sb; - struct pts_fs_info *fsi; - - sb = pts_sb_from_inode(ptmx_inode); - if (!sb) - return NULL; - fsi = DEVPTS_SB(sb); - if (!fsi) - return NULL; - - atomic_inc(&sb->s_active); - return fsi; -} - -void devpts_put_ref(struct pts_fs_info *fsi) -{ - deactivate_super(fsi->sb); -} - /** * devpts_pty_new -- create a new inode in /dev/pts/ * @ptmx_inode: inode of the master @@ -607,16 +542,12 @@ void devpts_put_ref(struct pts_fs_info *fsi) struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) { struct dentry *dentry; - struct super_block *sb; + struct super_block *sb = fsi->sb; struct inode *inode; struct dentry *root; struct pts_mount_opts *opts; char s[12]; - if (!fsi) - return ERR_PTR(-ENODEV); - - sb = fsi->sb; root = sb->s_root; opts = &fsi->mount_opts; @@ -676,20 +607,8 @@ void devpts_pty_kill(struct dentry *dentry) static int __init init_devpts_fs(void) { int err = register_filesystem(&devpts_fs_type); - struct ctl_table_header *table; - if (!err) { - struct vfsmount *mnt; - - table = register_sysctl_table(pty_root_table); - mnt = kern_mount(&devpts_fs_type); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); - unregister_filesystem(&devpts_fs_type); - unregister_sysctl_table(table); - } else { - devpts_mnt = mnt; - } + register_sysctl_table(pty_root_table); } return err; } diff --git a/fs/namei.c b/fs/namei.c index 4c4f95ac8aa5..6a82fb7e2127 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1416,21 +1416,28 @@ static void follow_mount(struct path *path) } } +static int path_parent_directory(struct path *path) +{ + struct dentry *old = path->dentry; + /* rare case of legitimate dget_parent()... */ + path->dentry = dget_parent(path->dentry); + dput(old); + if (unlikely(!path_connected(path))) + return -ENOENT; + return 0; +} + static int follow_dotdot(struct nameidata *nd) { while(1) { - struct dentry *old = nd->path.dentry; - if (nd->path.dentry == nd->root.dentry && nd->path.mnt == nd->root.mnt) { break; } if (nd->path.dentry != nd->path.mnt->mnt_root) { - /* rare case of legitimate dget_parent()... */ - nd->path.dentry = dget_parent(nd->path.dentry); - dput(old); - if (unlikely(!path_connected(&nd->path))) - return -ENOENT; + int ret = path_parent_directory(&nd->path); + if (ret) + return ret; break; } if (!follow_up(&nd->path)) @@ -2514,6 +2521,34 @@ struct dentry *lookup_one_len_unlocked(const char *name, } EXPORT_SYMBOL(lookup_one_len_unlocked); +#ifdef CONFIG_UNIX98_PTYS +int path_pts(struct path *path) +{ + /* Find something mounted on "pts" in the same directory as + * the input path. + */ + struct dentry *child, *parent; + struct qstr this; + int ret; + + ret = path_parent_directory(path); + if (ret) + return ret; + + parent = path->dentry; + this.name = "pts"; + this.len = 3; + child = d_hash_and_lookup(parent, &this); + if (!child) + return -ENOENT; + + path->dentry = child; + dput(parent); + follow_mount(path); + return 0; +} +#endif + int user_path_at_empty(int dfd, const char __user *name, unsigned flags, struct path *path, int *empty) { diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index 5871f292b596..277ab9af9ac2 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -15,13 +15,12 @@ #include -struct pts_fs_info; - #ifdef CONFIG_UNIX98_PTYS -/* Look up a pts fs info and get a ref to it */ -struct pts_fs_info *devpts_get_ref(struct inode *, struct file *); -void devpts_put_ref(struct pts_fs_info *); +struct pts_fs_info; + +struct pts_fs_info *devpts_acquire(struct file *); +void devpts_release(struct pts_fs_info *); int devpts_new_index(struct pts_fs_info *); void devpts_kill_index(struct pts_fs_info *, int); diff --git a/include/linux/namei.h b/include/linux/namei.h index ec5ec2818a28..d3d0398f2a1b 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -45,6 +45,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_ROOT 0x2000 #define LOOKUP_EMPTY 0x4000 +extern int path_pts(struct path *path); + extern int user_path_at_empty(int, const char __user *, unsigned, struct path *, int *empty); static inline int user_path_at(int dfd, const char __user *name, unsigned flags, -- cgit v1.2.3