diff options
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- | fs/btrfs/inode.c | 2245 |
1 files changed, 1279 insertions, 966 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 488edca8333a..9e4aec7330cb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -308,7 +308,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, const u32 csum_size = root->fs_info->csum_size; /* For data reloc tree, it's better to do a backref lookup instead. */ - if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) return print_data_reloc_error(inode, logical_start, csum, csum_expected, mirror_num); @@ -393,39 +393,20 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) * extent (btrfs_finish_ordered_io()). */ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, - struct folio *locked_folio, u64 offset, u64 bytes) { - unsigned long index = offset >> PAGE_SHIFT; - unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start = 0, page_end = 0; + pgoff_t index = offset >> PAGE_SHIFT; + const pgoff_t end_index = (offset + bytes - 1) >> PAGE_SHIFT; struct folio *folio; - if (locked_folio) { - page_start = folio_pos(locked_folio); - page_end = page_start + folio_size(locked_folio) - 1; - } - while (index <= end_index) { - /* - * For locked page, we will call btrfs_mark_ordered_io_finished - * through btrfs_mark_ordered_io_finished() on it - * in run_delalloc_range() for the error handling, which will - * clear page Ordered and run the ordered extent accounting. - * - * Here we can't just clear the Ordered bit, or - * btrfs_mark_ordered_io_finished() would skip the accounting - * for the page range, and the ordered extent will never finish. - */ - if (locked_folio && index == (page_start >> PAGE_SHIFT)) { + folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); + if (IS_ERR(folio)) { index++; continue; } - folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); - index++; - if (IS_ERR(folio)) - continue; + index = folio_end(folio) >> PAGE_SHIFT; /* * Here we just clear all Ordered bits for every page in the * range, then btrfs_mark_ordered_io_finished() will handle @@ -436,23 +417,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, folio_put(folio); } - if (locked_folio) { - /* The locked page covers the full range, nothing needs to be done */ - if (bytes + offset <= page_start + folio_size(locked_folio)) - return; - /* - * In case this page belongs to the delalloc range being - * instantiated then skip it, since the first page of a range is - * going to be properly cleaned up by the caller of - * run_delalloc_range - */ - if (page_start >= offset && page_end <= (offset + bytes - 1)) { - bytes = offset + bytes - folio_pos(locked_folio) - - folio_size(locked_folio); - offset = folio_pos(locked_folio) + folio_size(locked_folio); - } - } - return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); } @@ -461,18 +425,18 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args) { - int err; + int ret; if (args->default_acl) { - err = __btrfs_set_acl(trans, args->inode, args->default_acl, + ret = __btrfs_set_acl(trans, args->inode, args->default_acl, ACL_TYPE_DEFAULT); - if (err) - return err; + if (ret) + return ret; } if (args->acl) { - err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS); - if (err) - return err; + ret = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS); + if (ret) + return ret; } if (!args->default_acl && !args->acl) cache_no_acl(args->inode); @@ -527,8 +491,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, size_t datasize; key.objectid = btrfs_ino(inode); - key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(cur_size); ret = btrfs_insert_empty_item(trans, root, path, &key, @@ -564,7 +528,6 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, kunmap_local(kaddr); folio_put(folio); } - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -605,23 +568,14 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode, if (offset != 0) return false; - /* - * Due to the page size limit, for subpage we can only trigger the - * writeback for the dirty sectors of page, that means data writeback - * is doing more writeback than what we want. - * - * This is especially unexpected for some call sites like fallocate, - * where we only increase i_size after everything is done. - * This means we can trigger inline extent even if we didn't want to. - * So here we skip inline extent creation completely. - */ - if (fs_info->sectorsize != PAGE_SIZE) - return false; - /* Inline extents are limited to sectorsize. */ if (size > fs_info->sectorsize) return false; + /* We do not allow a non-compressed extent to be as large as block size. */ + if (data_len >= fs_info->sectorsize) + return false; + /* We cannot exceed the maximum inline data size. */ if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) return false; @@ -711,7 +665,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL); + btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -734,12 +688,12 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, if (!can_cow_file_range_inline(inode, offset, size, compressed_size)) return 1; - lock_extent(&inode->io_tree, offset, end, &cached); + btrfs_lock_extent(&inode->io_tree, offset, end, &cached); ret = __cow_file_range_inline(inode, size, compressed_size, compress_type, compressed_folio, update_i_size); if (ret > 0) { - unlock_extent(&inode->io_tree, offset, end, &cached); + btrfs_unlock_extent(&inode->io_tree, offset, end, &cached); return ret; } @@ -825,33 +779,19 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, struct btrfs_fs_info *fs_info = inode->root->fs_info; if (!btrfs_inode_can_compress(inode)) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: unexpected compression for ino %llu\n", - btrfs_ino(inode)); + DEBUG_WARN("BTRFS: unexpected compression for ino %llu", btrfs_ino(inode)); return 0; } - /* - * Only enable sector perfect compression for experimental builds. - * - * This is a big feature change for subpage cases, and can hit - * different corner cases, so only limit this feature for - * experimental build for now. - * - * ETA for moving this out of experimental builds is 6.15. - */ - if (fs_info->sectorsize < PAGE_SIZE && - !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { - if (!PAGE_ALIGNED(start) || - !PAGE_ALIGNED(end + 1)) - return 0; - } + /* Defrag ioctl takes precedence over mount options and properties. */ + if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS) + return 0; + if (BTRFS_COMPRESS_NONE < inode->defrag_compress && + inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) + return 1; /* force compress */ if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) return 1; - /* defrag ioctl */ - if (inode->defrag_compress) - return 1; /* bad compression ratios */ if (inode->flags & BTRFS_INODE_NOCOMPRESS) return 0; @@ -871,21 +811,20 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, btrfs_add_inode_defrag(inode, small_write); } -static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end) { - unsigned long end_index = end >> PAGE_SHIFT; + const pgoff_t end_index = end >> PAGE_SHIFT; struct folio *folio; int ret = 0; - for (unsigned long index = start >> PAGE_SHIFT; - index <= end_index; index++) { - folio = filemap_get_folio(inode->i_mapping, index); + for (pgoff_t index = start >> PAGE_SHIFT; index <= end_index; index++) { + folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); if (IS_ERR(folio)) { if (!ret) ret = PTR_ERR(folio); continue; } - btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start, + btrfs_folio_clamp_clear_dirty(inode->root->fs_info, folio, start, end + 1 - start); folio_put(folio); } @@ -925,6 +864,7 @@ static void compress_file_range(struct btrfs_work *work) unsigned int poff; int i; int compress_type = fs_info->compress_type; + int compress_level = fs_info->compress_level; inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); @@ -933,7 +873,7 @@ static void compress_file_range(struct btrfs_work *work) * Otherwise applications with the file mmap'd can wander in and change * the page contents while we are compressing them. */ - ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + ret = extent_range_clear_dirty_for_io(inode, start, end); /* * All the folios should have been locked thus no failure. @@ -1007,13 +947,15 @@ again: goto cleanup_and_bail_uncompressed; } - if (inode->defrag_compress) + if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) { compress_type = inode->defrag_compress; - else if (inode->prop_compress) + compress_level = inode->defrag_compress_level; + } else if (inode->prop_compress) { compress_type = inode->prop_compress; + } /* Compression level is applied here. */ - ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4), + ret = btrfs_compress_folios(compress_type, compress_level, mapping, start, folios, &nr_folios, &total_in, &total_compressed); if (ret) @@ -1129,19 +1071,13 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, locked_folio, - start, end - start + 1); - if (locked_folio) { - const u64 page_start = folio_pos(locked_folio); - - folio_start_writeback(locked_folio); - folio_end_writeback(locked_folio); - btrfs_mark_ordered_io_finished(inode, locked_folio, - page_start, PAGE_SIZE, - !ret); - mapping_set_error(locked_folio->mapping, ret); - folio_unlock(locked_folio); - } + if (locked_folio) + btrfs_folio_end_lock(inode->root->fs_info, locked_folio, + start, async_extent->ram_size); + btrfs_err_rl(inode->root->fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), start, async_extent->ram_size, ret); } } @@ -1160,6 +1096,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, struct extent_state *cached = NULL; struct extent_map *em; int ret = 0; + bool free_pages = false; u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1180,7 +1117,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, } if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { + ASSERT(!async_extent->folios); + ASSERT(async_extent->nr_folios == 0); submit_uncompressed_range(inode, async_extent, locked_folio); + free_pages = true; goto done; } @@ -1196,10 +1136,11 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, * fall back to uncompressed. */ submit_uncompressed_range(inode, async_extent, locked_folio); + free_pages = true; goto done; } - lock_extent(io_tree, start, end, &cached); + btrfs_lock_extent(io_tree, start, end, &cached); /* Here we're doing allocation and writeback of the compressed pages */ file_extent.disk_bytenr = ins.objectid; @@ -1214,10 +1155,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, ret = PTR_ERR(em); goto out_free_reserve; } - free_extent_map(em); + btrfs_free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, - 1 << BTRFS_ORDERED_COMPRESSED); + 1U << BTRFS_ORDERED_COMPRESSED); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); @@ -1237,12 +1178,14 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, done: if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); + if (free_pages) + free_async_extent_pages(async_extent); kfree(async_extent); return; out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); mapping_set_error(inode->vfs_inode.i_mapping, -EIO); extent_clear_unlock_delalloc(inode, start, end, NULL, &cached, @@ -1269,7 +1212,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, u64 alloc_hint = 0; read_lock(&em_tree->lock); - em = search_extent_mapping(em_tree, start, num_bytes); + em = btrfs_search_extent_mapping(em_tree, start, num_bytes); if (em) { /* * if block start isn't an actual block number then find the @@ -1277,15 +1220,15 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * block is also bogus then just don't worry about it. */ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - em = search_extent_mapping(em_tree, 0, 0); + btrfs_free_extent_map(em); + em = btrfs_search_extent_mapping(em_tree, 0, 0); if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE) - alloc_hint = extent_map_block_start(em); + alloc_hint = btrfs_extent_map_block_start(em); if (em) - free_extent_map(em); + btrfs_free_extent_map(em); } else { - alloc_hint = extent_map_block_start(em); - free_extent_map(em); + alloc_hint = btrfs_extent_map_block_start(em); + btrfs_free_extent_map(em); } } read_unlock(&em_tree->lock); @@ -1316,10 +1259,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * - Else all pages except for @locked_folio are unlocked. * * When a failure happens in the second or later iteration of the - * while-loop, the ordered extents created in previous iterations are kept - * intact. So, the caller must clean them up by calling - * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for - * example. + * while-loop, the ordered extents created in previous iterations are cleaned up. */ static noinline int cow_file_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, @@ -1373,6 +1313,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode, alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes); /* + * We're not doing compressed IO, don't unlock the first page (which + * the caller expects to stay locked), don't clear any dirty bits and + * don't set any writeback bits. + * + * Do set the Ordered (Private2) bit so we know this page was properly + * setup for writepage. + */ + page_ops = (keep_locked ? 0 : PAGE_UNLOCK); + page_ops |= PAGE_SET_ORDERED; + + /* * Relocation relies on the relocated extents to have exactly the same * size as the original extents. Normally writeback for relocation data * extents follows a NOCOW path because relocation preallocates the @@ -1415,8 +1366,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode, continue; } if (done_offset) { - *done_offset = start - 1; - return 0; + /* + * Move @end to the end of the processed range, + * and exit the loop to unlock the processed extents. + */ + end = start - 1; + ret = 0; + break; } ret = -ENOSPC; } @@ -1431,24 +1387,28 @@ static noinline int cow_file_range(struct btrfs_inode *inode, file_extent.offset = 0; file_extent.compression = BTRFS_COMPRESS_NONE; - lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, - &cached); + /* + * Locked range will be released either during error clean up or + * after the whole range is finished. + */ + btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, + &cached); em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_REGULAR); if (IS_ERR(em)) { - unlock_extent(&inode->io_tree, start, - start + cur_alloc_size - 1, &cached); + btrfs_unlock_extent(&inode->io_tree, start, + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(em); goto out_reserve; } - free_extent_map(em); + btrfs_free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, - 1 << BTRFS_ORDERED_REGULAR); + 1U << BTRFS_ORDERED_REGULAR); if (IS_ERR(ordered)) { - unlock_extent(&inode->io_tree, start, - start + cur_alloc_size - 1, &cached); + btrfs_unlock_extent(&inode->io_tree, start, + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(ordered); goto out_drop_extent_cache; } @@ -1476,21 +1436,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); - /* - * We're not doing compressed IO, don't unlock the first page - * (which the caller expects to stay locked), don't clear any - * dirty bits and don't set any writeback bits - * - * Do set the Ordered flag so we know this page was - * properly setup for writepage. - */ - page_ops = (keep_locked ? 0 : PAGE_UNLOCK); - page_ops |= PAGE_SET_ORDERED; - - extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, - locked_folio, &cached, - EXTENT_LOCKED | EXTENT_DELALLOC, - page_ops); if (num_bytes < cur_alloc_size) num_bytes = 0; else @@ -1507,6 +1452,8 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_unlock; } + extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached, + EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); done: if (done_offset) *done_offset = end; @@ -1516,7 +1463,7 @@ out_drop_extent_cache: btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); out_unlock: /* * Now, we have three regions to clean up: @@ -1527,35 +1474,30 @@ out_unlock: * We process each region below. */ - clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; - page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; - /* * For the range (1). We have already instantiated the ordered extents - * for this region. They are cleaned up by - * btrfs_cleanup_ordered_extents() in e.g, - * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are - * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | - * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup - * function. + * for this region, thus we need to cleanup those ordered extents. + * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV + * are also handled by the ordered extents cleanup. * - * However, in case of @keep_locked, we still need to unlock the pages - * (except @locked_folio) to ensure all the pages are unlocked. + * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and + * finish the writeback of the involved folios, which will be never submitted. */ - if (keep_locked && orig_start < start) { + if (orig_start < start) { + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + if (!locked_folio) mapping_set_error(inode->vfs_inode.i_mapping, ret); + + btrfs_cleanup_ordered_extents(inode, orig_start, start - orig_start); extent_clear_unlock_delalloc(inode, orig_start, start - 1, - locked_folio, NULL, 0, page_ops); + locked_folio, NULL, clear_bits, page_ops); } - /* - * At this point we're unlocked, we want to make sure we're only - * clearing these flags under the extent lock, so lock the rest of the - * range and clear everything up. - */ - lock_extent(&inode->io_tree, start, end, NULL); + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; + page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; /* * For the range (2). If we reserved an extent for our delalloc range @@ -1589,6 +1531,10 @@ out_unlock: btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size, end - start - cur_alloc_size + 1, NULL); } + btrfs_err_rl(fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), orig_start, end + 1 - orig_start, ret); return ret; } @@ -1626,8 +1572,8 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_ PAGE_SHIFT; while (!list_empty(&async_chunk->extents)) { - async_extent = list_entry(async_chunk->extents.next, - struct async_extent, list); + async_extent = list_first_entry(&async_chunk->extents, + struct async_extent, list); list_del(&async_extent->list); submit_one_async_extent(async_chunk, async_extent, &alloc_hint); } @@ -1797,9 +1743,9 @@ static int fallback_to_cow(struct btrfs_inode *inode, * group that contains that extent to RO mode and therefore force COW * when starting writeback. */ - lock_extent(io_tree, start, end, &cached_state); - count = count_range_bits(io_tree, &range_start, end, range_bytes, - EXTENT_NORESERVE, 0, NULL); + btrfs_lock_extent(io_tree, start, end, &cached_state); + count = btrfs_count_range_bits(io_tree, &range_start, end, range_bytes, + EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) { u64 bytes = count; struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1809,14 +1755,14 @@ static int fallback_to_cow(struct btrfs_inode *inode, bytes = range_bytes; spin_lock(&sinfo->lock); - btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); + btrfs_space_info_update_bytes_may_use(sinfo, bytes); spin_unlock(&sinfo->lock); if (count > 0) - clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, - NULL); + btrfs_clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, + &cached_state); } - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); /* * Don't try to create inline extents, as a mix of inline extent that @@ -1837,7 +1783,6 @@ struct can_nocow_file_extent_args { /* End file offset (inclusive) of the range we want to NOCOW. */ u64 end; bool writeback_path; - bool strict; /* * Free the path passed to can_nocow_file_extent() once it's not needed * anymore. @@ -1892,8 +1837,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, * for its subvolume was created, then this implies the extent is shared, * hence we must COW. */ - if (!args->strict && - btrfs_file_extent_generation(leaf, fi) <= + if (btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out; @@ -1922,9 +1866,8 @@ static int can_nocow_file_extent(struct btrfs_path *path, */ btrfs_release_path(path); - ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), - key->offset - args->file_extent.offset, - args->file_extent.disk_bytenr, args->strict, path); + ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset, + args->file_extent.disk_bytenr, path); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; @@ -1971,6 +1914,112 @@ static int can_nocow_file_extent(struct btrfs_path *path, } /* + * Cleanup the dirty folios which will never be submitted due to error. + * + * When running a delalloc range, we may need to split the ranges (due to + * fragmentation or NOCOW). If we hit an error in the later part, we will error + * out and previously successfully executed range will never be submitted, thus + * we have to cleanup those folios by clearing their dirty flag, starting and + * finishing the writeback. + */ +static void cleanup_dirty_folios(struct btrfs_inode *inode, + struct folio *locked_folio, + u64 start, u64 end, int error) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct address_space *mapping = inode->vfs_inode.i_mapping; + pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; + u32 len; + + ASSERT(end + 1 - start < U32_MAX); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(end + 1, fs_info->sectorsize)); + len = end + 1 - start; + + /* + * Handle the locked folio first. + * The btrfs_folio_clamp_*() helpers can handle range out of the folio case. + */ + btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); + + for (pgoff_t index = start_index; index <= end_index; index++) { + struct folio *folio; + + /* Already handled at the beginning. */ + if (index == locked_folio->index) + continue; + folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS); + /* Cache already dropped, no need to do any cleanup. */ + if (IS_ERR(folio)) + continue; + btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len); + folio_unlock(folio); + folio_put(folio); + } + mapping_set_error(mapping, error); +} + +static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio, + struct extent_state **cached, + struct can_nocow_file_extent_args *nocow_args, + u64 file_pos, bool is_prealloc) +{ + struct btrfs_ordered_extent *ordered; + u64 len = nocow_args->file_extent.num_bytes; + u64 end = file_pos + len - 1; + int ret = 0; + + btrfs_lock_extent(&inode->io_tree, file_pos, end, cached); + + if (is_prealloc) { + struct extent_map *em; + + em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent, + BTRFS_ORDERED_PREALLOC); + if (IS_ERR(em)) { + btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached); + return PTR_ERR(em); + } + btrfs_free_extent_map(em); + } + + ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent, + is_prealloc + ? (1U << BTRFS_ORDERED_PREALLOC) + : (1U << BTRFS_ORDERED_NOCOW)); + if (IS_ERR(ordered)) { + if (is_prealloc) + btrfs_drop_extent_map_range(inode, file_pos, end, false); + btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached); + return PTR_ERR(ordered); + } + + if (btrfs_is_data_reloc_root(inode->root)) + /* + * Errors are handled later, as we must prevent + * extent_clear_unlock_delalloc() in error handler from freeing + * metadata of the created ordered extent. + */ + ret = btrfs_reloc_clone_csums(ordered); + btrfs_put_ordered_extent(ordered); + + extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_CLEAR_DATA_RESV, + PAGE_UNLOCK | PAGE_SET_ORDERED); + /* + * On error, we need to cleanup the ordered extents we created. + * + * We do not clear the folio Dirty flags because they are set and + * cleaered by the caller. + */ + if (ret < 0) + btrfs_cleanup_ordered_extents(inode, file_pos, len); + return ret; +} + +/* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. * @@ -1985,6 +2034,11 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct btrfs_root *root = inode->root; struct btrfs_path *path; u64 cow_start = (u64)-1; + /* + * If not 0, represents the inclusive end of the last fallback_to_cow() + * range. Only for error handling. + */ + u64 cow_end = 0; u64 cur_offset = start; int ret; bool check_prev = true; @@ -2009,15 +2063,12 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, while (cur_offset <= end) { struct btrfs_block_group *nocow_bg = NULL; - struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; struct extent_state *cached_state = NULL; u64 extent_end; - u64 nocow_end; int extent_type; - bool is_prealloc; ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); @@ -2072,12 +2123,13 @@ next_slot: /* * If the found extent starts after requested offset, then - * adjust extent_end to be right before this extent begins + * adjust cur_offset to be right before this extent begins. */ if (found_key.offset > cur_offset) { - extent_end = found_key.offset; - extent_type = 0; - goto must_cow; + if (cow_start == (u64)-1) + cow_start = cur_offset; + cur_offset = found_key.offset; + goto next_slot; } /* @@ -2143,74 +2195,21 @@ must_cow: if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_folio, cow_start, found_key.offset - 1); - cow_start = (u64)-1; if (ret) { + cow_end = found_key.offset - 1; btrfs_dec_nocow_writers(nocow_bg); goto error; } + cow_start = (u64)-1; } - nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1; - lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state); - - is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; - if (is_prealloc) { - struct extent_map *em; - - em = btrfs_create_io_em(inode, cur_offset, - &nocow_args.file_extent, - BTRFS_ORDERED_PREALLOC); - if (IS_ERR(em)) { - unlock_extent(&inode->io_tree, cur_offset, - nocow_end, &cached_state); - btrfs_dec_nocow_writers(nocow_bg); - ret = PTR_ERR(em); - goto error; - } - free_extent_map(em); - } - - ordered = btrfs_alloc_ordered_extent(inode, cur_offset, - &nocow_args.file_extent, - is_prealloc - ? (1 << BTRFS_ORDERED_PREALLOC) - : (1 << BTRFS_ORDERED_NOCOW)); + ret = nocow_one_range(inode, locked_folio, &cached_state, + &nocow_args, cur_offset, + extent_type == BTRFS_FILE_EXTENT_PREALLOC); btrfs_dec_nocow_writers(nocow_bg); - if (IS_ERR(ordered)) { - if (is_prealloc) { - btrfs_drop_extent_map_range(inode, cur_offset, - nocow_end, false); - } - unlock_extent(&inode->io_tree, cur_offset, - nocow_end, &cached_state); - ret = PTR_ERR(ordered); + if (ret < 0) goto error; - } - - if (btrfs_is_data_reloc_root(root)) - /* - * Error handled later, as we must prevent - * extent_clear_unlock_delalloc() in error handler - * from freeing metadata of created ordered extent. - */ - ret = btrfs_reloc_clone_csums(ordered); - btrfs_put_ordered_extent(ordered); - - extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, - locked_folio, &cached_state, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_CLEAR_DATA_RESV, - PAGE_UNLOCK | PAGE_SET_ORDERED); - cur_offset = extent_end; - - /* - * btrfs_reloc_clone_csums() error, now we're OK to call error - * handler, as metadata for created ordered extent will only - * be freed by btrfs_finish_ordered_io(). - */ - if (ret) - goto error; } btrfs_release_path(path); @@ -2218,11 +2217,12 @@ must_cow: cow_start = cur_offset; if (cow_start != (u64)-1) { - cur_offset = end; ret = fallback_to_cow(inode, locked_folio, cow_start, end); - cow_start = (u64)-1; - if (ret) + if (ret) { + cow_end = end; goto error; + } + cow_start = (u64)-1; } btrfs_free_path(path); @@ -2230,13 +2230,59 @@ must_cow: error: /* - * If an error happened while a COW region is outstanding, cur_offset - * needs to be reset to cow_start to ensure the COW region is unlocked - * as well. + * There are several error cases: + * + * 1) Failed without falling back to COW + * start cur_offset end + * |/////////////| | + * + * In this case, cow_start should be (u64)-1. + * + * For range [start, cur_offset) the folios are already unlocked (except + * @locked_folio), EXTENT_DELALLOC already removed. + * Need to clear the dirty flags and finish the ordered extents. + * + * 2) Failed with error before calling fallback_to_cow() + * + * start cow_start end + * |/////////////| | + * + * In this case, only @cow_start is set, @cur_offset is between + * [cow_start, end) + * + * It's mostly the same as case 1), just replace @cur_offset with + * @cow_start. + * + * 3) Failed with error from fallback_to_cow() + * + * start cow_start cow_end end + * |/////////////|-----------| | + * + * In this case, both @cow_start and @cow_end is set. + * + * For range [start, cow_start) it's the same as case 1). + * But for range [cow_start, cow_end), all the cleanup is handled by + * cow_file_range(), we should not touch anything in that range. + * + * So for all above cases, if @cow_start is set, cleanup ordered extents + * for range [start, @cow_start), other wise cleanup range [start, @cur_offset). */ if (cow_start != (u64)-1) cur_offset = cow_start; + if (cur_offset > start) { + btrfs_cleanup_ordered_extents(inode, start, cur_offset - start); + cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret); + } + + /* + * If an error happened while a COW region is outstanding, cur_offset + * needs to be reset to @cow_end + 1 to skip the COW range, as + * cow_file_range() will do the proper cleanup at error. + */ + if (cow_end) + cur_offset = cow_end + 1; + /* * We need to lock the extent here because we're clearing DELALLOC and * we're not locked at this point. @@ -2244,7 +2290,7 @@ error: if (cur_offset < end) { struct extent_state *cached = NULL; - lock_extent(&inode->io_tree, cur_offset, end, &cached); + btrfs_lock_extent(&inode->io_tree, cur_offset, end, &cached); extent_clear_unlock_delalloc(inode, cur_offset, end, locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC | @@ -2255,6 +2301,10 @@ error: btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL); } btrfs_free_path(path); + btrfs_err_rl(fs_info, + "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", + __func__, btrfs_root_id(inode->root), + btrfs_ino(inode), start, end + 1 - start, ret); return ret; } @@ -2262,7 +2312,7 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) { if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { if (inode->defrag_bytes && - test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) + btrfs_test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) return false; return true; } @@ -2283,12 +2333,11 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol * The range must cover part of the @locked_folio, or a return of 1 * can confuse the caller. */ - ASSERT(!(end <= folio_pos(locked_folio) || - start >= folio_pos(locked_folio) + folio_size(locked_folio))); + ASSERT(!(end <= folio_pos(locked_folio) || start >= folio_end(locked_folio))); if (should_nocow(inode, start, end)) { ret = run_delalloc_nocow(inode, locked_folio, start, end); - goto out; + return ret; } if (btrfs_inode_can_compress(inode) && @@ -2302,11 +2351,6 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol else ret = cow_file_range(inode, locked_folio, start, end, NULL, false, false); - -out: - if (ret < 0) - btrfs_cleanup_ordered_extents(inode, locked_folio, start, - end - start + 1); return ret; } @@ -2556,7 +2600,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, !btrfs_is_free_space_inode(inode) && !(state->state & EXTENT_NORESERVE) && (bits & EXTENT_CLEAR_DATA_RESV)) - btrfs_free_reserved_data_space_noquota(fs_info, len); + btrfs_free_reserved_data_space_noquota(inode, len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, fs_info->delalloc_batch); @@ -2640,12 +2684,12 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, if (em_len > search_len) em_len = search_len; - ret = set_extent_bit(&inode->io_tree, search_start, - search_start + em_len - 1, - EXTENT_DELALLOC_NEW, cached_state); + ret = btrfs_set_extent_bit(&inode->io_tree, search_start, + search_start + em_len - 1, + EXTENT_DELALLOC_NEW, cached_state); next: - search_start = extent_map_end(em); - free_extent_map(em); + search_start = btrfs_extent_map_end(em); + btrfs_free_extent_map(em); if (ret) return ret; } @@ -2675,8 +2719,8 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, return ret; } - return set_extent_bit(&inode->io_tree, start, end, - EXTENT_DELALLOC | extra_bits, cached_state); + return btrfs_set_extent_bit(&inode->io_tree, start, end, + EXTENT_DELALLOC | extra_bits, cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -2697,7 +2741,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct btrfs_inode *inode = fixup->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 page_start = folio_pos(folio); - u64 page_end = folio_pos(folio) + folio_size(folio) - 1; + u64 page_end = folio_end(folio) - 1; int ret = 0; bool free_delalloc_space = true; @@ -2751,7 +2795,7 @@ again: if (ret) goto out_page; - lock_extent(&inode->io_tree, page_start, page_end, &cached_state); + btrfs_lock_extent(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ if (folio_test_ordered(folio)) @@ -2759,8 +2803,8 @@ again: ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { - unlock_extent(&inode->io_tree, page_start, page_end, - &cached_state); + btrfs_unlock_extent(&inode->io_tree, page_start, page_end, + &cached_state); folio_unlock(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); @@ -2786,7 +2830,7 @@ out_reserved: if (free_delalloc_space) btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); - unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); out_page: if (ret) { /* @@ -2833,6 +2877,21 @@ int btrfs_writepage_cow_fixup(struct folio *folio) return 0; /* + * For experimental build, we error out instead of EAGAIN. + * + * We should not hit such out-of-band dirty folios anymore. + */ + if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { + DEBUG_WARN(); + btrfs_err_rl(fs_info, + "root %lld ino %llu folio %llu is marked dirty without notifying the fs", + btrfs_root_id(BTRFS_I(inode)->root), + btrfs_ino(BTRFS_I(inode)), + folio_pos(folio)); + return -EUCLEAN; + } + + /* * folio_checked is set below when we create a fixup worker for this * folio, don't try to create another one if we're already * folio_test_checked. @@ -2851,7 +2910,7 @@ int btrfs_writepage_cow_fixup(struct folio *folio) * We are already holding a reference to this inode from * write_cache_pages. We need to hold it because the space reservation * takes place outside of the folio lock, and we can't trust - * page->mapping outside of the folio lock. + * folio->mapping outside of the folio lock. */ ihold(inode); btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); @@ -2872,7 +2931,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; const u64 sectorsize = root->fs_info->sectorsize; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key ins; u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); @@ -2907,8 +2966,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, if (!drop_args.extent_inserted) { ins.objectid = btrfs_ino(inode); - ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; + ins.offset = file_pos; ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*stack_fi)); @@ -2921,7 +2980,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(struct btrfs_file_extent_item)); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -2944,8 +3002,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); ins.objectid = disk_bytenr; - ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = disk_num_bytes; ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); if (ret) @@ -2955,8 +3013,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, file_pos - offset, qgroup_reserved, &ins); out: - btrfs_free_path(path); - return ret; } @@ -3072,8 +3128,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) * depending on their current state). */ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { - clear_bits |= EXTENT_LOCKED; - lock_extent(io_tree, start, end, &cached_state); + clear_bits |= EXTENT_LOCKED | EXTENT_FINISHING_ORDERED; + btrfs_lock_extent_bits(io_tree, start, end, + EXTENT_LOCKED | EXTENT_FINISHING_ORDERED, + &cached_state); } if (freespace_inode) @@ -3137,8 +3195,8 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } - ret = unpin_extent_cache(inode, ordered_extent->file_offset, - ordered_extent->num_bytes, trans->transid); + ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset, + ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; @@ -3157,9 +3215,9 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) */ if ((clear_bits & EXTENT_DELALLOC_NEW) && !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) - clear_extent_bit(&inode->io_tree, start, end, - EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, - &cached_state); + btrfs_clear_extent_bit(&inode->io_tree, start, end, + EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, + &cached_state); btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, inode); @@ -3168,15 +3226,13 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } out: - clear_extent_bit(&inode->io_tree, start, end, clear_bits, - &cached_state); + btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, + &cached_state); if (trans) btrfs_end_transaction(trans); if (ret || truncated) { - u64 unwritten_start = start; - /* * If we failed to finish this ordered extent for any reason we * need to make sure BTRFS_ORDERED_IOERR is set on the ordered @@ -3188,10 +3244,6 @@ out: if (ret) btrfs_mark_ordered_extent_error(ordered_extent); - if (truncated) - unwritten_start += logical_len; - clear_extent_uptodate(io_tree, unwritten_start, end, NULL); - /* * Drop extent maps for the part of the extent we didn't write. * @@ -3206,9 +3258,15 @@ out: * we don't mess with the extent map tree in the NOCOW case, but * for now simply skip this if we are the free space inode. */ - if (!btrfs_is_free_space_inode(inode)) + if (!btrfs_is_free_space_inode(inode)) { + u64 unwritten_start = start; + + if (truncated) + unwritten_start += logical_len; + btrfs_drop_extent_map_range(inode, unwritten_start, end, false); + } /* * If the ordered extent had an IOERR or something else went @@ -3235,7 +3293,7 @@ out: NULL); btrfs_free_reserved_extent(fs_info, ordered_extent->disk_bytenr, - ordered_extent->disk_num_bytes, 1); + ordered_extent->disk_num_bytes, true); /* * Actually free the qgroup rsv which was released when * the ordered extent was created. @@ -3272,20 +3330,16 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) /* * Verify the checksum for a single sector without any extra action that depend * on the type of I/O. + * + * @kaddr must be a properly kmapped address. */ -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, - u32 pgoff, u8 *csum, const u8 * const csum_expected) +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum, + const u8 * const csum_expected) { SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - char *kaddr; - - ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE); shash->tfm = fs_info->csum_shash; - - kaddr = kmap_local_page(page) + pgoff; crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); - kunmap_local(kaddr); if (memcmp(csum, csum_expected, fs_info->csum_size)) return -EIO; @@ -3314,6 +3368,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u64 end = file_offset + bv->bv_len - 1; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; + void *kaddr; ASSERT(bv->bv_len == fs_info->sectorsize); @@ -3321,19 +3376,22 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, return true; if (btrfs_is_data_reloc_root(inode->root) && - test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, - NULL)) { + btrfs_test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, + NULL)) { /* Skip the range without csum for data reloc inode */ - clear_extent_bits(&inode->io_tree, file_offset, end, - EXTENT_NODATASUM); + btrfs_clear_extent_bit(&inode->io_tree, file_offset, end, + EXTENT_NODATASUM, NULL); return true; } csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * fs_info->csum_size; - if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, - csum_expected)) + kaddr = bvec_kmap_local(bv); + if (btrfs_check_sector_csum(fs_info, kaddr, csum, csum_expected)) { + kunmap_local(kaddr); goto zeroit; + } + kunmap_local(kaddr); return true; zeroit: @@ -3363,6 +3421,7 @@ void btrfs_add_delayed_iput(struct btrfs_inode *inode) if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) return; + WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state)); atomic_inc(&fs_info->nr_delayed_iputs); /* * Need to be irq safe here because we can be called from either an irq @@ -3479,11 +3538,10 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, int btrfs_orphan_cleanup(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; - struct inode *inode; u64 last_objectid = 0; int ret = 0, nr_unlink = 0; @@ -3502,6 +3560,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) key.offset = (u64)-1; while (1) { + struct btrfs_inode *inode; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; @@ -3625,10 +3685,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * deleted but wasn't. The inode number may have been reused, * but either way, we can delete the orphan item. */ - if (!inode || inode->i_nlink) { + if (!inode || inode->vfs_inode.i_nlink) { if (inode) { - ret = btrfs_drop_verity_items(BTRFS_I(inode)); - iput(inode); + ret = btrfs_drop_verity_items(inode); + iput(&inode->vfs_inode); inode = NULL; if (ret) goto out; @@ -3651,7 +3711,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) nr_unlink++; /* this will do delete_inode and everything for us */ - iput(inode); + iput(&inode->vfs_inode); } /* release the path since we're done with it */ btrfs_release_path(path); @@ -3668,19 +3728,22 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) out: if (ret) btrfs_err(fs_info, "could not do orphan cleanup %d", ret); - btrfs_free_path(path); return ret; } /* - * very simple check to peek ahead in the leaf looking for xattrs. If we - * don't find any xattrs, we know there can't be any acls. + * Look ahead in the leaf for xattrs. If we don't find any then we know there + * can't be any ACLs. + * + * @leaf: the eb leaf where to search + * @slot: the slot the inode is in + * @objectid: the objectid of the inode * - * slot is the slot the inode is in, objectid is the objectid of the inode + * Return true if there is xattr/ACL, false otherwise. */ -static noinline int acls_after_inode_item(struct extent_buffer *leaf, - int slot, u64 objectid, - int *first_xattr_slot) +static noinline bool acls_after_inode_item(struct extent_buffer *leaf, + int slot, u64 objectid, + int *first_xattr_slot) { u32 nritems = btrfs_header_nritems(leaf); struct btrfs_key found_key; @@ -3700,45 +3763,50 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, while (slot < nritems) { btrfs_item_key_to_cpu(leaf, &found_key, slot); - /* we found a different objectid, there must not be acls */ + /* We found a different objectid, there must be no ACLs. */ if (found_key.objectid != objectid) - return 0; + return false; - /* we found an xattr, assume we've got an acl */ + /* We found an xattr, assume we've got an ACL. */ if (found_key.type == BTRFS_XATTR_ITEM_KEY) { if (*first_xattr_slot == -1) *first_xattr_slot = slot; if (found_key.offset == xattr_access || found_key.offset == xattr_default) - return 1; + return true; } /* - * we found a key greater than an xattr key, there can't - * be any acls later on + * We found a key greater than an xattr key, there can't be any + * ACLs later on. */ if (found_key.type > BTRFS_XATTR_ITEM_KEY) - return 0; + return false; slot++; scanned++; /* - * it goes inode, inode backrefs, xattrs, extents, - * so if there are a ton of hard links to an inode there can - * be a lot of backrefs. Don't waste time searching too hard, - * this is just an optimization + * The item order goes like: + * - inode + * - inode backrefs + * - xattrs + * - extents, + * + * so if there are lots of hard links to an inode there can be + * a lot of backrefs. Don't waste time searching too hard, + * this is just an optimization. */ if (scanned >= 8) break; } - /* we hit the end of the leaf before we found an xattr or - * something larger than an xattr. We have to assume the inode - * has acls + /* + * We hit the end of the leaf before we found an xattr or something + * larger than an xattr. We have to assume the inode has ACLs. */ if (*first_xattr_slot == -1) *first_xattr_slot = slot; - return 1; + return true; } static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) @@ -3758,7 +3826,8 @@ static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) if (!inode->file_extent_tree) return -ENOMEM; - extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); + btrfs_extent_io_tree_init(fs_info, inode->file_extent_tree, + IO_TREE_INODE_FILE_EXTENT); /* Lockdep class is set only for the file extent tree. */ lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class); @@ -3801,12 +3870,13 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) * * On failure clean up the inode. */ -static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) +static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct inode *vfs_inode = &inode->vfs_inode; struct btrfs_key location; unsigned long ptr; int maybe_acls; @@ -3815,7 +3885,7 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) bool filled = false; int first_xattr_slot; - ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); + ret = btrfs_init_file_extent_tree(inode); if (ret) goto out; @@ -3825,7 +3895,7 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) ASSERT(path); - btrfs_get_inode_key(BTRFS_I(inode), &location); + btrfs_get_inode_key(inode, &location); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { @@ -3845,41 +3915,42 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - inode->i_mode = btrfs_inode_mode(leaf, inode_item); - set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); - i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); - i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); - btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); - btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, - round_up(i_size_read(inode), fs_info->sectorsize)); - - inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime), + vfs_inode->i_mode = btrfs_inode_mode(leaf, inode_item); + set_nlink(vfs_inode, btrfs_inode_nlink(leaf, inode_item)); + i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item)); + i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item)); + btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); + + inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime), btrfs_timespec_nsec(leaf, &inode_item->atime)); - inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime), + inode_set_mtime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->mtime), btrfs_timespec_nsec(leaf, &inode_item->mtime)); - inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime), + inode_set_ctime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->ctime), btrfs_timespec_nsec(leaf, &inode_item->ctime)); - BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); - BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); + inode->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); + inode->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); - inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); - BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); - BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); + inode_set_bytes(vfs_inode, btrfs_inode_nbytes(leaf, inode_item)); + inode->generation = btrfs_inode_generation(leaf, inode_item); + inode->last_trans = btrfs_inode_transid(leaf, inode_item); - inode_set_iversion_queried(inode, - btrfs_inode_sequence(leaf, inode_item)); - inode->i_generation = BTRFS_I(inode)->generation; - inode->i_rdev = 0; + inode_set_iversion_queried(vfs_inode, btrfs_inode_sequence(leaf, inode_item)); + vfs_inode->i_generation = inode->generation; + vfs_inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); - if (S_ISDIR(inode->i_mode)) - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(vfs_inode->i_mode)) + inode->index_cnt = (u64)-1; btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), - &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); + &inode->flags, &inode->ro_flags); + btrfs_update_inode_mapping_flags(inode); + btrfs_set_inode_mapping_order(inode); cache_index: /* @@ -3891,9 +3962,8 @@ cache_index: * This is required for both inode re-read from disk and delayed inode * in the delayed_nodes xarray. */ - if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info)) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + if (inode->last_trans == btrfs_get_fs_generation(fs_info)) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * We don't persist the id of the transaction where an unlink operation @@ -3922,7 +3992,7 @@ cache_index: * transaction commits on fsync if our inode is a directory, or if our * inode is not a directory, logging its parent unnecessarily. */ - BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + inode->last_unlink_trans = inode->last_trans; /* * Same logic as for last_unlink_trans. We don't persist the generation @@ -3930,15 +4000,15 @@ cache_index: * operation, so after eviction and reloading the inode we must be * pessimistic and assume the last transaction that modified the inode. */ - BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans; + inode->last_reflink_trans = inode->last_trans; path->slots[0]++; - if (inode->i_nlink != 1 || + if (vfs_inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) goto cache_acl; btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); - if (location.objectid != btrfs_ino(BTRFS_I(inode))) + if (location.objectid != btrfs_ino(inode)) goto cache_acl; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); @@ -3946,13 +4016,12 @@ cache_index: struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ptr; - BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); + inode->dir_index = btrfs_inode_ref_index(leaf, ref); } else if (location.type == BTRFS_INODE_EXTREF_KEY) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *)ptr; - BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, - extref); + inode->dir_index = btrfs_inode_extref_index(leaf, extref); } cache_acl: /* @@ -3960,50 +4029,49 @@ cache_acl: * any xattrs or acls */ maybe_acls = acls_after_inode_item(leaf, path->slots[0], - btrfs_ino(BTRFS_I(inode)), &first_xattr_slot); + btrfs_ino(inode), &first_xattr_slot); if (first_xattr_slot != -1) { path->slots[0] = first_xattr_slot; ret = btrfs_load_inode_props(inode, path); if (ret) btrfs_err(fs_info, "error loading props for ino %llu (root %llu): %d", - btrfs_ino(BTRFS_I(inode)), - btrfs_root_id(root), ret); + btrfs_ino(inode), btrfs_root_id(root), ret); } if (!maybe_acls) - cache_no_acl(inode); + cache_no_acl(vfs_inode); - switch (inode->i_mode & S_IFMT) { + switch (vfs_inode->i_mode & S_IFMT) { case S_IFREG: - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; + vfs_inode->i_mapping->a_ops = &btrfs_aops; + vfs_inode->i_fop = &btrfs_file_operations; + vfs_inode->i_op = &btrfs_file_inode_operations; break; case S_IFDIR: - inode->i_fop = &btrfs_dir_file_operations; - inode->i_op = &btrfs_dir_inode_operations; + vfs_inode->i_fop = &btrfs_dir_file_operations; + vfs_inode->i_op = &btrfs_dir_inode_operations; break; case S_IFLNK: - inode->i_op = &btrfs_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &btrfs_aops; + vfs_inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(vfs_inode); + vfs_inode->i_mapping->a_ops = &btrfs_aops; break; default: - inode->i_op = &btrfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, rdev); + vfs_inode->i_op = &btrfs_special_inode_operations; + init_special_inode(vfs_inode, vfs_inode->i_mode, rdev); break; } btrfs_sync_inode_flags_to_i_flags(inode); - ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); + ret = btrfs_add_inode_to_root(inode, true); if (ret) goto out; return 0; out: - iget_failed(inode); + iget_failed(vfs_inode); return ret; } @@ -4015,45 +4083,35 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *item, struct inode *inode) { - struct btrfs_map_token token; u64 flags; - btrfs_init_map_token(&token, leaf); - - btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); - btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); - btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size); - btrfs_set_token_inode_mode(&token, item, inode->i_mode); - btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); - - btrfs_set_token_timespec_sec(&token, &item->atime, - inode_get_atime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->atime, - inode_get_atime_nsec(inode)); - - btrfs_set_token_timespec_sec(&token, &item->mtime, - inode_get_mtime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->mtime, - inode_get_mtime_nsec(inode)); - - btrfs_set_token_timespec_sec(&token, &item->ctime, - inode_get_ctime_sec(inode)); - btrfs_set_token_timespec_nsec(&token, &item->ctime, - inode_get_ctime_nsec(inode)); - - btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec); - btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec); - - btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); - btrfs_set_token_inode_generation(&token, item, - BTRFS_I(inode)->generation); - btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); - btrfs_set_token_inode_transid(&token, item, trans->transid); - btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); + btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); + btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); + btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); + btrfs_set_inode_mode(leaf, item, inode->i_mode); + btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + + btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode)); + + btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode)); + + btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode)); + btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode)); + + btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec); + btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec); + + btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); + btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); + btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode)); + btrfs_set_inode_transid(leaf, item, trans->transid); + btrfs_set_inode_rdev(leaf, item, inode->i_rdev); flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, BTRFS_I(inode)->ro_flags); - btrfs_set_token_inode_flags(&token, item, flags); - btrfs_set_token_inode_block_group(&token, item, 0); + btrfs_set_inode_flags(leaf, item, flags); + btrfs_set_inode_block_group(leaf, item, 0); } /* @@ -4063,7 +4121,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { struct btrfs_inode_item *inode_item; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key key; int ret; @@ -4077,7 +4135,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, if (ret) { if (ret > 0) ret = -ENOENT; - goto failed; + return ret; } leaf = path->nodes[0]; @@ -4085,12 +4143,8 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item); fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_set_inode_last_trans(trans, inode); - ret = 0; -failed: - btrfs_free_path(path); - return ret; + return 0; } /* @@ -4135,6 +4189,23 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, return ret; } +static void update_time_after_link_or_unlink(struct btrfs_inode *dir) +{ + struct timespec64 now; + + /* + * If we are replaying a log tree, we do not want to update the mtime + * and ctime of the parent directory with the current time, since the + * log replay procedure is responsible for setting them to their correct + * values (the ones it had when the fsync was done). + */ + if (test_bit(BTRFS_FS_LOG_RECOVERING, &dir->root->fs_info->flags)) + return; + + now = inode_set_ctime_current(&dir->vfs_inode); + inode_set_mtime_to_ts(&dir->vfs_inode, now); +} + /* * unlink helper that gets used here in inode.c and in the tree logging * recovery code. It remove a link in a directory with a given name, and @@ -4156,20 +4227,22 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, u64 dir_ino = btrfs_ino(dir); path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); if (IS_ERR_OR_NULL(di)) { - ret = di ? PTR_ERR(di) : -ENOENT; - goto err; + btrfs_free_path(path); + return di ? PTR_ERR(di) : -ENOENT; } ret = btrfs_delete_one_dir_name(trans, root, path, di); + /* + * Down the call chains below we'll also need to allocate a path, so no + * need to hold on to this one for longer than necessary. + */ + btrfs_free_path(path); if (ret) - goto err; - btrfs_release_path(path); + return ret; /* * If we don't have dir index, we have to get it by looking up @@ -4191,11 +4264,11 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); if (ret) { - btrfs_info(fs_info, - "failed to delete reference to %.*s, inode %llu parent %llu", - name->len, name->name, ino, dir_ino); + btrfs_crit(fs_info, + "failed to delete reference to %.*s, root %llu inode %llu parent %llu", + name->len, name->name, btrfs_root_id(root), ino, dir_ino); btrfs_abort_transaction(trans, ret); - goto err; + return ret; } skip_backref: if (rename_ctx) @@ -4204,7 +4277,7 @@ skip_backref: ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) { btrfs_abort_transaction(trans, ret); - goto err; + return ret; } /* @@ -4228,19 +4301,14 @@ skip_backref: * holding. */ btrfs_run_delayed_iput(fs_info, inode); -err: - btrfs_free_path(path); - if (ret) - goto out; btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); inode_set_ctime_current(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); - inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); - ret = btrfs_update_inode(trans, dir); -out: - return ret; + update_time_after_link_or_unlink(dir); + + return btrfs_update_inode(trans, dir); } int btrfs_unlink_inode(struct btrfs_trans_handle *trans, @@ -4424,7 +4492,7 @@ out: static noinline int may_destroy_subvol(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *di; struct btrfs_key key; struct fscrypt_str name = FSTR_INIT("default", 7); @@ -4446,7 +4514,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) btrfs_err(fs_info, "deleting default subvolume %llu is not allowed", key.objectid); - goto out; + return ret; } btrfs_release_path(path); } @@ -4457,14 +4525,13 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. */ - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } ret = 0; @@ -4474,8 +4541,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY) ret = -ENOTEMPTY; } -out: - btrfs_free_path(path); + return ret; } @@ -4647,68 +4713,68 @@ out_up_write: return ret; } -static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) +static int btrfs_rmdir(struct inode *vfs_dir, struct dentry *dentry) { - struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_inode *dir = BTRFS_I(vfs_dir); + struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); + struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; struct btrfs_trans_handle *trans; - u64 last_unlink_trans; struct fscrypt_name fname; - if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) + if (inode->vfs_inode.i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; - if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) { + if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) { btrfs_err(fs_info, "extent tree v2 doesn't support snapshot deletion yet"); return -EOPNOTSUPP; } - return btrfs_delete_subvolume(BTRFS_I(dir), dentry); + return btrfs_delete_subvolume(dir, dentry); } - ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + ret = fscrypt_setup_filename(vfs_dir, &dentry->d_name, 1, &fname); if (ret) return ret; /* This needs to handle no-key deletions later on */ - trans = __unlink_start_trans(BTRFS_I(dir)); + trans = __unlink_start_trans(dir); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; } - if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); + /* + * Propagate the last_unlink_trans value of the deleted dir to its + * parent directory. This is to prevent an unrecoverable log tree in the + * case we do something like this: + * 1) create dir foo + * 2) create snapshot under dir foo + * 3) delete the snapshot + * 4) rmdir foo + * 5) mkdir foo + * 6) fsync foo or some file inside foo + * + * This is because we can't unlink other roots when replaying the dir + * deletes for directory foo. + */ + if (inode->last_unlink_trans >= trans->transid) + btrfs_record_snapshot_destroy(trans, dir); + + if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { + ret = btrfs_unlink_subvol(trans, dir, dentry); goto out; } - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + ret = btrfs_orphan_add(trans, inode); if (ret) goto out; - last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; - /* now the directory is empty */ - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), - &fname.disk_name); - if (!ret) { - btrfs_i_size_write(BTRFS_I(inode), 0); - /* - * Propagate the last_unlink_trans value of the deleted dir to - * its parent directory. This is to prevent an unrecoverable - * log tree in the case we do something like this: - * 1) create dir foo - * 2) create snapshot under dir foo - * 3) delete the snapshot - * 4) rmdir foo - * 5) mkdir foo - * 6) fsync foo or some file inside foo - */ - if (last_unlink_trans >= trans->transid) - BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; - } + ret = btrfs_unlink_inode(trans, dir, inode, &fname.disk_name); + if (!ret) + btrfs_i_size_write(inode, 0); out: btrfs_end_transaction(trans); out_notrans: @@ -4718,20 +4784,80 @@ out_notrans: return ret; } +static bool is_inside_block(u64 bytenr, u64 blockstart, u32 blocksize) +{ + ASSERT(IS_ALIGNED(blockstart, blocksize), "blockstart=%llu blocksize=%u", + blockstart, blocksize); + + if (blockstart <= bytenr && bytenr <= blockstart + blocksize - 1) + return true; + return false; +} + +static int truncate_block_zero_beyond_eof(struct btrfs_inode *inode, u64 start) +{ + const pgoff_t index = (start >> PAGE_SHIFT); + struct address_space *mapping = inode->vfs_inode.i_mapping; + struct folio *folio; + u64 zero_start; + u64 zero_end; + int ret = 0; + +again: + folio = filemap_lock_folio(mapping, index); + /* No folio present. */ + if (IS_ERR(folio)) + return 0; + + if (!folio_test_uptodate(folio)) { + ret = btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } + if (!folio_test_uptodate(folio)) { + ret = -EIO; + goto out_unlock; + } + } + folio_wait_writeback(folio); + + /* + * We do not need to lock extents nor wait for OE, as it's already + * beyond EOF. + */ + + zero_start = max_t(u64, folio_pos(folio), start); + zero_end = folio_end(folio); + folio_zero_range(folio, zero_start - folio_pos(folio), + zero_end - zero_start); + +out_unlock: + folio_unlock(folio); + folio_put(folio); + return ret; +} + /* - * Read, zero a chunk and write a block. + * Handle the truncation of a fs block. * - * @inode - inode that we're zeroing - * @from - the offset to start zeroing - * @len - the length to zero, 0 to zero the entire range respective to the - * offset - * @front - zero up to the offset instead of from the offset on + * @inode - inode that we're zeroing + * @offset - the file offset of the block to truncate + * The value must be inside [@start, @end], and the function will do + * extra checks if the block that covers @offset needs to be zeroed. + * @start - the start file offset of the range we want to zero + * @end - the end (inclusive) file offset of the range we want to zero. * - * This will find the block for the "from" offset and cow the block and zero the - * part we want to zero. This is used with truncate and hole punching. + * If the range is not block aligned, read out the folio that covers @offset, + * and if needed zero blocks that are inside the folio and covered by [@start, @end). + * If @start or @end + 1 lands inside a block, that block will be marked dirty + * for writeback. + * + * This is utilized by hole punch, zero range, file expansion. */ -int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, - int front) +int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; @@ -4741,27 +4867,66 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, struct extent_changeset *data_reserved = NULL; bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; - pgoff_t index = from >> PAGE_SHIFT; - unsigned offset = from & (blocksize - 1); + pgoff_t index = (offset >> PAGE_SHIFT); struct folio *folio; gfp_t mask = btrfs_alloc_write_mask(mapping); - size_t write_bytes = blocksize; int ret = 0; + const bool in_head_block = is_inside_block(offset, round_down(start, blocksize), + blocksize); + const bool in_tail_block = is_inside_block(offset, round_down(end, blocksize), + blocksize); + bool need_truncate_head = false; + bool need_truncate_tail = false; + u64 zero_start; + u64 zero_end; u64 block_start; u64 block_end; - if (IS_ALIGNED(offset, blocksize) && - (!len || IS_ALIGNED(len, blocksize))) + /* @offset should be inside the range. */ + ASSERT(start <= offset && offset <= end, "offset=%llu start=%llu end=%llu", + offset, start, end); + + /* The range is aligned at both ends. */ + if (IS_ALIGNED(start, blocksize) && IS_ALIGNED(end + 1, blocksize)) { + /* + * For block size < page size case, we may have polluted blocks + * beyond EOF. So we also need to zero them out. + */ + if (end == (u64)-1 && blocksize < PAGE_SIZE) + ret = truncate_block_zero_beyond_eof(inode, start); + goto out; + } + + /* + * @offset may not be inside the head nor tail block. In that case we + * don't need to do anything. + */ + if (!in_head_block && !in_tail_block) + goto out; + + /* + * Skip the truncatioin if the range in the target block is already aligned. + * The seemingly complex check will also handle the same block case. + */ + if (in_head_block && !IS_ALIGNED(start, blocksize)) + need_truncate_head = true; + if (in_tail_block && !IS_ALIGNED(end + 1, blocksize)) + need_truncate_tail = true; + if (!need_truncate_head && !need_truncate_tail) goto out; - block_start = round_down(from, blocksize); + block_start = round_down(offset, blocksize); block_end = block_start + blocksize - 1; ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, blocksize, false); if (ret < 0) { + size_t write_bytes = blocksize; + if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) { - /* For nocow case, no need to reserve data space */ + /* For nocow case, no need to reserve data space. */ + ASSERT(write_bytes == blocksize, "write_bytes=%zu blocksize=%u", + write_bytes, blocksize); only_release_metadata = true; } else { goto out; @@ -4778,10 +4943,13 @@ again: folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); if (IS_ERR(folio)) { - btrfs_delalloc_release_space(inode, data_reserved, block_start, - blocksize, true); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, blocksize, true); + else + btrfs_delalloc_release_space(inode, data_reserved, + block_start, blocksize, true); btrfs_delalloc_release_extents(inode, blocksize); - ret = -ENOMEM; + ret = PTR_ERR(folio); goto out; } @@ -4811,11 +4979,11 @@ again: folio_wait_writeback(folio); - lock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_lock_extent(io_tree, block_start, block_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { - unlock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); folio_unlock(folio); folio_put(folio); btrfs_start_ordered_extent(ordered); @@ -4823,37 +4991,46 @@ again: goto again; } - clear_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - &cached_state); + btrfs_clear_extent_bit(&inode->io_tree, block_start, block_end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + &cached_state); ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state); if (ret) { - unlock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); goto out_unlock; } - if (offset != blocksize) { - if (!len) - len = blocksize - offset; - if (front) - folio_zero_range(folio, block_start - folio_pos(folio), - offset); - else - folio_zero_range(folio, - (block_start - folio_pos(folio)) + offset, - len); + if (end == (u64)-1) { + /* + * We're truncating beyond EOF, the remaining blocks normally are + * already holes thus no need to zero again, but it's possible for + * fs block size < page size cases to have memory mapped writes + * to pollute ranges beyond EOF. + * + * In that case although such polluted blocks beyond EOF will + * not reach disk, it still affects our page caches. + */ + zero_start = max_t(u64, folio_pos(folio), start); + zero_end = min_t(u64, folio_end(folio) - 1, end); + } else { + zero_start = max_t(u64, block_start, start); + zero_end = min_t(u64, block_end, end); } + folio_zero_range(folio, zero_start - folio_pos(folio), + zero_end - zero_start + 1); + btrfs_folio_clear_checked(fs_info, folio, block_start, block_end + 1 - block_start); btrfs_folio_set_dirty(fs_info, folio, block_start, block_end + 1 - block_start); - unlock_extent(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) - set_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_NORESERVE, NULL); + btrfs_set_extent_bit(&inode->io_tree, block_start, block_end, + EXTENT_NORESERVE, &cached_state); + + btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); out_unlock: if (ret) { @@ -4946,7 +5123,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) * rest of the block before we expand the i_size, otherwise we could * expose stale data. */ - ret = btrfs_truncate_block(inode, oldsize, 0, 0); + ret = btrfs_truncate_block(inode, oldsize, oldsize, -1); if (ret) return ret; @@ -4963,7 +5140,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) em = NULL; break; } - last_byte = min(extent_map_end(em), block_end); + last_byte = min(btrfs_extent_map_end(em), block_end); last_byte = ALIGN(last_byte, fs_info->sectorsize); hole_size = last_byte - cur_offset; @@ -4979,7 +5156,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) if (ret) break; - hole_em = alloc_extent_map(); + hole_em = btrfs_alloc_extent_map(); if (!hole_em) { btrfs_drop_extent_map_range(inode, cur_offset, cur_offset + hole_size - 1, @@ -4996,7 +5173,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) hole_em->generation = btrfs_get_fs_generation(fs_info); ret = btrfs_replace_extent_map_range(inode, hole_em, true); - free_extent_map(hole_em); + btrfs_free_extent_map(hole_em); } else { ret = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); @@ -5004,14 +5181,14 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) break; } next: - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; cur_offset = last_byte; if (cur_offset >= block_end) break; } - free_extent_map(em); - unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); + btrfs_free_extent_map(em); + btrfs_unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); return ret; } @@ -5091,7 +5268,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize); if (ret && inode->i_nlink) { - int err; + int ret2; /* * Truncate failed, so fix up the in-memory size. We @@ -5099,9 +5276,9 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * wait for disk_i_size to be stable and then update the * in-memory size to match. */ - err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); - if (err) - return err; + ret2 = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); + if (ret2) + return ret2; i_size_write(inode, BTRFS_I(inode)->disk_i_size); } } @@ -5114,31 +5291,31 @@ static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, { struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; - int err; + int ret; if (btrfs_root_readonly(root)) return -EROFS; - err = setattr_prepare(idmap, dentry, attr); - if (err) - return err; + ret = setattr_prepare(idmap, dentry, attr); + if (ret) + return ret; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - err = btrfs_setsize(inode, attr); - if (err) - return err; + ret = btrfs_setsize(inode, attr); + if (ret) + return ret; } if (attr->ia_valid) { setattr_copy(idmap, inode, attr); inode_inc_iversion(inode); - err = btrfs_dirty_inode(BTRFS_I(inode)); + ret = btrfs_dirty_inode(BTRFS_I(inode)); - if (!err && attr->ia_valid & ATTR_MODE) - err = posix_acl_chmod(idmap, dentry, inode->i_mode); + if (!ret && attr->ia_valid & ATTR_MODE) + ret = posix_acl_chmod(idmap, dentry, inode->i_mode); } - return err; + return ret; } /* @@ -5195,7 +5372,7 @@ static void evict_inode_truncate_pages(struct inode *inode) state_flags = state->state; spin_unlock(&io_tree->lock); - lock_extent(io_tree, start, end, &cached_state); + btrfs_lock_extent(io_tree, start, end, &cached_state); /* * If still has DELALLOC flag, the extent didn't reach disk, @@ -5209,9 +5386,9 @@ static void evict_inode_truncate_pages(struct inode *inode) btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, end - start + 1, NULL); - clear_extent_bit(io_tree, start, end, - EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, - &cached_state); + btrfs_clear_extent_bit(io_tree, start, end, + EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, + &cached_state); cond_resched(); spin_lock(&io_tree->lock); @@ -5272,7 +5449,7 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_fs_info *fs_info; struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_block_rsv *rsv = NULL; + struct btrfs_block_rsv rsv; int ret; trace_btrfs_inode_evict(inode); @@ -5320,11 +5497,9 @@ void btrfs_evict_inode(struct inode *inode) */ btrfs_kill_delayed_inode_items(BTRFS_I(inode)); - rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); - if (!rsv) - goto out; - rsv->size = btrfs_calc_metadata_size(fs_info, 1); - rsv->failfast = true; + btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP); + rsv.size = btrfs_calc_metadata_size(fs_info, 1); + rsv.failfast = true; btrfs_i_size_write(BTRFS_I(inode), 0); @@ -5336,11 +5511,11 @@ void btrfs_evict_inode(struct inode *inode) .min_type = 0, }; - trans = evict_refill_and_join(root, rsv); + trans = evict_refill_and_join(root, &rsv); if (IS_ERR(trans)) - goto out; + goto out_release; - trans->block_rsv = rsv; + trans->block_rsv = &rsv; ret = btrfs_truncate_inode_items(trans, root, &control); trans->block_rsv = &fs_info->trans_block_rsv; @@ -5352,7 +5527,7 @@ void btrfs_evict_inode(struct inode *inode) */ btrfs_btree_balance_dirty_nodelay(fs_info); if (ret && ret != -ENOSPC && ret != -EAGAIN) - goto out; + goto out_release; else if (!ret) break; } @@ -5366,16 +5541,17 @@ void btrfs_evict_inode(struct inode *inode) * If it turns out that we are dropping too many of these, we might want * to add a mechanism for retrying these after a commit. */ - trans = evict_refill_and_join(root, rsv); + trans = evict_refill_and_join(root, &rsv); if (!IS_ERR(trans)) { - trans->block_rsv = rsv; + trans->block_rsv = &rsv; btrfs_orphan_del(trans, BTRFS_I(inode)); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); } +out_release: + btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL); out: - btrfs_free_block_rsv(fs_info, rsv); /* * If we didn't successfully delete, the orphan item will still be in * the tree and we'll retry on the next mount. Again, we might also want @@ -5397,7 +5573,7 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { struct btrfs_dir_item *di; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = dir->root; int ret = 0; struct fscrypt_name fname; @@ -5408,7 +5584,7 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); if (ret < 0) - goto out; + return ret; /* * fscrypt_setup_filename() should never return a positive value, but * gcc on sparc/parisc thinks it can, so assert that doesn't happen. @@ -5437,7 +5613,6 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, *type = btrfs_dir_ftype(path->nodes[0], di); out: fscrypt_free_filename(&fname); - btrfs_free_path(path); return ret; } @@ -5452,7 +5627,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, struct btrfs_key *location, struct btrfs_root **sub_root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *new_root; struct btrfs_root_ref *ref; struct extent_buffer *leaf; @@ -5508,7 +5683,6 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, location->offset = 0; err = 0; out: - btrfs_free_path(path); fscrypt_free_filename(&fname); return err; } @@ -5559,7 +5733,7 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) args->root == BTRFS_I(inode)->root; } -static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) +static struct btrfs_inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; @@ -5571,40 +5745,42 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); - return inode; + if (!inode) + return NULL; + return BTRFS_I(inode); } /* * Get an inode object given its inode number and corresponding root. Path is * preallocated to prevent recursing back to iget through allocator. */ -struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, - struct btrfs_path *path) +struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, + struct btrfs_path *path) { - struct inode *inode; + struct btrfs_inode *inode; int ret; inode = btrfs_iget_locked(ino, root); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode->vfs_inode.i_state & I_NEW)) return inode; ret = btrfs_read_locked_inode(inode, path); if (ret) return ERR_PTR(ret); - unlock_new_inode(inode); + unlock_new_inode(&inode->vfs_inode); return inode; } /* * Get an inode object given its inode number and corresponding root. */ -struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) +struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root) { - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_path *path; int ret; @@ -5612,55 +5788,60 @@ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode->vfs_inode.i_state & I_NEW)) return inode; path = btrfs_alloc_path(); - if (!path) + if (!path) { + iget_failed(&inode->vfs_inode); return ERR_PTR(-ENOMEM); + } ret = btrfs_read_locked_inode(inode, path); btrfs_free_path(path); if (ret) return ERR_PTR(ret); - unlock_new_inode(inode); + unlock_new_inode(&inode->vfs_inode); return inode; } -static struct inode *new_simple_dir(struct inode *dir, - struct btrfs_key *key, - struct btrfs_root *root) +static struct btrfs_inode *new_simple_dir(struct inode *dir, + struct btrfs_key *key, + struct btrfs_root *root) { struct timespec64 ts; - struct inode *inode = new_inode(dir->i_sb); + struct inode *vfs_inode; + struct btrfs_inode *inode; - if (!inode) + vfs_inode = new_inode(dir->i_sb); + if (!vfs_inode) return ERR_PTR(-ENOMEM); - BTRFS_I(inode)->root = btrfs_grab_root(root); - BTRFS_I(inode)->ref_root_id = key->objectid; - set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags); - set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); + inode = BTRFS_I(vfs_inode); + inode->root = btrfs_grab_root(root); + inode->ref_root_id = key->objectid; + set_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags); + set_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags); - btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); + btrfs_set_inode_number(inode, BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); /* * We only need lookup, the rest is read-only and there's no inode * associated with the dentry */ - inode->i_op = &simple_dir_inode_operations; - inode->i_opflags &= ~IOP_XATTR; - inode->i_fop = &simple_dir_operations; - inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; + vfs_inode->i_op = &simple_dir_inode_operations; + vfs_inode->i_opflags &= ~IOP_XATTR; + vfs_inode->i_fop = &simple_dir_operations; + vfs_inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - ts = inode_set_ctime_current(inode); - inode_set_mtime_to_ts(inode, ts); - inode_set_atime_to_ts(inode, inode_get_atime(dir)); - BTRFS_I(inode)->i_otime_sec = ts.tv_sec; - BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; + ts = inode_set_ctime_current(vfs_inode); + inode_set_mtime_to_ts(vfs_inode, ts); + inode_set_atime_to_ts(vfs_inode, inode_get_atime(dir)); + inode->i_otime_sec = ts.tv_sec; + inode->i_otime_nsec = ts.tv_nsec; - inode->i_uid = dir->i_uid; - inode->i_gid = dir->i_gid; + vfs_inode->i_uid = dir->i_uid; + vfs_inode->i_gid = dir->i_gid; return inode; } @@ -5674,15 +5855,15 @@ static_assert(BTRFS_FT_FIFO == FT_FIFO); static_assert(BTRFS_FT_SOCK == FT_SOCK); static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK); -static inline u8 btrfs_inode_type(struct inode *inode) +static inline u8 btrfs_inode_type(const struct btrfs_inode *inode) { - return fs_umode_to_ftype(inode->i_mode); + return fs_umode_to_ftype(inode->vfs_inode.i_mode); } struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; struct btrfs_key location = { 0 }; @@ -5699,18 +5880,18 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (location.type == BTRFS_INODE_ITEM_KEY) { inode = btrfs_iget(location.objectid, root); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); /* Do extra check against inode mode with di_type */ if (btrfs_inode_type(inode) != di_type) { btrfs_crit(fs_info, "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", - inode->i_mode, btrfs_inode_type(inode), + inode->vfs_inode.i_mode, btrfs_inode_type(inode), di_type); - iput(inode); + iput(&inode->vfs_inode); return ERR_PTR(-EUCLEAN); } - return inode; + return &inode->vfs_inode; } ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry, @@ -5725,19 +5906,22 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) btrfs_put_root(sub_root); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); down_read(&fs_info->cleanup_work_sem); - if (!sb_rdonly(inode->i_sb)) + if (!sb_rdonly(inode->vfs_inode.i_sb)) ret = btrfs_orphan_cleanup(sub_root); up_read(&fs_info->cleanup_work_sem); if (ret) { - iput(inode); + iput(&inode->vfs_inode); inode = ERR_PTR(ret); } } - return inode; + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return &inode->vfs_inode; } static int btrfs_dentry_delete(const struct dentry *dentry) @@ -5777,7 +5961,7 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; struct btrfs_key key, found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; int ret; @@ -5791,15 +5975,14 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; /* FIXME: we should be able to handle this */ if (ret == 0) - goto out; - ret = 0; + return ret; if (path->slots[0] == 0) { inode->index_cnt = BTRFS_DIR_START_INDEX; - goto out; + return 0; } path->slots[0]--; @@ -5810,13 +5993,12 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) if (found_key.objectid != btrfs_ino(inode) || found_key.type != BTRFS_DIR_INDEX_KEY) { inode->index_cnt = BTRFS_DIR_START_INDEX; - goto out; + return 0; } inode->index_cnt = found_key.offset + 1; -out: - btrfs_free_path(path); - return ret; + + return 0; } static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index) @@ -5919,7 +6101,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); void *addr; LIST_HEAD(ins_list); LIST_HEAD(del_list); @@ -6002,8 +6184,7 @@ again: if (ret) goto nopos; - ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); - if (ret) + if (btrfs_readdir_delayed_dir_index(ctx, &ins_list)) goto nopos; /* @@ -6032,7 +6213,6 @@ nopos: err: if (put) btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list); - btrfs_free_path(path); return ret; } @@ -6210,7 +6390,7 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode * inode->flags |= BTRFS_INODE_NODATASUM; } - btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + btrfs_sync_inode_flags_to_i_flags(inode); } int btrfs_create_new_inode(struct btrfs_trans_handle *trans, @@ -6296,6 +6476,8 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (btrfs_test_opt(fs_info, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; + btrfs_update_inode_mapping_flags(BTRFS_I(inode)); + btrfs_set_inode_mapping_order(BTRFS_I(inode)); } ret = btrfs_insert_inode_locked(inode); @@ -6380,7 +6562,6 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - btrfs_mark_buffer_dirty(trans, path->nodes[0]); /* * We don't need the path anymore, plus inheriting properties, adding * ACLs, security xattrs, orphan item or adding the link, will result in @@ -6390,7 +6571,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, path = NULL; if (args->subvol) { - struct inode *parent; + struct btrfs_inode *parent; /* * Subvolumes inherit properties from their parent subvolume, @@ -6400,11 +6581,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (IS_ERR(parent)) { ret = PTR_ERR(parent); } else { - ret = btrfs_inode_inherit_props(trans, inode, parent); - iput(parent); + ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode), + parent); + iput(&parent->vfs_inode); } } else { - ret = btrfs_inode_inherit_props(trans, inode, dir); + ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode), + BTRFS_I(dir)); } if (ret) { btrfs_err(fs_info, @@ -6438,13 +6621,17 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (args->orphan) { ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto discard; + } } else { ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 0, BTRFS_I(inode)->dir_index); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto discard; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto discard; + } } return 0; @@ -6502,7 +6689,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, return ret; ret = btrfs_insert_dir_item(trans, name, parent_inode, &key, - btrfs_inode_type(&inode->vfs_inode), index); + btrfs_inode_type(inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { @@ -6513,15 +6700,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + name->len * 2); inode_inc_iversion(&parent_inode->vfs_inode); - /* - * If we are replaying a log tree, we do not want to update the mtime - * and ctime of the parent directory with the current time, since the - * log replay procedure is responsible for setting them to their correct - * values (the ones it had when the fsync was done). - */ - if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) - inode_set_mtime_to_ts(&parent_inode->vfs_inode, - inode_set_ctime_current(&parent_inode->vfs_inode)); + update_time_after_link_or_unlink(parent_inode); ret = btrfs_update_inode(trans, parent_inode); if (ret) @@ -6531,20 +6710,18 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, fail_dir_item: if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { u64 local_index; - int err; - err = btrfs_del_root_ref(trans, key.objectid, - btrfs_root_id(root), parent_ino, - &local_index, name); - if (err) - btrfs_abort_transaction(trans, err); + int ret2; + + ret2 = btrfs_del_root_ref(trans, key.objectid, btrfs_root_id(root), + parent_ino, &local_index, name); + if (ret2) + btrfs_abort_transaction(trans, ret2); } else if (add_backref) { - u64 local_index; - int err; + int ret2; - err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, - &local_index); - if (err) - btrfs_abort_transaction(trans, err); + ret2 = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, NULL); + if (ret2) + btrfs_abort_transaction(trans, ret2); } /* Return the original error code */ @@ -6563,20 +6740,20 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry, }; unsigned int trans_num_items; struct btrfs_trans_handle *trans; - int err; + int ret; - err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); - if (err) + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); + if (ret) goto out_inode; trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_new_inode_args; } - err = btrfs_create_new_inode(trans, &new_inode_args); - if (!err) + ret = btrfs_create_new_inode(trans, &new_inode_args); + if (!ret) d_instantiate_new(dentry, inode); btrfs_end_transaction(trans); @@ -6584,9 +6761,9 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry, out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: - if (err) + if (ret) iput(inode); - return err; + return ret; } static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir, @@ -6627,7 +6804,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct fscrypt_name fname; u64 index; - int err; + int ret; int drop_inode = 0; /* do not allow sys_link's with other subvols of the same device */ @@ -6637,12 +6814,12 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink >= BTRFS_LINK_MAX) return -EMLINK; - err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); - if (err) + ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); + if (ret) goto fail; - err = btrfs_set_inode_index(BTRFS_I(dir), &index); - if (err) + ret = btrfs_set_inode_index(BTRFS_I(dir), &index); + if (ret) goto fail; /* @@ -6653,7 +6830,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, */ trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); trans = NULL; goto fail; } @@ -6666,24 +6843,24 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); - err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), + ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), &fname.disk_name, 1, index); - if (err) { + if (ret) { drop_inode = 1; } else { struct dentry *parent = dentry->d_parent; - err = btrfs_update_inode(trans, BTRFS_I(inode)); - if (err) + ret = btrfs_update_inode(trans, BTRFS_I(inode)); + if (ret) goto fail; if (inode->i_nlink == 1) { /* * If new hard link count is 1, it's a file created * with open(2) O_TMPFILE flag. */ - err = btrfs_orphan_del(trans, BTRFS_I(inode)); - if (err) + ret = btrfs_orphan_del(trans, BTRFS_I(inode)); + if (ret) goto fail; } d_instantiate(dentry, inode); @@ -6699,21 +6876,21 @@ fail: iput(inode); } btrfs_btree_balance_dirty(fs_info); - return err; + return ret; } -static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; inode = new_inode(dir->i_sb); if (!inode) - return -ENOMEM; + return ERR_PTR(-ENOMEM); inode_init_owner(idmap, inode, dir, S_IFDIR | mode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - return btrfs_create_common(dir, dentry, inode); + return ERR_PTR(btrfs_create_common(dir, dentry, inode)); } static noinline int uncompress_inline(struct btrfs_path *path, @@ -6722,6 +6899,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, { int ret; struct extent_buffer *leaf = path->nodes[0]; + const u32 blocksize = leaf->fs_info->sectorsize; char *tmp; size_t max_size; unsigned long inline_size; @@ -6738,7 +6916,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); - max_size = min_t(unsigned long, PAGE_SIZE, max_size); + max_size = min_t(unsigned long, blocksize, max_size); ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size, max_size); @@ -6750,14 +6928,15 @@ static noinline int uncompress_inline(struct btrfs_path *path, * cover that region here. */ - if (max_size < PAGE_SIZE) - folio_zero_range(folio, max_size, PAGE_SIZE - max_size); + if (max_size < blocksize) + folio_zero_range(folio, max_size, blocksize - max_size); kfree(tmp); return ret; } static int read_inline_extent(struct btrfs_path *path, struct folio *folio) { + const u32 blocksize = path->nodes[0]->fs_info->sectorsize; struct btrfs_file_extent_item *fi; void *kaddr; size_t copy_size; @@ -6772,14 +6951,14 @@ static int read_inline_extent(struct btrfs_path *path, struct folio *folio) if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) return uncompress_inline(path, folio, fi); - copy_size = min_t(u64, PAGE_SIZE, + copy_size = min_t(u64, blocksize, btrfs_file_extent_ram_bytes(path->nodes[0], fi)); kaddr = kmap_local_folio(folio, 0); read_extent_buffer(path->nodes[0], kaddr, btrfs_file_extent_inline_start(fi), copy_size); kunmap_local(kaddr); - if (copy_size < PAGE_SIZE) - folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size); + if (copy_size < blocksize) + folio_zero_range(folio, copy_size, blocksize - copy_size); return 0; } @@ -6818,18 +6997,18 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct extent_map_tree *em_tree = &inode->extent_tree; read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); + em = btrfs_lookup_extent_mapping(em_tree, start, len); read_unlock(&em_tree->lock); if (em) { if (em->start > start || em->start + em->len <= start) - free_extent_map(em); + btrfs_free_extent_map(em); else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio) - free_extent_map(em); + btrfs_free_extent_map(em); else goto out; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { ret = -ENOMEM; goto out; @@ -6966,7 +7145,7 @@ not_found: insert: ret = 0; btrfs_release_path(path); - if (em->start > start || extent_map_end(em) <= start) { + if (em->start > start || btrfs_extent_map_end(em) <= start) { btrfs_err(fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", em->start, em->len, start, len); @@ -6983,7 +7162,7 @@ out: trace_btrfs_get_extent(root, inode, em); if (ret) { - free_extent_map(em); + btrfs_free_extent_map(em); return ERR_PTR(ret); } return em; @@ -7011,8 +7190,6 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) * @orig_start: (optional) Return the original file offset of the file extent * @orig_len: (optional) Return the original on-disk length of the file extent * @ram_bytes: (optional) Return the ram_bytes of the file extent - * @strict: if true, omit optimizations that might force us into unnecessary - * cow. e.g., don't trust generation number. * * Return: * >0 and update @len if we can do nocow write @@ -7022,17 +7199,17 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) * NOTE: This only checks the file extents, caller is responsible to wait for * any ordered extents. */ -noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, +noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, - bool nowait, bool strict) + bool nowait) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct can_nocow_file_extent_args nocow_args = { 0 }; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; struct extent_buffer *leaf; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_file_extent_item *fi; struct btrfs_key key; int found_type; @@ -7042,81 +7219,74 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, return -ENOMEM; path->nowait = nowait; - ret = btrfs_lookup_file_extent(NULL, root, path, - btrfs_ino(BTRFS_I(inode)), offset, 0); + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), + offset, 0); if (ret < 0) - goto out; + return ret; if (ret == 1) { if (path->slots[0] == 0) { - /* can't find the item, must cow */ - ret = 0; - goto out; + /* Can't find the item, must COW. */ + return 0; } path->slots[0]--; } ret = 0; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != btrfs_ino(BTRFS_I(inode)) || + if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) { - /* not our file or wrong item type, must cow */ - goto out; + /* Not our file or wrong item type, must COW. */ + return 0; } if (key.offset > offset) { - /* Wrong offset, must cow */ - goto out; + /* Wrong offset, must COW. */ + return 0; } if (btrfs_file_extent_end(path) <= offset) - goto out; + return 0; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); found_type = btrfs_file_extent_type(leaf, fi); nocow_args.start = offset; nocow_args.end = offset + *len - 1; - nocow_args.strict = strict; nocow_args.free_path = true; - ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); + ret = can_nocow_file_extent(path, &key, inode, &nocow_args); /* can_nocow_file_extent() has freed the path. */ path = NULL; if (ret != 1) { /* Treat errors as not being able to NOCOW. */ - ret = 0; - goto out; + return 0; } - ret = 0; if (btrfs_extent_readonly(fs_info, nocow_args.file_extent.disk_bytenr + nocow_args.file_extent.offset)) - goto out; + return 0; - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + if (!(inode->flags & BTRFS_INODE_NODATACOW) && found_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 range_end; range_end = round_up(offset + nocow_args.file_extent.num_bytes, root->fs_info->sectorsize) - 1; - ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC); - if (ret) { - ret = -EAGAIN; - goto out; - } + ret = btrfs_test_range_bit_exists(io_tree, offset, range_end, + EXTENT_DELALLOC); + if (ret) + return -EAGAIN; } if (file_extent) memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent)); *len = nocow_args.file_extent.num_bytes; - ret = 1; -out: - btrfs_free_path(path); - return ret; + + return 1; } /* The callers of this must take lock_extent() */ @@ -7164,7 +7334,7 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, break; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) return ERR_PTR(-ENOMEM); @@ -7177,15 +7347,15 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, em->offset = file_extent->offset; em->flags |= EXTENT_FLAG_PINNED; if (type == BTRFS_ORDERED_COMPRESSED) - extent_map_set_compression(em, file_extent->compression); + btrfs_extent_map_set_compression(em, file_extent->compression); ret = btrfs_replace_extent_map_range(inode, em, true); if (ret) { - free_extent_map(em); + btrfs_free_extent_map(em); return ERR_PTR(ret); } - /* em got 2 refs now, callers needs to do free_extent_map once. */ + /* em got 2 refs now, callers needs to do btrfs_free_extent_map once. */ return em; } @@ -7199,13 +7369,13 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, static void wait_subpage_spinlock(struct folio *folio) { struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - struct btrfs_subpage *subpage; + struct btrfs_folio_state *bfs; - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) return; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - subpage = folio_get_private(folio); + bfs = folio_get_private(folio); /* * This may look insane as we just acquire the spinlock and release it, @@ -7218,14 +7388,14 @@ static void wait_subpage_spinlock(struct folio *folio) * Here we just acquire the spinlock so that all existing callers * should exit and we're safe to release/invalidate the page. */ - spin_lock_irq(&subpage->lock); - spin_unlock_irq(&subpage->lock); + spin_lock_irq(&bfs->lock); + spin_unlock_irq(&bfs->lock); } static int btrfs_launder_folio(struct folio *folio) { return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio), - PAGE_SIZE, NULL); + folio_size(folio), NULL); } static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) @@ -7312,7 +7482,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, } if (!inode_evicting) - lock_extent(tree, page_start, page_end, &cached_state); + btrfs_lock_extent(tree, page_start, page_end, &cached_state); cur = page_start; while (cur < page_end) { @@ -7368,10 +7538,10 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * btrfs_finish_ordered_io(). */ if (!inode_evicting) - clear_extent_bit(tree, cur, range_end, - EXTENT_DELALLOC | - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, &cached_state); + btrfs_clear_extent_bit(tree, cur, range_end, + EXTENT_DELALLOC | + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, &cached_state); spin_lock_irq(&inode->ordered_tree_lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); @@ -7413,12 +7583,11 @@ next: * Since the IO will never happen for this page. */ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL); - if (!inode_evicting) { - clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | - EXTENT_DELALLOC | EXTENT_UPTODATE | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG | - extra_flags, &cached_state); - } + if (!inode_evicting) + btrfs_clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG | extra_flags, + &cached_state); cur = range_end + 1; } /* @@ -7443,7 +7612,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_rsv *rsv; + struct btrfs_block_rsv rsv; int ret; struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; @@ -7485,11 +7654,9 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) * 2) fs_info->trans_block_rsv - this will have 1 items worth left for * updating the inode. */ - rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); - if (!rsv) - return -ENOMEM; - rsv->size = min_size; - rsv->failfast = true; + btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP); + rsv.size = min_size; + rsv.failfast = true; /* * 1 for the truncate slack space @@ -7502,7 +7669,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) } /* Migrate the slack space for the truncate to our reserve */ - ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, + ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv, min_size, false); /* * We have reserved 2 metadata units when we started the transaction and @@ -7514,7 +7681,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) goto out; } - trans->block_rsv = rsv; + trans->block_rsv = &rsv; while (1) { struct extent_state *cached_state = NULL; @@ -7522,7 +7689,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); control.new_size = new_size; - lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); + btrfs_lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* * We want to drop from the next block forward in case this new * size is not block aligned since we will be keeping the last @@ -7537,7 +7704,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); btrfs_inode_safe_disk_i_size_write(inode, control.last_size); - unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); + btrfs_unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) @@ -7557,9 +7724,9 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) break; } - btrfs_block_rsv_release(fs_info, rsv, -1, NULL); + btrfs_block_rsv_release(fs_info, &rsv, -1, NULL); ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, - rsv, min_size, false); + &rsv, min_size, false); /* * We have reserved 2 metadata units when we started the * transaction and min_size matches 1 unit, so this should never @@ -7568,7 +7735,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) if (WARN_ON(ret)) break; - trans->block_rsv = rsv; + trans->block_rsv = &rsv; } /* @@ -7581,7 +7748,8 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0); + ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, + inode->vfs_inode.i_size, (u64)-1); if (ret) goto out; trans = btrfs_start_transaction(root, 1); @@ -7606,7 +7774,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) btrfs_btree_balance_dirty(fs_info); } out: - btrfs_free_block_rsv(fs_info, rsv); + btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL); /* * So if we truncate and then write and fsync we normally would just * write the extents that changed, which is a problem if we need to @@ -7693,10 +7861,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->i_otime_nsec = 0; inode = &ei->vfs_inode; - extent_map_tree_init(&ei->extent_tree); + btrfs_extent_map_tree_init(&ei->extent_tree); /* This io tree sets the valid inode. */ - extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); + btrfs_extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); ei->io_tree.inode = ei; ei->file_extent_tree = NULL; @@ -7861,7 +8029,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap, generic_fillattr(idmap, request_mask, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; - stat->subvol = BTRFS_I(inode)->root->root_key.objectid; + stat->subvol = btrfs_root_id(BTRFS_I(inode)->root); stat->result_mask |= STATX_SUBVOL; spin_lock(&BTRFS_I(inode)->lock); @@ -7894,6 +8062,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret; int ret2; bool need_abort = false; + bool logs_pinned = false; struct fscrypt_name old_fname, new_fname; struct fscrypt_str *old_name, *new_name; @@ -8017,6 +8186,31 @@ static int btrfs_rename_exchange(struct inode *old_dir, inode_inc_iversion(new_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && + new_ino != BTRFS_FIRST_FREE_OBJECTID) { + /* + * If we are renaming in the same directory (and it's not for + * root entries) pin the log early to prevent any concurrent + * task from logging the directory after we removed the old + * entries and before we add the new entries, otherwise that + * task can sync a log without any entry for the inodes we are + * renaming and therefore replaying that log, if a power failure + * happens after syncing the log, would result in deleting the + * inodes. + * + * If the rename affects two different directories, we want to + * make sure the that there's no log commit that contains + * updates for only one of the directories but not for the + * other. + * + * If we are renaming an entry for a root, we don't care about + * log updates since we called btrfs_set_log_full_commit(). + */ + btrfs_pin_log_trans(root); + btrfs_pin_log_trans(dest); + logs_pinned = true; + } + if (old_dentry->d_parent != new_dentry->d_parent) { btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); @@ -8027,31 +8221,45 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_name, &old_rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_name, &new_rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), @@ -8074,30 +8282,23 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode)->dir_index = new_idx; /* - * Now pin the logs of the roots. We do it to ensure that no other task - * can sync the logs while we are in progress with the rename, because - * that could result in an inconsistency in case any of the inodes that - * are part of this rename operation were logged before. + * Do the log updates for all inodes. + * + * If either entry is for a root we don't need to update the logs since + * we've called btrfs_set_log_full_commit() before. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_pin_log_trans(root); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_pin_log_trans(dest); - - /* Do the log updates for all inodes. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + if (logs_pinned) { btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), old_rename_ctx.index, new_dentry->d_parent); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), new_rename_ctx.index, old_dentry->d_parent); + } - /* Now unpin the logs. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) +out_fail: + if (logs_pinned) { btrfs_end_log_trans(root); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_end_log_trans(dest); -out_fail: + } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -8147,6 +8348,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); struct fscrypt_name old_fname, new_fname; + bool logs_pinned = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -8281,22 +8483,52 @@ static int btrfs_rename(struct mnt_idmap *idmap, inode_inc_iversion(old_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + /* + * If we are renaming in the same directory (and it's not a + * root entry) pin the log to prevent any concurrent task from + * logging the directory after we removed the old entry and + * before we add the new entry, otherwise that task can sync + * a log without any entry for the inode we are renaming and + * therefore replaying that log, if a power failure happens + * after syncing the log, would result in deleting the inode. + * + * If the rename affects two different directories, we want to + * make sure the that there's no log commit that contains + * updates for only one of the directories but not for the + * other. + * + * If we are renaming an entry for a root, we don't care about + * log updates since we called btrfs_set_log_full_commit(). + */ + btrfs_pin_log_trans(root); + btrfs_pin_log_trans(dest); + logs_pinned = true; + } + if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), &old_fname.disk_name, &rename_ctx); - if (!ret) - ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); - } - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } if (new_inode) { @@ -8304,18 +8536,27 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), &new_fname.disk_name); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } - if (!ret && new_inode->i_nlink == 0) + if (new_inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, BTRFS_I(d_inode(new_dentry))); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_fail; + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_fail; + } } } @@ -8329,7 +8570,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + if (logs_pinned) btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), rename_ctx.index, new_dentry->d_parent); @@ -8345,6 +8586,10 @@ static int btrfs_rename(struct mnt_idmap *idmap, } } out_fail: + if (logs_pinned) { + btrfs_end_log_trans(root); + btrfs_end_log_trans(dest); + } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -8430,8 +8675,6 @@ static int start_delalloc_inodes(struct btrfs_root *root, struct writeback_control *wbc, bool snapshot, bool in_reclaim_context) { - struct btrfs_inode *binode; - struct inode *inode; struct btrfs_delalloc_work *work, *next; LIST_HEAD(works); LIST_HEAD(splice); @@ -8442,30 +8685,30 @@ static int start_delalloc_inodes(struct btrfs_root *root, spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { - binode = list_entry(splice.next, struct btrfs_inode, - delalloc_inodes); + struct btrfs_inode *inode; + struct inode *tmp_inode; + + inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes); - list_move_tail(&binode->delalloc_inodes, - &root->delalloc_inodes); + list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes); if (in_reclaim_context && - test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags)) + test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags)) continue; - inode = igrab(&binode->vfs_inode); - if (!inode) { + tmp_inode = igrab(&inode->vfs_inode); + if (!tmp_inode) { cond_resched_lock(&root->delalloc_lock); continue; } spin_unlock(&root->delalloc_lock); if (snapshot) - set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, - &binode->runtime_flags); + set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags); if (full_flush) { - work = btrfs_alloc_delalloc_work(inode); + work = btrfs_alloc_delalloc_work(&inode->vfs_inode); if (!work) { - iput(inode); + iput(&inode->vfs_inode); ret = -ENOMEM; goto out; } @@ -8473,8 +8716,8 @@ static int start_delalloc_inodes(struct btrfs_root *root, btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { - ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc); + btrfs_add_delayed_iput(inode); if (ret || wbc->nr_to_write <= 0) goto out; } @@ -8583,7 +8826,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, .dentry = dentry, }; unsigned int trans_num_items; - int err; + int ret; int name_len; int datasize; unsigned long ptr; @@ -8591,7 +8834,12 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct extent_buffer *leaf; name_len = strlen(symname); - if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) + /* + * Symlinks utilize uncompressed inline extent data, which should not + * reach block size. + */ + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || + name_len >= fs_info->sectorsize) return -ENAMETOOLONG; inode = new_inode(dir->i_sb); @@ -8605,38 +8853,37 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, inode_set_bytes(inode, name_len); new_inode_args.inode = inode; - err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); - if (err) + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); + if (ret) goto out_inode; /* 1 additional item for the inline extent */ trans_num_items++; trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { - err = PTR_ERR(trans); + ret = PTR_ERR(trans); goto out_new_inode_args; } - err = btrfs_create_new_inode(trans, &new_inode_args); - if (err) + ret = btrfs_create_new_inode(trans, &new_inode_args); + if (ret) goto out; path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; - btrfs_abort_transaction(trans, err); + ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); discard_new_inode(inode); inode = NULL; goto out; } key.objectid = btrfs_ino(BTRFS_I(inode)); - key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(name_len); - err = btrfs_insert_empty_item(trans, root, path, &key, - datasize); - if (err) { - btrfs_abort_transaction(trans, err); + ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); + if (ret) { + btrfs_abort_transaction(trans, ret); btrfs_free_path(path); discard_new_inode(inode); inode = NULL; @@ -8655,20 +8902,19 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, ptr = btrfs_file_extent_inline_start(ei); write_extent_buffer(leaf, symname, ptr, name_len); - btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); d_instantiate_new(dentry, inode); - err = 0; + ret = 0; out: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: - if (err) + if (ret) iput(inode); - return err; + return ret; } static struct btrfs_trans_handle *insert_prealloc_file_extent( @@ -8805,11 +9051,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_free_reserved_extent(fs_info, ins.objectid, - ins.offset, 0); + ins.offset, false); break; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset, cur_offset + ins.offset - 1, false); @@ -8827,7 +9073,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, em->generation = trans->transid; ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true); - free_extent_map(em); + btrfs_free_extent_map(em); next: num_bytes -= ins.offset; cur_offset += ins.offset; @@ -8999,7 +9245,7 @@ static ssize_t btrfs_encoded_read_inline( struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_file_extent_item *item; u64 ram_bytes; @@ -9009,10 +9255,8 @@ static ssize_t btrfs_encoded_read_inline( const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; path->nowait = nowait; @@ -9021,9 +9265,9 @@ static ssize_t btrfs_encoded_read_inline( if (ret) { if (ret > 0) { /* The extent item disappeared? */ - ret = -EIO; + return -EIO; } - goto out; + return ret; } leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -9036,17 +9280,16 @@ static ssize_t btrfs_encoded_read_inline( ret = btrfs_encoded_io_compression_from_extent(fs_info, btrfs_file_extent_compression(leaf, item)); if (ret < 0) - goto out; + return ret; encoded->compression = ret; if (encoded->compression) { size_t inline_size; inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); - if (inline_size > count) { - ret = -ENOBUFS; - goto out; - } + if (inline_size > count) + return -ENOBUFS; + count = inline_size; encoded->unencoded_len = ram_bytes; encoded->unencoded_offset = iocb->ki_pos - extent_start; @@ -9058,13 +9301,12 @@ static ssize_t btrfs_encoded_read_inline( } tmp = kmalloc(count, GFP_NOFS); - if (!tmp) { - ret = -ENOMEM; - goto out; - } + if (!tmp) + return -ENOMEM; + read_extent_buffer(leaf, tmp, ptr, count); btrfs_release_path(path); - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; @@ -9072,15 +9314,14 @@ static ssize_t btrfs_encoded_read_inline( if (ret != count) ret = -EFAULT; kfree(tmp); -out: - btrfs_free_path(path); + return ret; } struct btrfs_encoded_read_private { - wait_queue_head_t wait; + struct completion *sync_reads; void *uring_ctx; - atomic_t pending; + refcount_t pending_refs; blk_status_t status; }; @@ -9090,23 +9331,22 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) if (bbio->bio.bi_status) { /* - * The memory barrier implied by the atomic_dec_return() here - * pairs with the memory barrier implied by the - * atomic_dec_return() or io_wait_event() in - * btrfs_encoded_read_regular_fill_pages() to ensure that this - * write is observed before the load of status in + * The memory barrier implied by the refcount_dec_and_test() here + * pairs with the memory barrier implied by the refcount_dec_and_test() + * in btrfs_encoded_read_regular_fill_pages() to ensure that + * this write is observed before the load of status in * btrfs_encoded_read_regular_fill_pages(). */ WRITE_ONCE(priv->status, bbio->bio.bi_status); } - if (atomic_dec_and_test(&priv->pending)) { + if (refcount_dec_and_test(&priv->pending_refs)) { int err = blk_status_to_errno(READ_ONCE(priv->status)); if (priv->uring_ctx) { btrfs_uring_read_extent_endio(priv->uring_ctx, err); kfree(priv); } else { - wake_up(&priv->wait); + complete(priv->sync_reads); } } bio_put(&bbio->bio); @@ -9117,17 +9357,27 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, struct page **pages, void *uring_ctx) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_encoded_read_private *priv; + struct btrfs_encoded_read_private *priv, sync_priv; + struct completion sync_reads; unsigned long i = 0; struct btrfs_bio *bbio; int ret; - priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); - if (!priv) - return -ENOMEM; + /* + * Fast path for synchronous reads which completes in this call, io_uring + * needs longer time span. + */ + if (uring_ctx) { + priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); + if (!priv) + return -ENOMEM; + } else { + priv = &sync_priv; + init_completion(&sync_reads); + priv->sync_reads = &sync_reads; + } - init_waitqueue_head(&priv->wait); - atomic_set(&priv->pending, 1); + refcount_set(&priv->pending_refs, 1); priv->status = 0; priv->uring_ctx = uring_ctx; @@ -9140,7 +9390,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { - atomic_inc(&priv->pending); + refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, @@ -9155,11 +9405,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, disk_io_size -= bytes; } while (disk_io_size); - atomic_inc(&priv->pending); + refcount_inc(&priv->pending_refs); btrfs_submit_bbio(bbio, 0); if (uring_ctx) { - if (atomic_dec_return(&priv->pending) == 0) { + if (refcount_dec_and_test(&priv->pending_refs)) { ret = blk_status_to_errno(READ_ONCE(priv->status)); btrfs_uring_read_extent_endio(uring_ctx, ret); kfree(priv); @@ -9168,12 +9418,10 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, return -EIOCBQUEUED; } else { - if (atomic_dec_return(&priv->pending) != 0) - io_wait_event(priv->wait, !atomic_read(&priv->pending)); + if (!refcount_dec_and_test(&priv->pending_refs)) + wait_for_completion_io(&sync_reads); /* See btrfs_encoded_read_endio() for ordering. */ - ret = blk_status_to_errno(READ_ONCE(priv->status)); - kfree(priv); - return ret; + return blk_status_to_errno(READ_ONCE(priv->status)); } } @@ -9206,7 +9454,7 @@ ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, if (ret) goto out; - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; @@ -9283,7 +9531,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, goto out_unlock_inode; } - if (!try_lock_extent(io_tree, start, lockend, cached_state)) { + if (!btrfs_try_lock_extent(io_tree, start, lockend, cached_state)) { ret = -EAGAIN; goto out_unlock_inode; } @@ -9292,7 +9540,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, lockend - start + 1); if (ordered) { btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); ret = -EAGAIN; goto out_unlock_inode; } @@ -9305,13 +9553,13 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, if (ret) goto out_unlock_inode; - lock_extent(io_tree, start, lockend, cached_state); + btrfs_lock_extent(io_tree, start, lockend, cached_state); ordered = btrfs_lookup_ordered_range(inode, start, lockend - start + 1); if (!ordered) break; btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); cond_resched(); } } @@ -9329,7 +9577,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, * For inline extents we get everything we need out of the * extent item. */ - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, cached_state, extent_start, @@ -9341,7 +9589,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, * We only want to return up to EOF even if the extent extends beyond * that. */ - encoded->len = min_t(u64, extent_map_end(em), + encoded->len = min_t(u64, btrfs_extent_map_end(em), inode->vfs_inode.i_size) - iocb->ki_pos; if (em->disk_bytenr == EXTENT_MAP_HOLE || (em->flags & EXTENT_FLAG_PREALLOC)) { @@ -9349,7 +9597,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, count = min_t(u64, count, encoded->len); encoded->len = count; encoded->unencoded_len = count; - } else if (extent_map_is_compressed(em)) { + } else if (btrfs_extent_map_is_compressed(em)) { *disk_bytenr = em->disk_bytenr; /* * Bail if the buffer isn't large enough to return the whole @@ -9364,12 +9612,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, encoded->unencoded_len = em->ram_bytes; encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset); ret = btrfs_encoded_io_compression_from_extent(fs_info, - extent_map_compression(em)); + btrfs_extent_map_compression(em)); if (ret < 0) goto out_em; encoded->compression = ret; } else { - *disk_bytenr = extent_map_block_start(em) + (start - em->start); + *disk_bytenr = btrfs_extent_map_block_start(em) + (start - em->start); if (encoded->len > count) encoded->len = count; /* @@ -9382,11 +9630,11 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, encoded->unencoded_len = count; *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize); } - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; if (*disk_bytenr == EXTENT_MAP_HOLE) { - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); unlocked = true; ret = iov_iter_zero(count, iter); @@ -9398,11 +9646,11 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, } out_em: - free_extent_map(em); + btrfs_free_extent_map(em); out_unlock_extent: /* Leave inode and extent locked if we need to do a read. */ if (!unlocked && ret != -EIOCBQUEUED) - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); out_unlock_inode: if (!unlocked && ret != -EIOCBQUEUED) btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); @@ -9549,14 +9797,14 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, end >> PAGE_SHIFT); if (ret) goto out_folios; - lock_extent(io_tree, start, end, &cached_state); + btrfs_lock_extent(io_tree, start, end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); if (!ordered && !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) break; if (ordered) btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); cond_resched(); } @@ -9606,11 +9854,11 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, ret = PTR_ERR(em); goto out_free_reserved; } - free_extent_map(em); + btrfs_free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, - (1 << BTRFS_ORDERED_ENCODED) | - (1 << BTRFS_ORDERED_COMPRESSED)); + (1U << BTRFS_ORDERED_ENCODED) | + (1U << BTRFS_ORDERED_COMPRESSED)); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); @@ -9621,7 +9869,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (start + encoded->len > inode->vfs_inode.i_size) i_size_write(&inode->vfs_inode, start + encoded->len); - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); btrfs_delalloc_release_extents(inode, num_bytes); @@ -9631,7 +9879,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, out_free_reserved: btrfs_dec_block_group_reservations(fs_info, ins.objectid); - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); out_delalloc_release: btrfs_delalloc_release_extents(inode, num_bytes); btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); @@ -9644,9 +9892,9 @@ out_free_data_space: * bytes_may_use. */ if (!extent_reserved) - btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); + btrfs_free_reserved_data_space_noquota(inode, disk_num_bytes); out_unlock: - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); out_folios: for (i = 0; i < nr_folios; i++) { if (folios[i]) @@ -9799,15 +10047,25 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - struct extent_map *em = NULL; struct btrfs_chunk_map *map = NULL; struct btrfs_device *device = NULL; struct btrfs_swap_info bsi = { .lowest_ppage = (sector_t)-1ULL, }; + struct btrfs_backref_share_check_ctx *backref_ctx = NULL; + struct btrfs_path *path = NULL; int ret = 0; u64 isize; - u64 start; + u64 prev_extent_end = 0; + + /* + * Acquire the inode's mmap lock to prevent races with memory mapped + * writes, as they could happen after we flush delalloc below and before + * we lock the extent range further below. The inode was already locked + * up in the call chain. + */ + btrfs_assert_inode_locked(BTRFS_I(inode)); + down_write(&BTRFS_I(inode)->i_mmap_lock); /* * If the swap file was just created, make sure delalloc is done. If the @@ -9816,22 +10074,32 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, */ ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) - return ret; + goto out_unlock_mmap; /* * The inode is locked, so these flags won't change after we check them. */ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { btrfs_warn(fs_info, "swapfile must not be compressed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_warn(fs_info, "swapfile must not be checksummed"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; + } + + path = btrfs_alloc_path(); + backref_ctx = btrfs_alloc_backref_share_check_ctx(); + if (!path || !backref_ctx) { + ret = -ENOMEM; + goto out_unlock_mmap; } /* @@ -9846,7 +10114,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); - return -EBUSY; + ret = -EBUSY; + goto out_unlock_mmap; } /* @@ -9860,7 +10129,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because snapshot creation is in progress"); - return -EINVAL; + ret = -EINVAL; + goto out_unlock_mmap; } /* * Snapshots can create extents which require COW even if NODATACOW is @@ -9881,32 +10151,48 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, btrfs_warn(fs_info, "cannot activate swapfile because subvolume %llu is being deleted", btrfs_root_id(root)); - return -EPERM; + ret = -EPERM; + goto out_unlock_mmap; } atomic_inc(&root->nr_swapfiles); spin_unlock(&root->root_item_lock); isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); - lock_extent(io_tree, 0, isize - 1, &cached_state); - start = 0; - while (start < isize) { - u64 logical_block_start, physical_block_start; + btrfs_lock_extent(io_tree, 0, isize - 1, &cached_state); + while (prev_extent_end < isize) { + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *ei; struct btrfs_block_group *bg; - u64 len = isize - start; + u64 logical_block_start; + u64 physical_block_start; + u64 extent_gen; + u64 disk_bytenr; + u64 len; - em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + key.objectid = btrfs_ino(BTRFS_I(inode)); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = prev_extent_end; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) goto out; - } - if (em->disk_bytenr == EXTENT_MAP_HOLE) { + /* + * If key not found it means we have an implicit hole (NO_HOLES + * is enabled). + */ + if (ret > 0) { btrfs_warn(fs_info, "swapfile must not have holes"); ret = -EINVAL; goto out; } - if (em->disk_bytenr == EXTENT_MAP_INLINE) { + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { /* * It's unlikely we'll ever actually find ourselves * here, as a file small enough to fit inline won't be @@ -9918,23 +10204,45 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = -EINVAL; goto out; } - if (extent_map_is_compressed(em)) { + + if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { btrfs_warn(fs_info, "swapfile must not be compressed"); ret = -EINVAL; goto out; } - logical_block_start = extent_map_block_start(em) + (start - em->start); - len = min(len, em->len - (start - em->start)); - free_extent_map(em); - em = NULL; + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + if (disk_bytenr == 0) { + btrfs_warn(fs_info, "swapfile must not have holes"); + ret = -EINVAL; + goto out; + } + + logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei); + extent_gen = btrfs_file_extent_generation(leaf, ei); + prev_extent_end = btrfs_file_extent_end(path); + + if (prev_extent_end > isize) + len = isize - key.offset; + else + len = btrfs_file_extent_num_bytes(leaf, ei); + + backref_ctx->curr_leaf_bytenr = leaf->start; - ret = can_nocow_extent(inode, start, &len, NULL, false, true); + /* + * Don't need the path anymore, release to avoid deadlocks when + * calling btrfs_is_data_extent_shared() because when joining a + * transaction it can block waiting for the current one's commit + * which in turn may be trying to lock the same leaf to flush + * delayed items for example. + */ + btrfs_release_path(path); + + ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr, + extent_gen, backref_ctx); if (ret < 0) { goto out; - } else if (ret) { - ret = 0; - } else { + } else if (ret > 0) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); ret = -EINVAL; @@ -9969,7 +10277,6 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, physical_block_start = (map->stripes[0].physical + (logical_block_start - map->start)); - len = min(len, map->chunk_len - (logical_block_start - map->start)); btrfs_free_chunk_map(map); map = NULL; @@ -10010,24 +10317,27 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) goto out; } - bsi.start = start; + bsi.start = key.offset; bsi.block_start = physical_block_start; bsi.block_len = len; } - start += len; + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + + cond_resched(); } if (bsi.block_len) ret = btrfs_add_swap_extent(sis, &bsi); out: - if (!IS_ERR_OR_NULL(em)) - free_extent_map(em); if (!IS_ERR_OR_NULL(map)) btrfs_free_chunk_map(map); - unlock_extent(io_tree, 0, isize - 1, &cached_state); + btrfs_unlock_extent(io_tree, 0, isize - 1, &cached_state); if (ret) btrfs_swap_deactivate(file); @@ -10036,6 +10346,10 @@ out: btrfs_exclop_finish(fs_info); +out_unlock_mmap: + up_write(&BTRFS_I(inode)->i_mmap_lock); + btrfs_free_backref_share_ctx(backref_ctx); + btrfs_free_path(path); if (ret) return ret; @@ -10044,7 +10358,6 @@ out: *span = bsi.highest_ppage - bsi.lowest_ppage + 1; sis->max = bsi.nr_pages; sis->pages = bsi.nr_pages - 1; - sis->highest_bit = bsi.nr_pages - 1; return bsi.nr_extents; } #else |