diff options
author | Omar Sandoval <osandov@fb.com> | 2019-08-14 02:00:02 +0300 |
---|---|---|
committer | David Sterba <dsterba@suse.com> | 2022-03-14 15:13:51 +0300 |
commit | 7c0c7269f7b508ba6e4b063a9314d6bd1fb6db22 (patch) | |
tree | 97c42d1bcd553b2ef14e5b2494b4f303585e09b1 /fs/btrfs/inode.c | |
parent | 1881fba89bd5dcd364d2e1bf561912a90a11c21a (diff) | |
download | linux-7c0c7269f7b508ba6e4b063a9314d6bd1fb6db22.tar.xz |
btrfs: add BTRFS_IOC_ENCODED_WRITE
The implementation resembles direct I/O: we have to flush any ordered
extents, invalidate the page cache, and do the io tree/delalloc/extent
map/ordered extent dance. From there, we can reuse the compression code
with a minor modification to distinguish the write from writeback. This
also creates inline extents when possible.
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- | fs/btrfs/inode.c | 249 |
1 files changed, 247 insertions, 2 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 505cd2051181..964eab60742f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1001,7 +1001,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode, async_extent->pages, /* compressed_pages */ async_extent->nr_pages, async_chunk->write_flags, - async_chunk->blkcg_css)) { + async_chunk->blkcg_css, true)) { const u64 start = async_extent->start; const u64 end = start + async_extent->ram_size - 1; @@ -3001,6 +3001,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, * except if the ordered extent was truncated. */ update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) || + test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) || test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), @@ -3035,7 +3036,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && - !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) + !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags)) clear_bits |= EXTENT_DELALLOC_NEW; freespace_inode = btrfs_is_free_space_inode(inode); @@ -10654,6 +10656,249 @@ out_unlock_inode: return ret; } +ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; + struct extent_changeset *data_reserved = NULL; + struct extent_state *cached_state = NULL; + int compression; + size_t orig_count; + u64 start, end; + u64 num_bytes, ram_bytes, disk_num_bytes; + unsigned long nr_pages, i; + struct page **pages; + struct btrfs_key ins; + bool extent_reserved = false; + struct extent_map *em; + ssize_t ret; + + switch (encoded->compression) { + case BTRFS_ENCODED_IO_COMPRESSION_ZLIB: + compression = BTRFS_COMPRESS_ZLIB; + break; + case BTRFS_ENCODED_IO_COMPRESSION_ZSTD: + compression = BTRFS_COMPRESS_ZSTD; + break; + case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K: + case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K: + /* The sector size must match for LZO. */ + if (encoded->compression - + BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 != + fs_info->sectorsize_bits) + return -EINVAL; + compression = BTRFS_COMPRESS_LZO; + break; + default: + return -EINVAL; + } + if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE) + return -EINVAL; + + orig_count = iov_iter_count(from); + + /* The extent size must be sane. */ + if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED || + orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0) + return -EINVAL; + + /* + * The compressed data must be smaller than the decompressed data. + * + * It's of course possible for data to compress to larger or the same + * size, but the buffered I/O path falls back to no compression for such + * data, and we don't want to break any assumptions by creating these + * extents. + * + * Note that this is less strict than the current check we have that the + * compressed data must be at least one sector smaller than the + * decompressed data. We only want to enforce the weaker requirement + * from old kernels that it is at least one byte smaller. + */ + if (orig_count >= encoded->unencoded_len) + return -EINVAL; + + /* The extent must start on a sector boundary. */ + start = iocb->ki_pos; + if (!IS_ALIGNED(start, fs_info->sectorsize)) + return -EINVAL; + + /* + * The extent must end on a sector boundary. However, we allow a write + * which ends at or extends i_size to have an unaligned length; we round + * up the extent size and set i_size to the unaligned end. + */ + if (start + encoded->len < inode->vfs_inode.i_size && + !IS_ALIGNED(start + encoded->len, fs_info->sectorsize)) + return -EINVAL; + + /* Finally, the offset in the unencoded data must be sector-aligned. */ + if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize)) + return -EINVAL; + + num_bytes = ALIGN(encoded->len, fs_info->sectorsize); + ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize); + end = start + num_bytes - 1; + + /* + * If the extent cannot be inline, the compressed data on disk must be + * sector-aligned. For convenience, we extend it with zeroes if it + * isn't. + */ + disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); + nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); + pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT); + if (!pages) + return -ENOMEM; + for (i = 0; i < nr_pages; i++) { + size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); + char *kaddr; + + pages[i] = alloc_page(GFP_KERNEL_ACCOUNT); + if (!pages[i]) { + ret = -ENOMEM; + goto out_pages; + } + kaddr = kmap(pages[i]); + if (copy_from_iter(kaddr, bytes, from) != bytes) { + kunmap(pages[i]); + ret = -EFAULT; + goto out_pages; + } + if (bytes < PAGE_SIZE) + memset(kaddr + bytes, 0, PAGE_SIZE - bytes); + kunmap(pages[i]); + } + + for (;;) { + struct btrfs_ordered_extent *ordered; + + ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes); + if (ret) + goto out_pages; + ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, + start >> PAGE_SHIFT, + end >> PAGE_SHIFT); + if (ret) + goto out_pages; + lock_extent_bits(io_tree, start, end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); + if (!ordered && + !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) + break; + if (ordered) + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(io_tree, start, end, &cached_state); + cond_resched(); + } + + /* + * We don't use the higher-level delalloc space functions because our + * num_bytes and disk_num_bytes are different. + */ + ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes); + if (ret) + goto out_unlock; + ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes); + if (ret) + goto out_free_data_space; + ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes); + if (ret) + goto out_qgroup_free_data; + + /* Try an inline extent first. */ + if (start == 0 && encoded->unencoded_len == encoded->len && + encoded->unencoded_offset == 0) { + ret = cow_file_range_inline(inode, encoded->len, orig_count, + compression, pages, true); + if (ret <= 0) { + if (ret == 0) + ret = orig_count; + goto out_delalloc_release; + } + } + + ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes, + disk_num_bytes, 0, 0, &ins, 1, 1); + if (ret) + goto out_delalloc_release; + extent_reserved = true; + + em = create_io_em(inode, start, num_bytes, + start - encoded->unencoded_offset, ins.objectid, + ins.offset, ins.offset, ram_bytes, compression, + BTRFS_ORDERED_COMPRESSED); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_free_reserved; + } + free_extent_map(em); + + ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes, + ins.objectid, ins.offset, + encoded->unencoded_offset, + (1 << BTRFS_ORDERED_ENCODED) | + (1 << BTRFS_ORDERED_COMPRESSED), + compression); + if (ret) { + btrfs_drop_extent_cache(inode, start, end, 0); + goto out_free_reserved; + } + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + + if (start + encoded->len > inode->vfs_inode.i_size) + i_size_write(&inode->vfs_inode, start + encoded->len); + + unlock_extent_cached(io_tree, start, end, &cached_state); + + btrfs_delalloc_release_extents(inode, num_bytes); + + if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid, + ins.offset, pages, nr_pages, 0, NULL, + false)) { + btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0); + ret = -EIO; + goto out_pages; + } + ret = orig_count; + goto out; + +out_free_reserved: + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); +out_delalloc_release: + btrfs_delalloc_release_extents(inode, num_bytes); + btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); +out_qgroup_free_data: + if (ret < 0) + btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes); +out_free_data_space: + /* + * If btrfs_reserve_extent() succeeded, then we already decremented + * bytes_may_use. + */ + if (!extent_reserved) + btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); +out_unlock: + unlock_extent_cached(io_tree, start, end, &cached_state); +out_pages: + for (i = 0; i < nr_pages; i++) { + if (pages[i]) + __free_page(pages[i]); + } + kvfree(pages); +out: + if (ret >= 0) + iocb->ki_pos += encoded->len; + return ret; +} + #ifdef CONFIG_SWAP /* * Add an entry indicating a block group or device which is pinned by a |