diff options
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- | fs/btrfs/inode.c | 1068 |
1 files changed, 469 insertions, 599 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dbbb67293e34..f09fbdc43f0f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -124,11 +124,11 @@ static struct kmem_cache *btrfs_inode_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); -static noinline int cow_file_range(struct btrfs_inode *inode, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock, - u64 *done_offset); + +static noinline int run_delalloc_cow(struct btrfs_inode *inode, + struct page *locked_page, u64 start, + u64 end, struct writeback_control *wbc, + bool pages_dirty); static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, @@ -423,11 +423,10 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, while (index <= end_index) { /* - * For locked page, we will call end_extent_writepage() on it - * in run_delalloc_range() for the error handling. That - * end_extent_writepage() function will call - * btrfs_mark_ordered_io_finished() to clear page Ordered and - * run the ordered extent accounting. + * For locked page, we will call btrfs_mark_ordered_io_finished() + * on it in run_delalloc_range() for the error handling, which + * will clear page Ordered and run the ordered extent + * accounting. * * Here we can't just clear the Ordered bit, or * btrfs_mark_ordered_io_finished() would skip the accounting @@ -815,24 +814,22 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, } /* - * we create compressed extents in two phases. The first - * phase compresses a range of pages that have already been - * locked (both pages and state bits are locked). + * Work queue call back to start compression on a file and pages. 
* - * This is done inside an ordered work queue, and the compression - * is spread across many cpus. The actual IO submission is step - * two, and the ordered work queue takes care of making sure that - * happens in the same order things were put onto the queue by - * writepages and friends. + * This is done inside an ordered work queue, and the compression is spread + * across many cpus. The actual IO submission is step two, and the ordered work + * queue takes care of making sure that happens in the same order things were + * put onto the queue by writepages and friends. * - * If this code finds it can't get good compression, it puts an - * entry onto the work queue to write the uncompressed bytes. This - * makes sure that both compressed inodes and uncompressed inodes - * are written in the same order that the flusher thread sent them - * down. + * If this code finds it can't get good compression, it puts an entry onto the + * work queue to write the uncompressed bytes. This makes sure that both + * compressed inodes and uncompressed inodes are written in the same order that + * the flusher thread sent them down. 
*/ -static noinline int compress_file_range(struct async_chunk *async_chunk) +static void compress_file_range(struct btrfs_work *work) { + struct async_chunk *async_chunk = + container_of(work, struct async_chunk, work); struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; @@ -842,19 +839,24 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) u64 actual_end; u64 i_size; int ret = 0; - struct page **pages = NULL; + struct page **pages; unsigned long nr_pages; unsigned long total_compressed = 0; unsigned long total_in = 0; + unsigned int poff; int i; - int will_compress; int compress_type = fs_info->compress_type; - int compressed_extents = 0; - int redirty = 0; inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); /* + * We need to call clear_page_dirty_for_io on each page in the range. + * Otherwise applications with the file mmap'd can wander in and change + * the page contents while we are compressing them. + */ + extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + + /* * We need to save i_size before now because it could change in between * us evaluating the size and assigning it. This is because we lock and * unlock the page in truncate and fallocate, and then modify the i_size @@ -868,7 +870,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) barrier(); actual_end = min_t(u64, i_size, end + 1); again: - will_compress = 0; + pages = NULL; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES); @@ -912,78 +914,57 @@ again: ret = 0; /* - * we do compression for mount -o compress and when the - * inode has not been flagged as nocompress. This flag can - * change at any time if we discover bad compression ratios. 
+ * We do compression for mount -o compress and when the inode has not + * been flagged as NOCOMPRESS. This flag can change at any time if we + * discover bad compression ratios. */ - if (inode_need_compress(inode, start, end)) { - WARN_ON(pages); - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) { - /* just bail out to the uncompressed code */ - nr_pages = 0; - goto cont; - } - - if (inode->defrag_compress) - compress_type = inode->defrag_compress; - else if (inode->prop_compress) - compress_type = inode->prop_compress; + if (!inode_need_compress(inode, start, end)) + goto cleanup_and_bail_uncompressed; + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) { /* - * we need to call clear_page_dirty_for_io on each - * page in the range. Otherwise applications with the file - * mmap'd can wander in and change the page contents while - * we are compressing them. - * - * If the compression fails for any reason, we set the pages - * dirty again later on. - * - * Note that the remaining part is redirtied, the start pointer - * has moved, the end is the original one. + * Memory allocation failure is not a fatal error, we can fall + * back to uncompressed code. */ - if (!redirty) { - extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); - redirty = 1; - } + goto cleanup_and_bail_uncompressed; + } - /* Compression level is applied here and only here */ - ret = btrfs_compress_pages( - compress_type | (fs_info->compress_level << 4), - mapping, start, - pages, - &nr_pages, - &total_in, - &total_compressed); + if (inode->defrag_compress) + compress_type = inode->defrag_compress; + else if (inode->prop_compress) + compress_type = inode->prop_compress; + + /* Compression level is applied here. 
*/ + ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4), + mapping, start, pages, &nr_pages, &total_in, + &total_compressed); + if (ret) + goto mark_incompressible; - if (!ret) { - unsigned long offset = offset_in_page(total_compressed); - struct page *page = pages[nr_pages - 1]; + /* + * Zero the tail end of the last page, as we might be sending it down + * to disk. + */ + poff = offset_in_page(total_compressed); + if (poff) + memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff); - /* zero the tail end of the last page, we might be - * sending it down to disk - */ - if (offset) - memzero_page(page, offset, PAGE_SIZE - offset); - will_compress = 1; - } - } -cont: /* + * Try to create an inline extent. + * + * If we didn't compress the entire range, try to create an uncompressed + * inline extent, else a compressed one. + * * Check cow_file_range() for why we don't even try to create inline - * extent for subpage case. + * extent for the subpage case. */ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { - /* lets try to make an inline extent */ - if (ret || total_in < actual_end) { - /* we didn't compress the entire range, try - * to make an uncompressed inline extent. - */ - ret = cow_file_range_inline(inode, actual_end, - 0, BTRFS_COMPRESS_NONE, - NULL, false); + if (total_in < actual_end) { + ret = cow_file_range_inline(inode, actual_end, 0, + BTRFS_COMPRESS_NONE, NULL, + false); } else { - /* try making a compressed inline extent */ ret = cow_file_range_inline(inode, actual_end, total_compressed, compress_type, pages, @@ -1013,99 +994,52 @@ cont: PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); - - /* - * Ensure we only free the compressed pages if we have - * them allocated, as we can still reach here with - * inode_need_compress() == false. 
- */ - if (pages) { - for (i = 0; i < nr_pages; i++) { - WARN_ON(pages[i]->mapping); - put_page(pages[i]); - } - kfree(pages); - } - return 0; + goto free_pages; } } - if (will_compress) { - /* - * we aren't doing an inline extent round the compressed size - * up to a block size boundary so the allocator does sane - * things - */ - total_compressed = ALIGN(total_compressed, blocksize); + /* + * We aren't doing an inline extent. Round the compressed size up to a + * block size boundary so the allocator does sane things. + */ + total_compressed = ALIGN(total_compressed, blocksize); - /* - * one last check to make sure the compression is really a - * win, compare the page count read with the blocks on disk, - * compression must free at least one sector size - */ - total_in = round_up(total_in, fs_info->sectorsize); - if (total_compressed + blocksize <= total_in) { - compressed_extents++; + /* + * One last check to make sure the compression is really a win, compare + * the page count read with the blocks on disk, compression must free at + * least one sector. + */ + total_in = round_up(total_in, fs_info->sectorsize); + if (total_compressed + blocksize > total_in) + goto mark_incompressible; - /* - * The async work queues will take care of doing actual - * allocation on disk for these compressed pages, and - * will submit them to the elevator. - */ - add_async_extent(async_chunk, start, total_in, - total_compressed, pages, nr_pages, - compress_type); - - if (start + total_in < end) { - start += total_in; - pages = NULL; - cond_resched(); - goto again; - } - return compressed_extents; - } + /* + * The async work queues will take care of doing actual allocation on + * disk for these compressed pages, and will submit the bios. 
+ */ + add_async_extent(async_chunk, start, total_in, total_compressed, pages, + nr_pages, compress_type); + if (start + total_in < end) { + start += total_in; + cond_resched(); + goto again; } + return; + +mark_incompressible: + if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) + inode->flags |= BTRFS_INODE_NOCOMPRESS; +cleanup_and_bail_uncompressed: + add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, + BTRFS_COMPRESS_NONE); +free_pages: if (pages) { - /* - * the compression code ran but failed to make things smaller, - * free any pages it allocated and our page pointer array - */ for (i = 0; i < nr_pages; i++) { WARN_ON(pages[i]->mapping); put_page(pages[i]); } kfree(pages); - pages = NULL; - total_compressed = 0; - nr_pages = 0; - - /* flag the file so we don't compress in the future */ - if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && - !(inode->prop_compress)) { - inode->flags |= BTRFS_INODE_NOCOMPRESS; - } - } -cleanup_and_bail_uncompressed: - /* - * No compression, but we still need to write the pages in the file - * we've been given so far. redirty the locked page if it corresponds - * to our extent and set things up for the async work queue to run - * cow_file_range to do the normal delalloc dance. 
- */ - if (async_chunk->locked_page && - (page_offset(async_chunk->locked_page) >= start && - page_offset(async_chunk->locked_page)) <= end) { - __set_page_dirty_nobuffers(async_chunk->locked_page); - /* unlocked later on in the async handlers */ } - - if (redirty) - extent_range_redirty_for_io(&inode->vfs_inode, start, end); - add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, - BTRFS_COMPRESS_NONE); - compressed_extents++; - - return compressed_extents; } static void free_async_extent_pages(struct async_extent *async_extent) @@ -1124,14 +1058,12 @@ static void free_async_extent_pages(struct async_extent *async_extent) async_extent->pages = NULL; } -static int submit_uncompressed_range(struct btrfs_inode *inode, - struct async_extent *async_extent, - struct page *locked_page) +static void submit_uncompressed_range(struct btrfs_inode *inode, + struct async_extent *async_extent, + struct page *locked_page) { u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; - unsigned long nr_written = 0; - int page_started = 0; int ret; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, @@ -1140,45 +1072,33 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, .no_cgroup_owner = 1, }; - /* - * Call cow_file_range() to run the delalloc range directly, since we - * won't go to NOCOW or async path again. - * - * Also we call cow_file_range() with @unlock_page == 0, so that we - * can directly submit them without interruption. 
- */ - ret = cow_file_range(inode, locked_page, start, end, &page_started, - &nr_written, 0, NULL); - /* Inline extent inserted, page gets unlocked and everything is done */ - if (page_started) - return 0; - + wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); + ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false); + wbc_detach_inode(&wbc); if (ret < 0) { btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); if (locked_page) { const u64 page_start = page_offset(locked_page); - const u64 page_end = page_start + PAGE_SIZE - 1; set_page_writeback(locked_page); end_page_writeback(locked_page); - end_extent_writepage(locked_page, ret, page_start, page_end); + btrfs_mark_ordered_io_finished(inode, locked_page, + page_start, PAGE_SIZE, + !ret); + btrfs_page_clear_uptodate(inode->root->fs_info, + locked_page, page_start, + PAGE_SIZE); + mapping_set_error(locked_page->mapping, ret); unlock_page(locked_page); } - return ret; } - - /* All pages will be unlocked, including @locked_page */ - wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); - ret = extent_write_locked_range(&inode->vfs_inode, start, end, &wbc); - wbc_detach_inode(&wbc); - return ret; } -static int submit_one_async_extent(struct btrfs_inode *inode, - struct async_chunk *async_chunk, - struct async_extent *async_extent, - u64 *alloc_hint) +static void submit_one_async_extent(struct async_chunk *async_chunk, + struct async_extent *async_extent, + u64 *alloc_hint) { + struct btrfs_inode *inode = async_chunk->inode; struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1206,9 +1126,8 @@ static int submit_one_async_extent(struct btrfs_inode *inode, } lock_extent(io_tree, start, end, NULL); - /* We have fall back to uncompressed write */ - if (!async_extent->pages) { - ret = submit_uncompressed_range(inode, async_extent, locked_page); + if (async_extent->compress_type == BTRFS_COMPRESS_NONE) 
{ + submit_uncompressed_range(inode, async_extent, locked_page); goto done; } @@ -1217,7 +1136,6 @@ static int submit_one_async_extent(struct btrfs_inode *inode, async_extent->compressed_size, 0, *alloc_hint, &ins, 1, 1); if (ret) { - free_async_extent_pages(async_extent); /* * Here we used to try again by going back to non-compressed * path for ENOSPC. But we can't reserve space even for @@ -1272,7 +1190,7 @@ done: if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); kfree(async_extent); - return ret; + return; out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -1286,39 +1204,13 @@ out_free: PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); free_async_extent_pages(async_extent); - goto done; -} - -/* - * Phase two of compressed writeback. This is the ordered portion of the code, - * which only gets called in the order the work was queued. We walk all the - * async extents created by compress_file_range and send them down to the disk. - */ -static noinline void submit_compressed_extents(struct async_chunk *async_chunk) -{ - struct btrfs_inode *inode = async_chunk->inode; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct async_extent *async_extent; - u64 alloc_hint = 0; - int ret = 0; - - while (!list_empty(&async_chunk->extents)) { - u64 extent_start; - u64 ram_size; - - async_extent = list_entry(async_chunk->extents.next, - struct async_extent, list); - list_del(&async_extent->list); - extent_start = async_extent->start; - ram_size = async_extent->ram_size; - - ret = submit_one_async_extent(inode, async_chunk, async_extent, - &alloc_hint); - btrfs_debug(fs_info, + if (async_chunk->blkcg_css) + kthread_associate_blkcg(NULL); + btrfs_debug(fs_info, "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", - inode->root->root_key.objectid, - btrfs_ino(inode), extent_start, ram_size, ret); - } + root->root_key.objectid, btrfs_ino(inode), start, + async_extent->ram_size, ret); + 
kfree(async_extent); } static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, @@ -1362,25 +1254,18 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * locked_page is the page that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. * - * *page_started is set to one if we unlock locked_page and do everything - * required to start IO on it. It may be clean and already done with - * IO when we return. - * - * When unlock == 1, we unlock the pages in successfully allocated regions. - * When unlock == 0, we leave them locked for writing them out. + * When this function fails, it unlocks all pages except @locked_page. * - * However, we unlock all the pages except @locked_page in case of failure. + * When this function successfully creates an inline extent, it returns 1 and + * unlocks all pages including locked_page and starts I/O on them. + * (In reality inline extents are limited to a single page, so locked_page is + * the only page handled anyway). * - * In summary, page locking state will be as follow: + * When this function succeeds and creates a normal extent, the page locking + * status depends on the passed-in flags: * - * - page_started == 1 (return value) - * - All the pages are unlocked. IO is started. - * - Note that this can happen only on success - * - unlock == 1 - * - All the pages except @locked_page are unlocked in any case - * - unlock == 0 - * - On success, all the pages are locked for writing out them - * - On failure, all the pages except @locked_page are unlocked + * - If @keep_locked is set, all pages are kept locked. + * - Else all pages except for @locked_page are unlocked. * * When a failure happens in the second or later iteration of the * while-loop, the ordered extents created in previous iterations are kept @@ -1389,10 +1274,9 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * example. 
*/ static noinline int cow_file_range(struct btrfs_inode *inode, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock, - u64 *done_offset) + struct page *locked_page, u64 start, u64 end, + u64 *done_offset, + bool keep_locked, bool no_inline) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1431,7 +1315,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * This means we can trigger inline extent even if we didn't want to. * So here we skip inline extent creation completely. */ - if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { + if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) { u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode), end + 1); @@ -1451,9 +1335,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); - *nr_written = *nr_written + - (end - start + PAGE_SIZE) / PAGE_SIZE; - *page_started = 1; /* * locked_page is locked by the caller of * writepage_delalloc(), not locked by @@ -1463,11 +1344,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * as it doesn't have any subpage::writers recorded. * * Here we manually unlock the page, since the caller - * can't use page_started to determine if it's an - * inline extent or a compressed extent. + * can't determine if it's an inline extent or a + * compressed extent. 
*/ unlock_page(locked_page); - goto out; + ret = 1; + goto done; } else if (ret < 0) { goto out_unlock; } @@ -1498,6 +1380,31 @@ static noinline int cow_file_range(struct btrfs_inode *inode, ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, min_alloc_size, 0, alloc_hint, &ins, 1, 1); + if (ret == -EAGAIN) { + /* + * btrfs_reserve_extent only returns -EAGAIN for zoned + * file systems, which is an indication that there are + * no active zones to allocate from at the moment. + * + * If this is the first loop iteration, wait for at + * least one zone to finish before retrying the + * allocation. Otherwise ask the caller to write out + * the already allocated blocks before coming back to + * us, or return -ENOSPC if it can't handle retries. + */ + ASSERT(btrfs_is_zoned(fs_info)); + if (start == orig_start) { + wait_on_bit_io(&inode->root->fs_info->flags, + BTRFS_FS_NEED_ZONE_FINISH, + TASK_UNINTERRUPTIBLE); + continue; + } + if (done_offset) { + *done_offset = start - 1; + return 0; + } + ret = -ENOSPC; + } if (ret < 0) goto out_unlock; cur_alloc_size = ins.offset; @@ -1558,7 +1465,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * Do set the Ordered (Private2) bit so we know this page was * properly setup for writepage. */ - page_ops = unlock ? PAGE_UNLOCK : 0; + page_ops = (keep_locked ? 0 : PAGE_UNLOCK); page_ops |= PAGE_SET_ORDERED; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, @@ -1581,7 +1488,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_unlock; } -out: +done: + if (done_offset) + *done_offset = end; return ret; out_drop_extent_cache: @@ -1591,21 +1500,6 @@ out_reserve: btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: /* - * If done_offset is non-NULL and ret == -EAGAIN, we expect the - * caller to write out the successfully allocated region and retry. 
- */ - if (done_offset && ret == -EAGAIN) { - if (orig_start < start) - *done_offset = start - 1; - else - *done_offset = start; - return ret; - } else if (ret == -EAGAIN) { - /* Convert to -ENOSPC since the caller cannot retry. */ - ret = -ENOSPC; - } - - /* * Now, we have three regions to clean up: * * |-------(1)----|---(2)---|-------------(3)----------| @@ -1627,10 +1521,10 @@ out_unlock: * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup * function. * - * However, in case of unlock == 0, we still need to unlock the pages + * However, in case of @keep_locked, we still need to unlock the pages * (except @locked_page) to ensure all the pages are unlocked. */ - if (!unlock && orig_start < start) { + if (keep_locked && orig_start < start) { if (!locked_page) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, @@ -1654,8 +1548,6 @@ out_unlock: clear_bits, page_ops); start += cur_alloc_size; - if (start >= end) - return ret; } /* @@ -1664,50 +1556,37 @@ out_unlock: * space_info's bytes_may_use counter, reserved in * btrfs_check_data_free_space(). */ - extent_clear_unlock_delalloc(inode, start, end, locked_page, - clear_bits | EXTENT_CLEAR_DATA_RESV, - page_ops); - return ret; -} - -/* - * work queue call back to started compression on a file and pages - */ -static noinline void async_cow_start(struct btrfs_work *work) -{ - struct async_chunk *async_chunk; - int compressed_extents; - - async_chunk = container_of(work, struct async_chunk, work); - - compressed_extents = compress_file_range(async_chunk); - if (compressed_extents == 0) { - btrfs_add_delayed_iput(async_chunk->inode); - async_chunk->inode = NULL; + if (start < end) { + clear_bits |= EXTENT_CLEAR_DATA_RESV; + extent_clear_unlock_delalloc(inode, start, end, locked_page, + clear_bits, page_ops); } + return ret; } /* - * work queue call back to submit previously compressed pages + * Phase two of compressed writeback. 
This is the ordered portion of the code, + * which only gets called in the order the work was queued. We walk all the + * async extents created by compress_file_range and send them down to the disk. */ -static noinline void async_cow_submit(struct btrfs_work *work) +static noinline void submit_compressed_extents(struct btrfs_work *work) { struct async_chunk *async_chunk = container_of(work, struct async_chunk, work); struct btrfs_fs_info *fs_info = btrfs_work_owner(work); + struct async_extent *async_extent; unsigned long nr_pages; + u64 alloc_hint = 0; nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> PAGE_SHIFT; - /* - * ->inode could be NULL if async_chunk_start has failed to compress, - * in which case we don't have anything to submit, yet we need to - * always adjust ->async_delalloc_pages as its paired with the init - * happening in run_delalloc_compressed - */ - if (async_chunk->inode) - submit_compressed_extents(async_chunk); + while (!list_empty(&async_chunk->extents)) { + async_extent = list_entry(async_chunk->extents.next, + struct async_extent, list); + list_del(&async_extent->list); + submit_one_async_extent(async_chunk, async_extent, &alloc_hint); + } /* atomic_sub_return implies a barrier */ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < @@ -1721,8 +1600,7 @@ static noinline void async_cow_free(struct btrfs_work *work) struct async_cow *async_cow; async_chunk = container_of(work, struct async_chunk, work); - if (async_chunk->inode) - btrfs_add_delayed_iput(async_chunk->inode); + btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); @@ -1732,10 +1610,8 @@ static noinline void async_cow_free(struct btrfs_work *work) } static bool run_delalloc_compressed(struct btrfs_inode *inode, - struct writeback_control *wbc, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written) + struct page *locked_page, u64 start, + u64 end, struct 
writeback_control *wbc) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); @@ -1809,65 +1685,42 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, async_chunk[i].blkcg_css = NULL; } - btrfs_init_work(&async_chunk[i].work, async_cow_start, - async_cow_submit, async_cow_free); + btrfs_init_work(&async_chunk[i].work, compress_file_range, + submit_compressed_extents, async_cow_free); nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE); atomic_add(nr_pages, &fs_info->async_delalloc_pages); btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work); - *nr_written += nr_pages; start = cur_end + 1; } - *page_started = 1; return true; } -static noinline int run_delalloc_zoned(struct btrfs_inode *inode, - struct page *locked_page, u64 start, - u64 end, int *page_started, - unsigned long *nr_written, - struct writeback_control *wbc) +/* + * Run the delalloc range from start to end, and write back any dirty pages + * covered by the range. 
+ */ +static noinline int run_delalloc_cow(struct btrfs_inode *inode, + struct page *locked_page, u64 start, + u64 end, struct writeback_control *wbc, + bool pages_dirty) { u64 done_offset = end; int ret; - bool locked_page_done = false; while (start <= end) { - ret = cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 0, &done_offset); - if (ret && ret != -EAGAIN) + ret = cow_file_range(inode, locked_page, start, end, &done_offset, + true, false); + if (ret) return ret; - - if (*page_started) { - ASSERT(ret == 0); - return 0; - } - - if (ret == 0) - done_offset = end; - - if (done_offset == start) { - wait_on_bit_io(&inode->root->fs_info->flags, - BTRFS_FS_NEED_ZONE_FINISH, - TASK_UNINTERRUPTIBLE); - continue; - } - - if (!locked_page_done) { - __set_page_dirty_nobuffers(locked_page); - account_page_redirty(locked_page); - } - locked_page_done = true; - extent_write_locked_range(&inode->vfs_inode, start, done_offset, - wbc); + extent_write_locked_range(&inode->vfs_inode, locked_page, start, + done_offset, wbc, pages_dirty); start = done_offset + 1; } - *page_started = 1; - - return 0; + return 1; } static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, @@ -1894,8 +1747,7 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, } static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, - const u64 start, const u64 end, - int *page_started, unsigned long *nr_written) + const u64 start, const u64 end) { const bool is_space_ino = btrfs_is_free_space_inode(inode); const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); @@ -1903,6 +1755,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, struct extent_io_tree *io_tree = &inode->io_tree; u64 range_start = start; u64 count; + int ret; /* * If EXTENT_NORESERVE is set it means that when the buffered write was @@ -1955,8 +1808,14 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page 
*locked_page, NULL); } - return cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 1, NULL); + /* + * Don't try to create inline extents, as a mix of inline extent that + * is written out and unlocked directly and a normal NOCOW extent + * doesn't work. + */ + ret = cow_file_range(inode, locked_page, start, end, NULL, false, true); + ASSERT(ret != 1); + return ret; } struct can_nocow_file_extent_args { @@ -2105,9 +1964,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, */ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct page *locked_page, - const u64 start, const u64 end, - int *page_started, - unsigned long *nr_written) + const u64 start, const u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; @@ -2117,25 +1974,26 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, int ret; bool check_prev = true; u64 ino = btrfs_ino(inode); - struct btrfs_block_group *bg; - bool nocow = false; struct can_nocow_file_extent_args nocow_args = { 0 }; + /* + * Normally on a zoned device we're only doing COW writes, but + * relocation on a zoned filesystem serializes I/O so that we're only + * writing sequentially, and can end up here as well. 
+ */ + ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root)); + path = btrfs_alloc_path(); if (!path) { - extent_clear_unlock_delalloc(inode, start, end, locked_page, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, PAGE_UNLOCK | - PAGE_START_WRITEBACK | - PAGE_END_WRITEBACK); - return -ENOMEM; + ret = -ENOMEM; + goto error; } nocow_args.end = end; nocow_args.writeback_path = true; while (1) { + struct btrfs_block_group *nocow_bg = NULL; struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; @@ -2146,8 +2004,6 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, int extent_type; bool is_prealloc; - nocow = false; - ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); if (ret < 0) @@ -2172,11 +2028,8 @@ next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); - if (ret < 0) { - if (cow_start != (u64)-1) - cur_offset = cow_start; + if (ret < 0) goto error; - } if (ret > 0) break; leaf = path->nodes[0]; @@ -2209,7 +2062,7 @@ next_slot: if (found_key.offset > cur_offset) { extent_end = found_key.offset; extent_type = 0; - goto out_check; + goto must_cow; } /* @@ -2239,24 +2092,22 @@ next_slot: nocow_args.start = cur_offset; ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args); - if (ret < 0) { - if (cow_start != (u64)-1) - cur_offset = cow_start; + if (ret < 0) goto error; - } else if (ret == 0) { - goto out_check; - } + if (ret == 0) + goto must_cow; ret = 0; - bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); - if (bg) - nocow = true; -out_check: - /* - * If nocow is false then record the beginning of the range - * that needs to be COWed - */ - if (!nocow) { + nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); + if (!nocow_bg) { +must_cow: + /* + * If we can't perform NOCOW writeback for the range, + * then record the beginning of the range 
that needs to + * be COWed. It will be written out before the next + * NOCOW range if we find one, or when exiting this + * loop. + */ if (cow_start == (u64)-1) cow_start = cur_offset; cur_offset = extent_end; @@ -2275,11 +2126,12 @@ out_check: */ if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_page, - cow_start, found_key.offset - 1, - page_started, nr_written); - if (ret) - goto error; + cow_start, found_key.offset - 1); cow_start = (u64)-1; + if (ret) { + btrfs_dec_nocow_writers(nocow_bg); + goto error; + } } nocow_end = cur_offset + nocow_args.num_bytes - 1; @@ -2296,6 +2148,7 @@ out_check: ram_bytes, BTRFS_COMPRESS_NONE, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { + btrfs_dec_nocow_writers(nocow_bg); ret = PTR_ERR(em); goto error; } @@ -2309,6 +2162,7 @@ out_check: ? (1 << BTRFS_ORDERED_PREALLOC) : (1 << BTRFS_ORDERED_NOCOW), BTRFS_COMPRESS_NONE); + btrfs_dec_nocow_writers(nocow_bg); if (IS_ERR(ordered)) { if (is_prealloc) { btrfs_drop_extent_map_range(inode, cur_offset, @@ -2318,11 +2172,6 @@ out_check: goto error; } - if (nocow) { - btrfs_dec_nocow_writers(bg); - nocow = false; - } - if (btrfs_is_data_reloc_root(root)) /* * Error handled later, as we must prevent @@ -2357,17 +2206,24 @@ out_check: if (cow_start != (u64)-1) { cur_offset = end; - ret = fallback_to_cow(inode, locked_page, cow_start, end, - page_started, nr_written); + ret = fallback_to_cow(inode, locked_page, cow_start, end); + cow_start = (u64)-1; if (ret) goto error; } -error: - if (nocow) - btrfs_dec_nocow_writers(bg); + btrfs_free_path(path); + return 0; - if (ret && cur_offset < end) +error: + /* + * If an error happened while a COW region is outstanding, cur_offset + * needs to be reset to cow_start to ensure the COW region is unlocked + * as well. 
+ */ + if (cow_start != (u64)-1) + cur_offset = cow_start; + if (cur_offset < end) extent_clear_unlock_delalloc(inode, cur_offset, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | @@ -2395,49 +2251,37 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) * being touched for the first time. */ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, unsigned long *nr_written, - struct writeback_control *wbc) + u64 start, u64 end, struct writeback_control *wbc) { - int ret = 0; const bool zoned = btrfs_is_zoned(inode->root->fs_info); + int ret; /* - * The range must cover part of the @locked_page, or the returned - * @page_started can confuse the caller. + * The range must cover part of the @locked_page, or a return of 1 + * can confuse the caller. */ ASSERT(!(end <= page_offset(locked_page) || start >= page_offset(locked_page) + PAGE_SIZE)); if (should_nocow(inode, start, end)) { - /* - * Normally on a zoned device we're only doing COW writes, but - * in case of relocation on a zoned filesystem we have taken - * precaution, that we're only writing sequentially. It's safe - * to use run_delalloc_nocow() here, like for regular - * preallocated inodes. 
- */ - ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root)); - ret = run_delalloc_nocow(inode, locked_page, start, end, - page_started, nr_written); + ret = run_delalloc_nocow(inode, locked_page, start, end); goto out; } if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && - run_delalloc_compressed(inode, wbc, locked_page, start, - end, page_started, nr_written)) - goto out; + run_delalloc_compressed(inode, locked_page, start, end, wbc)) + return 1; if (zoned) - ret = run_delalloc_zoned(inode, locked_page, start, end, - page_started, nr_written, wbc); + ret = run_delalloc_cow(inode, locked_page, start, end, wbc, + true); else - ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1, NULL); + ret = cow_file_range(inode, locked_page, start, end, NULL, + false, false); out: - ASSERT(ret <= 0); - if (ret) + if (ret < 0) btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); return ret; @@ -2840,23 +2684,19 @@ struct btrfs_writepage_fixup { static void btrfs_writepage_fixup_worker(struct btrfs_work *work) { - struct btrfs_writepage_fixup *fixup; + struct btrfs_writepage_fixup *fixup = + container_of(work, struct btrfs_writepage_fixup, work); struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - struct page *page; - struct btrfs_inode *inode; - u64 page_start; - u64 page_end; + struct page *page = fixup->page; + struct btrfs_inode *inode = fixup->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u64 page_start = page_offset(page); + u64 page_end = page_offset(page) + PAGE_SIZE - 1; int ret = 0; bool free_delalloc_space = true; - fixup = container_of(work, struct btrfs_writepage_fixup, work); - page = fixup->page; - inode = fixup->inode; - page_start = page_offset(page); - page_end = page_offset(page) + PAGE_SIZE - 1; - /* * This is similar to page_mkwrite, we need to reserve the space before * we take the 
page lock. @@ -2949,10 +2789,12 @@ out_page: * to reflect the errors and clean the page. */ mapping_set_error(page->mapping, ret); - end_extent_writepage(page, ret, page_start, page_end); + btrfs_mark_ordered_io_finished(inode, page, page_start, + PAGE_SIZE, !ret); + btrfs_page_clear_uptodate(fs_info, page, page_start, PAGE_SIZE); clear_page_dirty_for_io(page); } - btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE); + btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); unlock_page(page); put_page(page); kfree(fixup); @@ -3359,6 +3201,13 @@ out: btrfs_free_reserved_extent(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes, 1); + /* + * Actually free the qgroup rsv which was released when + * the ordered extent was created. + */ + btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid, + ordered_extent->qgroup_rsv, + BTRFS_QGROUP_RSV_DATA); } } @@ -3384,15 +3233,6 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) return btrfs_finish_one_ordered(ordered); } -void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, - struct page *page, u64 start, - u64 end, bool uptodate) -{ - trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); - - btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate); -} - /* * Verify the checksum for a single sector without any extra action that depend * on the type of I/O. @@ -3482,15 +3322,21 @@ zeroit: void btrfs_add_delayed_iput(struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = inode->root->fs_info; + unsigned long flags; if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) return; atomic_inc(&fs_info->nr_delayed_iputs); - spin_lock(&fs_info->delayed_iput_lock); + /* + * Need to be irq safe here because we can be called from either an irq + * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq + * context. 
+ */ + spin_lock_irqsave(&fs_info->delayed_iput_lock, flags); ASSERT(list_empty(&inode->delayed_iput)); list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs); - spin_unlock(&fs_info->delayed_iput_lock); + spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags); if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) wake_up_process(fs_info->cleaner_kthread); } @@ -3499,37 +3345,46 @@ static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { list_del_init(&inode->delayed_iput); - spin_unlock(&fs_info->delayed_iput_lock); + spin_unlock_irq(&fs_info->delayed_iput_lock); iput(&inode->vfs_inode); if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) wake_up(&fs_info->delayed_iputs_wait); - spin_lock(&fs_info->delayed_iput_lock); + spin_lock_irq(&fs_info->delayed_iput_lock); } static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { if (!list_empty(&inode->delayed_iput)) { - spin_lock(&fs_info->delayed_iput_lock); + spin_lock_irq(&fs_info->delayed_iput_lock); if (!list_empty(&inode->delayed_iput)) run_delayed_iput_locked(fs_info, inode); - spin_unlock(&fs_info->delayed_iput_lock); + spin_unlock_irq(&fs_info->delayed_iput_lock); } } void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) { - - spin_lock(&fs_info->delayed_iput_lock); + /* + * btrfs_put_ordered_extent() can run in irq context (see bio.c), which + * calls btrfs_add_delayed_iput() and that needs to lock + * fs_info->delayed_iput_lock. So we need to disable irqs here to + * prevent a deadlock. 
+ */ + spin_lock_irq(&fs_info->delayed_iput_lock); while (!list_empty(&fs_info->delayed_iputs)) { struct btrfs_inode *inode; inode = list_first_entry(&fs_info->delayed_iputs, struct btrfs_inode, delayed_iput); run_delayed_iput_locked(fs_info, inode); - cond_resched_lock(&fs_info->delayed_iput_lock); + if (need_resched()) { + spin_unlock_irq(&fs_info->delayed_iput_lock); + cond_resched(); + spin_lock_irq(&fs_info->delayed_iput_lock); + } } - spin_unlock(&fs_info->delayed_iput_lock); + spin_unlock_irq(&fs_info->delayed_iput_lock); } /* @@ -3647,9 +3502,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) */ if (found_key.offset == last_objectid) { + /* + * We found the same inode as before. This means we were + * not able to remove its items via eviction triggered + * by an iput(). A transaction abort may have happened, + * due to -ENOSPC for example, so try to grab the error + * that lead to a transaction abort, if any. + */ btrfs_err(fs_info, "Error removing orphan entry, stopping orphan cleanup"); - ret = -EINVAL; + ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL; goto out; } @@ -3659,11 +3521,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(fs_info->sb, last_objectid, root); - ret = PTR_ERR_OR_ZERO(inode); - if (ret && ret != -ENOENT) - goto out; + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; + if (ret != -ENOENT) + goto out; + } - if (ret == -ENOENT && root == fs_info->tree_root) { + if (!inode && root == fs_info->tree_root) { struct btrfs_root *dead_root; int is_dead_root = 0; @@ -3724,17 +3589,17 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * deleted but wasn't. The inode number may have been reused, * but either way, we can delete the orphan item. 
*/ - if (ret == -ENOENT || inode->i_nlink) { - if (!ret) { + if (!inode || inode->i_nlink) { + if (inode) { ret = btrfs_drop_verity_items(BTRFS_I(inode)); iput(inode); + inode = NULL; if (ret) goto out; } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - iput(inode); goto out; } btrfs_debug(fs_info, "auto deleting %Lu", @@ -3742,10 +3607,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = btrfs_del_orphan_item(trans, root, found_key.objectid); btrfs_end_transaction(trans); - if (ret) { - iput(inode); + if (ret) goto out; - } continue; } @@ -3901,8 +3764,8 @@ static int btrfs_read_locked_inode(struct inode *inode, inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime); inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime); - inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime); - inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime); + inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime), + btrfs_timespec_nsec(leaf, &inode_item->ctime)); BTRFS_I(inode)->i_otime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->otime); @@ -4073,9 +3936,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, inode->i_mtime.tv_nsec); btrfs_set_token_timespec_sec(&token, &item->ctime, - inode->i_ctime.tv_sec); + inode_get_ctime(inode).tv_sec); btrfs_set_token_timespec_nsec(&token, &item->ctime, - inode->i_ctime.tv_nsec); + inode_get_ctime(inode).tv_nsec); btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime.tv_sec); @@ -4273,9 +4136,8 @@ err: btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); - inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); - dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime; - dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime; + inode_set_ctime_current(&inode->vfs_inode); + dir->vfs_inode.i_mtime = 
inode_set_ctime_current(&dir->vfs_inode); ret = btrfs_update_inode(trans, root, dir); out: return ret; @@ -4448,8 +4310,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2); inode_inc_iversion(&dir->vfs_inode); - dir->vfs_inode.i_mtime = current_time(&dir->vfs_inode); - dir->vfs_inode.i_ctime = dir->vfs_inode.i_mtime; + dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode); ret = btrfs_update_inode_fallback(trans, root, dir); if (ret) btrfs_abort_transaction(trans, ret); @@ -4847,9 +4708,6 @@ again: ret = -ENOMEM; goto out; } - ret = set_page_extent_mapped(page); - if (ret < 0) - goto out_unlock; if (!PageUptodate(page)) { ret = btrfs_read_folio(NULL, page_folio(page)); @@ -4864,6 +4722,17 @@ again: goto out_unlock; } } + + /* + * We unlock the page after the io is completed and then re-lock it + * above. release_folio() could have come in between that and cleared + * PagePrivate(), but left the page in the mapping. Set the page mapped + * here to make sure it's properly set for the subpage stuff. 
+ */ + ret = set_page_extent_mapped(page); + if (ret < 0) + goto out_unlock; + wait_on_page_writeback(page); lock_extent(io_tree, block_start, block_end, &cached_state); @@ -5091,8 +4960,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) if (newsize != oldsize) { inode_inc_iversion(inode); if (!(mask & (ATTR_CTIME | ATTR_MTIME))) { - inode->i_mtime = current_time(inode); - inode->i_ctime = inode->i_mtime; + inode->i_mtime = inode_set_ctime_current(inode); } } @@ -5714,11 +5582,11 @@ struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root return btrfs_iget_path(s, ino, root, NULL); } -static struct inode *new_simple_dir(struct super_block *s, +static struct inode *new_simple_dir(struct inode *dir, struct btrfs_key *key, struct btrfs_root *root) { - struct inode *inode = new_inode(s); + struct inode *inode = new_inode(dir->i_sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -5736,10 +5604,11 @@ static struct inode *new_simple_dir(struct super_block *s, inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - inode->i_mtime = current_time(inode); - inode->i_atime = inode->i_mtime; - inode->i_ctime = inode->i_mtime; + inode->i_mtime = inode_set_ctime_current(inode); + inode->i_atime = dir->i_atime; BTRFS_I(inode)->i_otime = inode->i_mtime; + inode->i_uid = dir->i_uid; + inode->i_gid = dir->i_gid; return inode; } @@ -5798,7 +5667,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (ret != -ENOENT) inode = ERR_PTR(ret); else - inode = new_simple_dir(dir->i_sb, &location, root); + inode = new_simple_dir(dir, &location, root); } else { inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); btrfs_put_root(sub_root); @@ -5849,6 +5718,74 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, } /* + * Find the highest existing sequence number in a directory and then set the + * in-memory index_cnt variable 
to the first free sequence number. + */ +static int btrfs_set_inode_index_count(struct btrfs_inode *inode) +{ + struct btrfs_root *root = inode->root; + struct btrfs_key key, found_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + /* FIXME: we should be able to handle this */ + if (ret == 0) + goto out; + ret = 0; + + if (path->slots[0] == 0) { + inode->index_cnt = BTRFS_DIR_START_INDEX; + goto out; + } + + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != btrfs_ino(inode) || + found_key.type != BTRFS_DIR_INDEX_KEY) { + inode->index_cnt = BTRFS_DIR_START_INDEX; + goto out; + } + + inode->index_cnt = found_key.offset + 1; +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index) +{ + if (dir->index_cnt == (u64)-1) { + int ret; + + ret = btrfs_inode_delayed_dir_index_count(dir); + if (ret) { + ret = btrfs_set_inode_index_count(dir); + if (ret) + return ret; + } + } + + *index = dir->index_cnt; + + return 0; +} + +/* * All this infrastructure exists because dir_emit can fault, and we are holding * the tree lock when doing readdir. For now just allocate a buffer and copy * our information into that, and then dir_emit from the buffer. 
This is @@ -5860,10 +5797,17 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, static int btrfs_opendir(struct inode *inode, struct file *file) { struct btrfs_file_private *private; + u64 last_index; + int ret; + + ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index); + if (ret) + return ret; private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL); if (!private) return -ENOMEM; + private->last_index = last_index; private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!private->filldir_buf) { kfree(private); @@ -5908,8 +5852,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) struct btrfs_key found_key; struct btrfs_path *path; void *addr; - struct list_head ins_list; - struct list_head del_list; + LIST_HEAD(ins_list); + LIST_HEAD(del_list); int ret; char *name_ptr; int name_len; @@ -5928,9 +5872,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) addr = private->filldir_buf; path->reada = READA_FORWARD; - INIT_LIST_HEAD(&ins_list); - INIT_LIST_HEAD(&del_list); - put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list); + put = btrfs_readdir_get_delayed_items(inode, private->last_index, + &ins_list, &del_list); again: key.type = BTRFS_DIR_INDEX_KEY; @@ -5948,6 +5891,8 @@ again: break; if (found_key.offset < ctx->pos) continue; + if (found_key.offset > private->last_index) + break; if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) continue; di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); @@ -6063,8 +6008,7 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode) * This is a copy of file_update_time. We need this so we can return error on * ENOSPC for updating the inode in the case of file write and mmap writes. 
*/ -static int btrfs_update_time(struct inode *inode, struct timespec64 *now, - int flags) +static int btrfs_update_time(struct inode *inode, int flags) { struct btrfs_root *root = BTRFS_I(inode)->root; bool dirty = flags & ~S_VERSION; @@ -6072,69 +6016,11 @@ static int btrfs_update_time(struct inode *inode, struct timespec64 *now, if (btrfs_root_readonly(root)) return -EROFS; - if (flags & S_VERSION) - dirty |= inode_maybe_inc_iversion(inode, dirty); - if (flags & S_CTIME) - inode->i_ctime = *now; - if (flags & S_MTIME) - inode->i_mtime = *now; - if (flags & S_ATIME) - inode->i_atime = *now; + dirty = inode_update_timestamps(inode, flags); return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0; } /* - * find the highest existing sequence number in a directory - * and then set the in-memory index_cnt variable to reflect - * free sequence numbers - */ -static int btrfs_set_inode_index_count(struct btrfs_inode *inode) -{ - struct btrfs_root *root = inode->root; - struct btrfs_key key, found_key; - struct btrfs_path *path; - struct extent_buffer *leaf; - int ret; - - key.objectid = btrfs_ino(inode); - key.type = BTRFS_DIR_INDEX_KEY; - key.offset = (u64)-1; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - /* FIXME: we should be able to handle this */ - if (ret == 0) - goto out; - ret = 0; - - if (path->slots[0] == 0) { - inode->index_cnt = BTRFS_DIR_START_INDEX; - goto out; - } - - path->slots[0]--; - - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - if (found_key.objectid != btrfs_ino(inode) || - found_key.type != BTRFS_DIR_INDEX_KEY) { - inode->index_cnt = BTRFS_DIR_START_INDEX; - goto out; - } - - inode->index_cnt = found_key.offset + 1; -out: - btrfs_free_path(path); - return ret; -} - -/* * helper to find a free sequence number in a given directory. 
This current * code is very simple, later versions will do smarter things in the btree */ @@ -6378,9 +6264,8 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, goto discard; } - inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); inode->i_atime = inode->i_mtime; - inode->i_ctime = inode->i_mtime; BTRFS_I(inode)->i_otime = inode->i_mtime; /* @@ -6545,12 +6430,10 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, * log replay procedure is responsible for setting them to their correct * values (the ones it had when the fsync was done). */ - if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) { - struct timespec64 now = current_time(&parent_inode->vfs_inode); + if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) + parent_inode->vfs_inode.i_mtime = + inode_set_ctime_current(&parent_inode->vfs_inode); - parent_inode->vfs_inode.i_mtime = now; - parent_inode->vfs_inode.i_ctime = now; - } ret = btrfs_update_inode(trans, root, parent_inode); if (ret) btrfs_abort_transaction(trans, ret); @@ -6690,7 +6573,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, BTRFS_I(inode)->dir_index = 0ULL; inc_nlink(inode); inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); @@ -7849,8 +7732,11 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); if (ret) { - bbio->bio.bi_status = errno_to_blk_status(ret); - btrfs_dio_end_io(bbio); + btrfs_finish_ordered_extent(dio_data->ordered, NULL, + file_offset, dip->bytes, + !ret); + bio->bi_status = errno_to_blk_status(ret); + iomap_dio_bio_end_io(bio); return; } } @@ -8753,7 +8639,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(idmap, inode, stat); + 
generic_fillattr(idmap, request_mask, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; spin_lock(&BTRFS_I(inode)->lock); @@ -8777,7 +8663,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; - struct timespec64 ctime = current_time(old_inode); struct btrfs_rename_ctx old_rename_ctx; struct btrfs_rename_ctx new_rename_ctx; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); @@ -8908,12 +8793,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); inode_inc_iversion(new_inode); - old_dir->i_mtime = ctime; - old_dir->i_ctime = ctime; - new_dir->i_mtime = ctime; - new_dir->i_ctime = ctime; - old_inode->i_ctime = ctime; - new_inode->i_ctime = ctime; + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); if (old_dentry->d_parent != new_dentry->d_parent) { btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), @@ -9177,11 +9057,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); - old_dir->i_mtime = current_time(old_dir); - old_dir->i_ctime = old_dir->i_mtime; - new_dir->i_mtime = old_dir->i_mtime; - new_dir->i_ctime = old_dir->i_mtime; - old_inode->i_ctime = old_dir->i_mtime; + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), @@ -9203,7 +9079,6 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (new_inode) { inode_inc_iversion(new_inode); - new_inode->i_ctime = current_time(new_inode); if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); @@ -9336,14 +9211,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, struct btrfs_inode *binode; 
struct inode *inode; struct btrfs_delalloc_work *work, *next; - struct list_head works; - struct list_head splice; + LIST_HEAD(works); + LIST_HEAD(splice); int ret = 0; bool full_flush = wbc->nr_to_write == LONG_MAX; - INIT_LIST_HEAD(&works); - INIT_LIST_HEAD(&splice); - mutex_lock(&root->delalloc_mutex); spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); @@ -9431,14 +9303,12 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, .range_end = LLONG_MAX, }; struct btrfs_root *root; - struct list_head splice; + LIST_HEAD(splice); int ret; if (BTRFS_FS_ERROR(fs_info)) return -EROFS; - INIT_LIST_HEAD(&splice); - mutex_lock(&fs_info->delalloc_root_mutex); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); @@ -9743,7 +9613,7 @@ next: *alloc_hint = ins.objectid + ins.offset; inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && (actual_len > inode->i_size) && |