diff options
Diffstat (limited to 'fs/jbd2')
-rw-r--r-- | fs/jbd2/checkpoint.c | 206 | ||||
-rw-r--r-- | fs/jbd2/journal.c | 201 | ||||
-rw-r--r-- | fs/jbd2/transaction.c | 17 |
3 files changed, 357 insertions, 67 deletions
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 63b526d44886..746132998c57 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -80,23 +80,15 @@ static inline void __buffer_relink_io(struct journal_head *jh) } /* - * Try to release a checkpointed buffer from its transaction. - * Returns 1 if we released it and 2 if we also released the - * whole transaction. + * Check a checkpoint buffer could be release or not. * * Requires j_list_lock */ -static int __try_to_free_cp_buf(struct journal_head *jh) +static inline bool __cp_buffer_busy(struct journal_head *jh) { - int ret = 0; struct buffer_head *bh = jh2bh(jh); - if (jh->b_transaction == NULL && !buffer_locked(bh) && - !buffer_dirty(bh) && !buffer_write_io_error(bh)) { - JBUFFER_TRACE(jh, "remove from checkpoint list"); - ret = __jbd2_journal_remove_checkpoint(jh) + 1; - } - return ret; + return (jh->b_transaction || buffer_locked(bh) || buffer_dirty(bh)); } /* @@ -228,7 +220,6 @@ int jbd2_log_do_checkpoint(journal_t *journal) * OK, we need to start writing disk blocks. Take one transaction * and write it. */ - result = 0; spin_lock(&journal->j_list_lock); if (!journal->j_checkpoint_transactions) goto out; @@ -295,8 +286,6 @@ restart: goto restart; } if (!buffer_dirty(bh)) { - if (unlikely(buffer_write_io_error(bh)) && !result) - result = -EIO; BUFFER_TRACE(bh, "remove from checkpoint"); if (__jbd2_journal_remove_checkpoint(jh)) /* The transaction was released; we're done */ @@ -356,8 +345,6 @@ restart2: spin_lock(&journal->j_list_lock); goto restart2; } - if (unlikely(buffer_write_io_error(bh)) && !result) - result = -EIO; /* * Now in whatever state the buffer currently is, we @@ -369,10 +356,7 @@ restart2: } out: spin_unlock(&journal->j_list_lock); - if (result < 0) - jbd2_journal_abort(journal, result); - else - result = jbd2_cleanup_journal_tail(journal); + result = jbd2_cleanup_journal_tail(journal); return (result < 0) ? result : 0; } @@ -437,7 +421,6 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) { struct journal_head *last_jh; struct journal_head *next_jh = jh; - int ret; if (!jh) return 0; @@ -446,13 +429,11 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) do { jh = next_jh; next_jh = jh->b_cpnext; - if (!destroy) - ret = __try_to_free_cp_buf(jh); - else - ret = __jbd2_journal_remove_checkpoint(jh) + 1; - if (!ret) + + if (!destroy && __cp_buffer_busy(jh)) return 0; - if (ret == 2) + + if (__jbd2_journal_remove_checkpoint(jh)) return 1; /* * This function only frees up some memory @@ -468,6 +449,137 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) } /* + * journal_shrink_one_cp_list + * + * Find 'nr_to_scan' written-back checkpoint buffers in the given list + * and try to release them. If the whole transaction is released, set + * the 'released' parameter. Return the number of released checkpointed + * buffers. + * + * Called with j_list_lock held. + */ +static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, + unsigned long *nr_to_scan, + bool *released) +{ + struct journal_head *last_jh; + struct journal_head *next_jh = jh; + unsigned long nr_freed = 0; + int ret; + + if (!jh || *nr_to_scan == 0) + return 0; + + last_jh = jh->b_cpprev; + do { + jh = next_jh; + next_jh = jh->b_cpnext; + + (*nr_to_scan)--; + if (__cp_buffer_busy(jh)) + continue; + + nr_freed++; + ret = __jbd2_journal_remove_checkpoint(jh); + if (ret) { + *released = true; + break; + } + + if (need_resched()) + break; + } while (jh != last_jh && *nr_to_scan); + + return nr_freed; +} + +/* + * jbd2_journal_shrink_checkpoint_list + * + * Find 'nr_to_scan' written-back checkpoint buffers in the journal + * and try to release them. Return the number of released checkpointed + * buffers. + * + * Called with j_list_lock held. + */ +unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, + unsigned long *nr_to_scan) +{ + transaction_t *transaction, *last_transaction, *next_transaction; + bool released; + tid_t first_tid = 0, last_tid = 0, next_tid = 0; + tid_t tid = 0; + unsigned long nr_freed = 0; + unsigned long nr_scanned = *nr_to_scan; + +again: + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) { + spin_unlock(&journal->j_list_lock); + goto out; + } + + /* + * Get next shrink transaction, resume previous scan or start + * over again. If some others do checkpoint and drop transaction + * from the checkpoint list, we ignore saved j_shrink_transaction + * and start over unconditionally. + */ + if (journal->j_shrink_transaction) + transaction = journal->j_shrink_transaction; + else + transaction = journal->j_checkpoint_transactions; + + if (!first_tid) + first_tid = transaction->t_tid; + last_transaction = journal->j_checkpoint_transactions->t_cpprev; + next_transaction = transaction; + last_tid = last_transaction->t_tid; + do { + transaction = next_transaction; + next_transaction = transaction->t_cpnext; + tid = transaction->t_tid; + released = false; + + nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_list, + nr_to_scan, &released); + if (*nr_to_scan == 0) + break; + if (need_resched() || spin_needbreak(&journal->j_list_lock)) + break; + if (released) + continue; + + nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_io_list, + nr_to_scan, &released); + if (*nr_to_scan == 0) + break; + if (need_resched() || spin_needbreak(&journal->j_list_lock)) + break; + } while (transaction != last_transaction); + + if (transaction != last_transaction) { + journal->j_shrink_transaction = next_transaction; + next_tid = next_transaction->t_tid; + } else { + journal->j_shrink_transaction = NULL; + next_tid = 0; + } + + spin_unlock(&journal->j_list_lock); + cond_resched(); + + if (*nr_to_scan && next_tid) + goto again; +out: + nr_scanned -= *nr_to_scan; + trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid, + nr_freed, nr_scanned, next_tid); + + return nr_freed; +} + +/* * journal_clean_checkpoint_list * * Find all the written-back checkpoint buffers in the journal and release them. @@ -564,24 +676,37 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) struct transaction_chp_stats_s *stats; transaction_t *transaction; journal_t *journal; - int ret = 0; + struct buffer_head *bh = jh2bh(jh); JBUFFER_TRACE(jh, "entry"); - if ((transaction = jh->b_cp_transaction) == NULL) { + transaction = jh->b_cp_transaction; + if (!transaction) { JBUFFER_TRACE(jh, "not on transaction"); - goto out; + return 0; } journal = transaction->t_journal; JBUFFER_TRACE(jh, "removing from transaction"); + + /* + * If we have failed to write the buffer out to disk, the filesystem + * may become inconsistent. We cannot abort the journal here since + * we hold j_list_lock and we have to be careful about races with + * jbd2_journal_destroy(). So mark the writeback IO error in the + * journal here and we abort the journal later from a better context. + */ + if (buffer_write_io_error(bh)) + set_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags); + __buffer_unlink(jh); jh->b_cp_transaction = NULL; + percpu_counter_dec(&journal->j_checkpoint_jh_count); jbd2_journal_put_journal_head(jh); - if (transaction->t_checkpoint_list != NULL || - transaction->t_checkpoint_io_list != NULL) - goto out; + /* Is this transaction empty? */ + if (transaction->t_checkpoint_list || transaction->t_checkpoint_io_list) + return 0; /* * There is one special case to worry about: if we have just pulled the @@ -593,10 +718,12 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) * See the comment at the end of jbd2_journal_commit_transaction(). */ if (transaction->t_state != T_FINISHED) - goto out; + return 0; - /* OK, that was the last buffer for the transaction: we can now - safely remove this transaction from the log */ + /* + * OK, that was the last buffer for the transaction, we can now + * safely remove this transaction from the log. + */ stats = &transaction->t_chp_stats; if (stats->cs_chp_time) stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time, @@ -606,9 +733,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) __jbd2_journal_drop_transaction(journal, transaction); jbd2_journal_free_transaction(transaction); - ret = 1; -out: - return ret; + return 1; } /* @@ -639,6 +764,7 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh, jh->b_cpnext->b_cpprev = jh; } transaction->t_checkpoint_list = jh; + percpu_counter_inc(&transaction->t_journal->j_checkpoint_jh_count); } /* @@ -654,6 +780,8 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh, void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction) { assert_spin_locked(&journal->j_list_lock); + + journal->j_shrink_transaction = NULL; if (transaction->t_cpnext) { transaction->t_cpnext->t_cpprev = transaction->t_cpprev; transaction->t_cpprev->t_cpnext = transaction->t_cpnext; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2dc944442802..35302bc192eb 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -934,10 +934,6 @@ int jbd2_fc_wait_bufs(journal_t *journal, int num_blks) } EXPORT_SYMBOL(jbd2_fc_wait_bufs); -/* - * Wait on fast commit buffers that were allocated by jbd2_fc_get_buf - * for completion. - */ int jbd2_fc_release_bufs(journal_t *journal) { struct buffer_head *bh; @@ -945,10 +941,6 @@ int jbd2_fc_release_bufs(journal_t *journal) j_fc_off = journal->j_fc_off; - /* - * Wait in reverse order to minimize chances of us being woken up before - * all IOs have completed - */ for (i = j_fc_off - 1; i >= 0; i--) { bh = journal->j_fc_wbuf[i]; if (!bh) @@ -1291,6 +1283,48 @@ static int jbd2_min_tag_size(void) return sizeof(journal_block_tag_t) - 4; } +/** + * jbd2_journal_shrink_scan() + * + * Scan the checkpointed buffer on the checkpoint list and release the + * journal_head. + */ +static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + journal_t *journal = container_of(shrink, journal_t, j_shrinker); + unsigned long nr_to_scan = sc->nr_to_scan; + unsigned long nr_shrunk; + unsigned long count; + + count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); + trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count); + + nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan); + + count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); + trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count); + + return nr_shrunk; +} + +/** + * jbd2_journal_shrink_count() + * + * Count the number of checkpoint buffers on the checkpoint list. + */ +static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + journal_t *journal = container_of(shrink, journal_t, j_shrinker); + unsigned long count; + + count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); + trace_jbd2_shrink_count(journal, sc->nr_to_scan, count); + + return count; +} + /* * Management for journal control blocks: functions to create and * destroy journal_t structures, and to initialise and read existing @@ -1369,9 +1403,23 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_sb_buffer = bh; journal->j_superblock = (journal_superblock_t *)bh->b_data; + journal->j_shrink_transaction = NULL; + journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan; + journal->j_shrinker.count_objects = jbd2_journal_shrink_count; + journal->j_shrinker.seeks = DEFAULT_SEEKS; + journal->j_shrinker.batch = journal->j_max_transaction_buffers; + + if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL)) + goto err_cleanup; + + if (register_shrinker(&journal->j_shrinker)) { + percpu_counter_destroy(&journal->j_checkpoint_jh_count); + goto err_cleanup; + } return journal; err_cleanup: + brelse(journal->j_sb_buffer); kfree(journal->j_wbuf); jbd2_journal_destroy_revoke(journal); kfree(journal); @@ -1618,6 +1666,10 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, if (is_journal_aborted(journal)) return -EIO; + if (test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) { + jbd2_journal_abort(journal, -EIO); + return -EIO; + } BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", @@ -1686,6 +1738,110 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op) write_unlock(&journal->j_state_lock); } +/** + * __jbd2_journal_erase() - Discard or zeroout journal blocks (excluding superblock) + * @journal: The journal to erase. + * @flags: A discard/zeroout request is sent for each physically contigous + * region of the journal. Either JBD2_JOURNAL_FLUSH_DISCARD or + * JBD2_JOURNAL_FLUSH_ZEROOUT must be set to determine which operation + * to perform. + * + * Note: JBD2_JOURNAL_FLUSH_ZEROOUT attempts to use hardware offload. Zeroes + * will be explicitly written if no hardware offload is available, see + * blkdev_issue_zeroout for more details. + */ +static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) +{ + int err = 0; + unsigned long block, log_offset; /* logical */ + unsigned long long phys_block, block_start, block_stop; /* physical */ + loff_t byte_start, byte_stop, byte_count; + struct request_queue *q = bdev_get_queue(journal->j_dev); + + /* flags must be set to either discard or zeroout */ + if ((flags & ~JBD2_JOURNAL_FLUSH_VALID) || !flags || + ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && + (flags & JBD2_JOURNAL_FLUSH_ZEROOUT))) + return -EINVAL; + + if (!q) + return -ENXIO; + + if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !blk_queue_discard(q)) + return -EOPNOTSUPP; + + /* + * lookup block mapping and issue discard/zeroout for each + * contiguous region + */ + log_offset = be32_to_cpu(journal->j_superblock->s_first); + block_start = ~0ULL; + for (block = log_offset; block < journal->j_total_len; block++) { + err = jbd2_journal_bmap(journal, block, &phys_block); + if (err) { + pr_err("JBD2: bad block at offset %lu", block); + return err; + } + + if (block_start == ~0ULL) { + block_start = phys_block; + block_stop = block_start - 1; + } + + /* + * last block not contiguous with current block, + * process last contiguous region and return to this block on + * next loop + */ + if (phys_block != block_stop + 1) { + block--; + } else { + block_stop++; + /* + * if this isn't the last block of journal, + * no need to process now because next block may also + * be part of this contiguous region + */ + if (block != journal->j_total_len - 1) + continue; + } + + /* + * end of contiguous region or this is last block of journal, + * take care of the region + */ + byte_start = block_start * journal->j_blocksize; + byte_stop = block_stop * journal->j_blocksize; + byte_count = (block_stop - block_start + 1) * + journal->j_blocksize; + + truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping, + byte_start, byte_stop); + + if (flags & JBD2_JOURNAL_FLUSH_DISCARD) { + err = blkdev_issue_discard(journal->j_dev, + byte_start >> SECTOR_SHIFT, + byte_count >> SECTOR_SHIFT, + GFP_NOFS, 0); + } else if (flags & JBD2_JOURNAL_FLUSH_ZEROOUT) { + err = blkdev_issue_zeroout(journal->j_dev, + byte_start >> SECTOR_SHIFT, + byte_count >> SECTOR_SHIFT, + GFP_NOFS, 0); + } + + if (unlikely(err != 0)) { + pr_err("JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu", + err, block_start, block_stop); + return err; + } + + /* reset start and stop after processing a region */ + block_start = ~0ULL; + } + + return blkdev_issue_flush(journal->j_dev); +} /** * jbd2_journal_update_sb_errno() - Update error in the journal. @@ -1995,6 +2151,16 @@ int jbd2_journal_destroy(journal_t *journal) J_ASSERT(journal->j_checkpoint_transactions == NULL); spin_unlock(&journal->j_list_lock); + /* + * OK, all checkpoint transactions have been checked, now check the + * write out io error flag and abort the journal if some buffer failed + * to write back to the original location, otherwise the filesystem + * may become inconsistent. + */ + if (!is_journal_aborted(journal) && + test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) + jbd2_journal_abort(journal, -EIO); + if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { mutex_lock_io(&journal->j_checkpoint_mutex); @@ -2012,6 +2178,10 @@ int jbd2_journal_destroy(journal_t *journal) brelse(journal->j_sb_buffer); } + if (journal->j_shrinker.flags & SHRINKER_REGISTERED) { + percpu_counter_destroy(&journal->j_checkpoint_jh_count); + unregister_shrinker(&journal->j_shrinker); + } if (journal->j_proc_entry) jbd2_stats_proc_exit(journal); iput(journal->j_inode); @@ -2246,13 +2416,18 @@ EXPORT_SYMBOL(jbd2_journal_clear_features); /** * jbd2_journal_flush() - Flush journal * @journal: Journal to act on. + * @flags: optional operation on the journal blocks after the flush (see below) * * Flush all data for a given journal to disk and empty the journal. * Filesystems can use this when remounting readonly to ensure that - * recovery does not need to happen on remount. + * recovery does not need to happen on remount. Optionally, a discard or zeroout + * can be issued on the journal blocks after flushing. + * + * flags: + * JBD2_JOURNAL_FLUSH_DISCARD: issues discards for the journal blocks + * JBD2_JOURNAL_FLUSH_ZEROOUT: issues zeroouts for the journal blocks */ - -int jbd2_journal_flush(journal_t *journal) +int jbd2_journal_flush(journal_t *journal, unsigned int flags) { int err = 0; transaction_t *transaction = NULL; @@ -2306,6 +2481,10 @@ int jbd2_journal_flush(journal_t *journal) * commits of data to the journal will restore the current * s_start value. */ jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA); + + if (flags) + err = __jbd2_journal_erase(journal, flags); + mutex_unlock(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); J_ASSERT(!journal->j_running_transaction); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e8fc45fd751f..8804e126805f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2123,7 +2123,6 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page) { struct buffer_head *head; struct buffer_head *bh; - bool has_write_io_error = false; int ret = 0; J_ASSERT(PageLocked(page)); @@ -2148,26 +2147,10 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page) jbd2_journal_put_journal_head(jh); if (buffer_jbd(bh)) goto busy; - - /* - * If we free a metadata buffer which has been failed to - * write out, the jbd2 checkpoint procedure will not detect - * this failure and may lead to filesystem inconsistency - * after cleanup journal tail. - */ - if (buffer_write_io_error(bh)) { - pr_err("JBD2: Error while async write back metadata bh %llu.", - (unsigned long long)bh->b_blocknr); - has_write_io_error = true; - } } while ((bh = bh->b_this_page) != head); ret = try_to_free_buffers(page); - busy: - if (has_write_io_error) - jbd2_journal_abort(journal, -EIO); - return ret; } |