diff options
Diffstat (limited to 'fs')
175 files changed, 13978 insertions, 1822 deletions
diff --git a/fs/befs/befs.h b/fs/befs/befs.h index e0f59263a96d..c6bad51d8ec7 100644 --- a/fs/befs/befs.h +++ b/fs/befs/befs.h @@ -43,7 +43,10 @@ struct befs_sb_info { u32 ag_shift; u32 num_ags; - /* jornal log entry */ + /* State of the superblock */ + u32 flags; + + /* Journal log entry */ befs_block_run log_blocks; befs_off_t log_start; befs_off_t log_end; @@ -79,7 +82,7 @@ enum befs_err { BEFS_BT_END, BEFS_BT_EMPTY, BEFS_BT_MATCH, - BEFS_BT_PARMATCH, + BEFS_BT_OVERFLOW, BEFS_BT_NOT_FOUND }; @@ -140,18 +143,6 @@ befs_iaddrs_per_block(struct super_block *sb) return BEFS_SB(sb)->block_size / sizeof (befs_disk_inode_addr); } -static inline int -befs_iaddr_is_empty(const befs_inode_addr *iaddr) -{ - return (!iaddr->allocation_group) && (!iaddr->start) && (!iaddr->len); -} - -static inline size_t -befs_brun_size(struct super_block *sb, befs_block_run run) -{ - return BEFS_SB(sb)->block_size * run.len; -} - #include "endian.h" #endif /* _LINUX_BEFS_H */ diff --git a/fs/befs/btree.c b/fs/befs/btree.c index 307645f9e284..7e135ea73fdd 100644 --- a/fs/befs/btree.c +++ b/fs/befs/btree.c @@ -85,7 +85,7 @@ struct befs_btree_node { }; /* local constants */ -static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL; +static const befs_off_t BEFS_BT_INVAL = 0xffffffffffffffffULL; /* local functions */ static int befs_btree_seekleaf(struct super_block *sb, const befs_data_stream *ds, @@ -156,8 +156,6 @@ befs_bt_read_super(struct super_block *sb, const befs_data_stream *ds, sup->max_depth = fs32_to_cpu(sb, od_sup->max_depth); sup->data_type = fs32_to_cpu(sb, od_sup->data_type); sup->root_node_ptr = fs64_to_cpu(sb, od_sup->root_node_ptr); - sup->free_node_ptr = fs64_to_cpu(sb, od_sup->free_node_ptr); - sup->max_size = fs64_to_cpu(sb, od_sup->max_size); brelse(bh); if (sup->magic != BEFS_BTREE_MAGIC) { @@ -183,8 +181,8 @@ befs_bt_read_super(struct super_block *sb, const befs_data_stream *ds, * Calls befs_read_datastream to read in the indicated btree node and * makes sure its header fields are in cpu byteorder, byteswapping if * necessary. - * Note: node->bh must be NULL when this function called first - * time. Don't forget brelse(node->bh) after last call. + * Note: node->bh must be NULL when this function is called the first time. + * Don't forget brelse(node->bh) after last call. * * On success, returns BEFS_OK and *@node contains the btree node that * starts at @node_off, with the node->head fields in cpu byte order. @@ -244,7 +242,7 @@ befs_bt_read_node(struct super_block *sb, const befs_data_stream *ds, * Read the superblock and rootnode of the b+tree. * Drill down through the interior nodes using befs_find_key(). * Once at the correct leaf node, use befs_find_key() again to get the - * actuall value stored with the key. + * actual value stored with the key. */ int befs_btree_find(struct super_block *sb, const befs_data_stream *ds, @@ -283,9 +281,9 @@ befs_btree_find(struct super_block *sb, const befs_data_stream *ds, while (!befs_leafnode(this_node)) { res = befs_find_key(sb, this_node, key, &node_off); - if (res == BEFS_BT_NOT_FOUND) + /* if no key set, try the overflow node */ + if (res == BEFS_BT_OVERFLOW) node_off = this_node->head.overflow; - /* if no match, go to overflow node */ if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) { befs_error(sb, "befs_btree_find() failed to read " "node at %llu", node_off); @@ -293,15 +291,15 @@ befs_btree_find(struct super_block *sb, const befs_data_stream *ds, } } - /* at the correct leaf node now */ - + /* at a leaf node now, check if it is correct */ res = befs_find_key(sb, this_node, key, value); brelse(this_node->bh); kfree(this_node); if (res != BEFS_BT_MATCH) { - befs_debug(sb, "<--- %s Key %s not found", __func__, key); + befs_error(sb, "<--- %s Key %s not found", __func__, key); + befs_debug(sb, "<--- %s ERROR", __func__); *value = 0; return BEFS_BT_NOT_FOUND; } @@ -324,16 +322,12 @@ befs_btree_find(struct super_block *sb, const befs_data_stream *ds, * @findkey: Keystring to search for * @value: If key is found, the value stored with the key is put here * - * finds exact match if one exists, and returns BEFS_BT_MATCH - * If no exact match, finds first key in node that is greater - * (alphabetically) than the search key and returns BEFS_BT_PARMATCH - * (for partial match, I guess). Can you think of something better to - * call it? - * - * If no key was a match or greater than the search key, return - * BEFS_BT_NOT_FOUND. + * Finds exact match if one exists, and returns BEFS_BT_MATCH. + * If there is no match and node's value array is too small for key, return + * BEFS_BT_OVERFLOW. + * If no match and node should countain this key, return BEFS_BT_NOT_FOUND. * - * Use binary search instead of a linear. + * Uses binary search instead of a linear. */ static int befs_find_key(struct super_block *sb, struct befs_btree_node *node, @@ -348,18 +342,16 @@ befs_find_key(struct super_block *sb, struct befs_btree_node *node, befs_debug(sb, "---> %s %s", __func__, findkey); - *value = 0; - findkey_len = strlen(findkey); - /* if node can not contain key, just skeep this node */ + /* if node can not contain key, just skip this node */ last = node->head.all_key_count - 1; thiskey = befs_bt_get_key(sb, node, last, &keylen); eq = befs_compare_strings(thiskey, keylen, findkey, findkey_len); if (eq < 0) { - befs_debug(sb, "<--- %s %s not found", __func__, findkey); - return BEFS_BT_NOT_FOUND; + befs_debug(sb, "<--- node can't contain %s", findkey); + return BEFS_BT_OVERFLOW; } valarray = befs_bt_valarray(node); @@ -387,12 +379,15 @@ befs_find_key(struct super_block *sb, struct befs_btree_node *node, else first = mid + 1; } + + /* return an existing value so caller can arrive to a leaf node */ if (eq < 0) *value = fs64_to_cpu(sb, valarray[mid + 1]); else *value = fs64_to_cpu(sb, valarray[mid]); - befs_debug(sb, "<--- %s found %s at %d", __func__, thiskey, mid); - return BEFS_BT_PARMATCH; + befs_error(sb, "<--- %s %s not found", __func__, findkey); + befs_debug(sb, "<--- %s ERROR", __func__); + return BEFS_BT_NOT_FOUND; } /** @@ -405,7 +400,7 @@ befs_find_key(struct super_block *sb, struct befs_btree_node *node, * @keysize: Length of the returned key * @value: Value stored with the returned key * - * Heres how it works: Key_no is the index of the key/value pair to + * Here's how it works: Key_no is the index of the key/value pair to * return in keybuf/value. * Bufsize is the size of keybuf (BEFS_NAME_LEN+1 is a good size). Keysize is * the number of characters in the key (just a convenience). @@ -422,7 +417,7 @@ befs_btree_read(struct super_block *sb, const befs_data_stream *ds, { struct befs_btree_node *this_node; befs_btree_super bt_super; - befs_off_t node_off = 0; + befs_off_t node_off; int cur_key; fs64 *valarray; char *keystart; @@ -467,7 +462,7 @@ befs_btree_read(struct super_block *sb, const befs_data_stream *ds, while (key_sum + this_node->head.all_key_count <= key_no) { /* no more nodes to look in: key_no is too large */ - if (this_node->head.right == befs_bt_inval) { + if (this_node->head.right == BEFS_BT_INVAL) { *keysize = 0; *value = 0; befs_debug(sb, @@ -541,7 +536,6 @@ befs_btree_read(struct super_block *sb, const befs_data_stream *ds, * @node_off: Pointer to offset of current node within datastream. Modified * by the function. * - * * Helper function for btree traverse. Moves the current position to the * start of the first leaf node. * @@ -608,7 +602,7 @@ static int befs_leafnode(struct befs_btree_node *node) { /* all interior nodes (and only interior nodes) have an overflow node */ - if (node->head.overflow == befs_bt_inval) + if (node->head.overflow == BEFS_BT_INVAL) return 1; else return 0; @@ -715,7 +709,7 @@ befs_bt_get_key(struct super_block *sb, struct befs_btree_node *node, * * Returns 0 if @key1 and @key2 are equal. * Returns >0 if @key1 is greater. - * Returns <0 if @key2 is greater.. + * Returns <0 if @key2 is greater. */ static int befs_compare_strings(const void *key1, int keylen1, diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c index af1bc19b7c85..b4c7ba013c0d 100644 --- a/fs/befs/datastream.c +++ b/fs/befs/datastream.c @@ -22,22 +22,22 @@ const befs_inode_addr BAD_IADDR = { 0, 0, 0 }; static int befs_find_brun_direct(struct super_block *sb, const befs_data_stream *data, - befs_blocknr_t blockno, befs_block_run * run); + befs_blocknr_t blockno, befs_block_run *run); static int befs_find_brun_indirect(struct super_block *sb, const befs_data_stream *data, befs_blocknr_t blockno, - befs_block_run * run); + befs_block_run *run); static int befs_find_brun_dblindirect(struct super_block *sb, const befs_data_stream *data, befs_blocknr_t blockno, - befs_block_run * run); + befs_block_run *run); /** * befs_read_datastream - get buffer_head containing data, starting from pos. * @sb: Filesystem superblock - * @ds: datastrem to find data with + * @ds: datastream to find data with * @pos: start of data * @off: offset of data in buffer_head->b_data * @@ -46,7 +46,7 @@ static int befs_find_brun_dblindirect(struct super_block *sb, */ struct buffer_head * befs_read_datastream(struct super_block *sb, const befs_data_stream *ds, - befs_off_t pos, uint * off) + befs_off_t pos, uint *off) { struct buffer_head *bh; befs_block_run run; @@ -75,7 +75,13 @@ befs_read_datastream(struct super_block *sb, const befs_data_stream *ds, return bh; } -/* +/** + * befs_fblock2brun - give back block run for fblock + * @sb: the superblock + * @data: datastream to read from + * @fblock: the blocknumber with the file position to find + * @run: The found run is passed back through this pointer + * * Takes a file position and gives back a brun who's starting block * is block number fblock of the file. * @@ -88,7 +94,7 @@ befs_read_datastream(struct super_block *sb, const befs_data_stream *ds, */ int befs_fblock2brun(struct super_block *sb, const befs_data_stream *data, - befs_blocknr_t fblock, befs_block_run * run) + befs_blocknr_t fblock, befs_block_run *run) { int err; befs_off_t pos = fblock << BEFS_SB(sb)->block_shift; @@ -115,7 +121,7 @@ befs_fblock2brun(struct super_block *sb, const befs_data_stream *data, /** * befs_read_lsmylink - read long symlink from datastream. * @sb: Filesystem superblock - * @ds: Datastrem to read from + * @ds: Datastream to read from * @buff: Buffer in which to place long symlink data * @len: Length of the long symlink in bytes * @@ -128,6 +134,7 @@ befs_read_lsymlink(struct super_block *sb, const befs_data_stream *ds, befs_off_t bytes_read = 0; /* bytes readed */ u16 plen; struct buffer_head *bh; + befs_debug(sb, "---> %s length: %llu", __func__, len); while (bytes_read < len) { @@ -183,13 +190,13 @@ befs_count_blocks(struct super_block *sb, const befs_data_stream *ds) metablocks += ds->indirect.len; /* - Double indir block, plus all the indirect blocks it mapps - In the double-indirect range, all block runs of data are - BEFS_DBLINDIR_BRUN_LEN blocks long. Therefore, we know - how many data block runs are in the double-indirect region, - and from that we know how many indirect blocks it takes to - map them. We assume that the indirect blocks are also - BEFS_DBLINDIR_BRUN_LEN blocks long. + * Double indir block, plus all the indirect blocks it maps. + * In the double-indirect range, all block runs of data are + * BEFS_DBLINDIR_BRUN_LEN blocks long. Therefore, we know + * how many data block runs are in the double-indirect region, + * and from that we know how many indirect blocks it takes to + * map them. We assume that the indirect blocks are also + * BEFS_DBLINDIR_BRUN_LEN blocks long. */ if (ds->size > ds->max_indirect_range && ds->max_indirect_range != 0) { uint dbl_bytes; @@ -212,58 +219,50 @@ befs_count_blocks(struct super_block *sb, const befs_data_stream *ds) return blocks; } -/* - Finds the block run that starts at file block number blockno - in the file represented by the datastream data, if that - blockno is in the direct region of the datastream. - - sb: the superblock - data: the datastream - blockno: the blocknumber to find - run: The found run is passed back through this pointer - - Return value is BEFS_OK if the blockrun is found, BEFS_ERR - otherwise. - - Algorithm: - Linear search. Checks each element of array[] to see if it - contains the blockno-th filesystem block. This is necessary - because the block runs map variable amounts of data. Simply - keeps a count of the number of blocks searched so far (sum), - incrementing this by the length of each block run as we come - across it. Adds sum to *count before returning (this is so - you can search multiple arrays that are logicaly one array, - as in the indirect region code). - - When/if blockno is found, if blockno is inside of a block - run as stored on disk, we offset the start and length members - of the block run, so that blockno is the start and len is - still valid (the run ends in the same place). - - 2001-11-15 Will Dyson -*/ +/** + * befs_find_brun_direct - find a direct block run in the datastream + * @sb: the superblock + * @data: the datastream + * @blockno: the blocknumber to find + * @run: The found run is passed back through this pointer + * + * Finds the block run that starts at file block number blockno + * in the file represented by the datastream data, if that + * blockno is in the direct region of the datastream. + * + * Return value is BEFS_OK if the blockrun is found, BEFS_ERR + * otherwise. + * + * Algorithm: + * Linear search. Checks each element of array[] to see if it + * contains the blockno-th filesystem block. This is necessary + * because the block runs map variable amounts of data. Simply + * keeps a count of the number of blocks searched so far (sum), + * incrementing this by the length of each block run as we come + * across it. Adds sum to *count before returning (this is so + * you can search multiple arrays that are logicaly one array, + * as in the indirect region code). + * + * When/if blockno is found, if blockno is inside of a block + * run as stored on disk, we offset the start and length members + * of the block run, so that blockno is the start and len is + * still valid (the run ends in the same place). + */ static int befs_find_brun_direct(struct super_block *sb, const befs_data_stream *data, - befs_blocknr_t blockno, befs_block_run * run) + befs_blocknr_t blockno, befs_block_run *run) { int i; const befs_block_run *array = data->direct; befs_blocknr_t sum; - befs_blocknr_t max_block = - data->max_direct_range >> BEFS_SB(sb)->block_shift; befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno); - if (blockno > max_block) { - befs_error(sb, "%s passed block outside of direct region", - __func__); - return BEFS_ERR; - } - for (i = 0, sum = 0; i < BEFS_NUM_DIRECT_BLOCKS; sum += array[i].len, i++) { if (blockno >= sum && blockno < sum + (array[i].len)) { int offset = blockno - sum; + run->allocation_group = array[i].allocation_group; run->start = array[i].start + offset; run->len = array[i].len - offset; @@ -275,38 +274,39 @@ befs_find_brun_direct(struct super_block *sb, const befs_data_stream *data, } } + befs_error(sb, "%s failed to find file block %lu", __func__, + (unsigned long)blockno); befs_debug(sb, "---> %s ERROR", __func__); return BEFS_ERR; } -/* - Finds the block run that starts at file block number blockno - in the file represented by the datastream data, if that - blockno is in the indirect region of the datastream. - - sb: the superblock - data: the datastream - blockno: the blocknumber to find - run: The found run is passed back through this pointer - - Return value is BEFS_OK if the blockrun is found, BEFS_ERR - otherwise. - - Algorithm: - For each block in the indirect run of the datastream, read - it in and search through it for search_blk. - - XXX: - Really should check to make sure blockno is inside indirect - region. - - 2001-11-15 Will Dyson -*/ +/** + * befs_find_brun_indirect - find a block run in the datastream + * @sb: the superblock + * @data: the datastream + * @blockno: the blocknumber to find + * @run: The found run is passed back through this pointer + * + * Finds the block run that starts at file block number blockno + * in the file represented by the datastream data, if that + * blockno is in the indirect region of the datastream. + * + * Return value is BEFS_OK if the blockrun is found, BEFS_ERR + * otherwise. + * + * Algorithm: + * For each block in the indirect run of the datastream, read + * it in and search through it for search_blk. + * + * XXX: + * Really should check to make sure blockno is inside indirect + * region. + */ static int befs_find_brun_indirect(struct super_block *sb, const befs_data_stream *data, befs_blocknr_t blockno, - befs_block_run * run) + befs_block_run *run) { int i, j; befs_blocknr_t sum = 0; @@ -326,11 +326,12 @@ befs_find_brun_indirect(struct super_block *sb, /* Examine blocks of the indirect run one at a time */ for (i = 0; i < indirect.len; i++) { - indirblock = befs_bread(sb, indirblockno + i); + indirblock = sb_bread(sb, indirblockno + i); if (indirblock == NULL) { - befs_debug(sb, "---> %s failed to read " + befs_error(sb, "---> %s failed to read " "disk block %lu from the indirect brun", __func__, (unsigned long)indirblockno + i); + befs_debug(sb, "<--- %s ERROR", __func__); return BEFS_ERR; } @@ -370,52 +371,51 @@ befs_find_brun_indirect(struct super_block *sb, return BEFS_ERR; } -/* - Finds the block run that starts at file block number blockno - in the file represented by the datastream data, if that - blockno is in the double-indirect region of the datastream. - - sb: the superblock - data: the datastream - blockno: the blocknumber to find - run: The found run is passed back through this pointer - - Return value is BEFS_OK if the blockrun is found, BEFS_ERR - otherwise. - - Algorithm: - The block runs in the double-indirect region are different. - They are always allocated 4 fs blocks at a time, so each - block run maps a constant amount of file data. This means - that we can directly calculate how many block runs into the - double-indirect region we need to go to get to the one that - maps a particular filesystem block. - - We do this in two stages. First we calculate which of the - inode addresses in the double-indirect block will point us - to the indirect block that contains the mapping for the data, - then we calculate which of the inode addresses in that - indirect block maps the data block we are after. - - Oh, and once we've done that, we actually read in the blocks - that contain the inode addresses we calculated above. Even - though the double-indirect run may be several blocks long, - we can calculate which of those blocks will contain the index - we are after and only read that one. We then follow it to - the indirect block and perform a similar process to find - the actual block run that maps the data block we are interested - in. - - Then we offset the run as in befs_find_brun_array() and we are - done. - - 2001-11-15 Will Dyson -*/ +/** + * befs_find_brun_dblindirect - find a block run in the datastream + * @sb: the superblock + * @data: the datastream + * @blockno: the blocknumber to find + * @run: The found run is passed back through this pointer + * + * Finds the block run that starts at file block number blockno + * in the file represented by the datastream data, if that + * blockno is in the double-indirect region of the datastream. + * + * Return value is BEFS_OK if the blockrun is found, BEFS_ERR + * otherwise. + * + * Algorithm: + * The block runs in the double-indirect region are different. + * They are always allocated 4 fs blocks at a time, so each + * block run maps a constant amount of file data. This means + * that we can directly calculate how many block runs into the + * double-indirect region we need to go to get to the one that + * maps a particular filesystem block. + * + * We do this in two stages. First we calculate which of the + * inode addresses in the double-indirect block will point us + * to the indirect block that contains the mapping for the data, + * then we calculate which of the inode addresses in that + * indirect block maps the data block we are after. + * + * Oh, and once we've done that, we actually read in the blocks + * that contain the inode addresses we calculated above. Even + * though the double-indirect run may be several blocks long, + * we can calculate which of those blocks will contain the index + * we are after and only read that one. We then follow it to + * the indirect block and perform a similar process to find + * the actual block run that maps the data block we are interested + * in. + * + * Then we offset the run as in befs_find_brun_array() and we are + * done. + */ static int befs_find_brun_dblindirect(struct super_block *sb, const befs_data_stream *data, befs_blocknr_t blockno, - befs_block_run * run) + befs_block_run *run) { int dblindir_indx; int indir_indx; @@ -430,10 +430,9 @@ befs_find_brun_dblindirect(struct super_block *sb, struct buffer_head *indir_block; befs_block_run indir_run; befs_disk_inode_addr *iaddr_array; - struct befs_sb_info *befs_sb = BEFS_SB(sb); befs_blocknr_t indir_start_blk = - data->max_indirect_range >> befs_sb->block_shift; + data->max_indirect_range >> BEFS_SB(sb)->block_shift; off_t dbl_indir_off = blockno - indir_start_blk; @@ -471,7 +470,7 @@ befs_find_brun_dblindirect(struct super_block *sb, } dbl_indir_block = - befs_bread(sb, iaddr2blockno(sb, &data->double_indirect) + + sb_bread(sb, iaddr2blockno(sb, &data->double_indirect) + dbl_which_block); if (dbl_indir_block == NULL) { befs_error(sb, "%s couldn't read the " @@ -479,7 +478,6 @@ befs_find_brun_dblindirect(struct super_block *sb, (unsigned long) iaddr2blockno(sb, &data->double_indirect) + dbl_which_block); - brelse(dbl_indir_block); return BEFS_ERR; } @@ -499,12 +497,11 @@ befs_find_brun_dblindirect(struct super_block *sb, } indir_block = - befs_bread(sb, iaddr2blockno(sb, &indir_run) + which_block); + sb_bread(sb, iaddr2blockno(sb, &indir_run) + which_block); if (indir_block == NULL) { befs_error(sb, "%s couldn't read the indirect block " "at blockno %lu", __func__, (unsigned long) iaddr2blockno(sb, &indir_run) + which_block); - brelse(indir_block); return BEFS_ERR; } diff --git a/fs/befs/debug.c b/fs/befs/debug.c index 4de7cffcd662..85c13392e9e8 100644 --- a/fs/befs/debug.c +++ b/fs/befs/debug.c @@ -169,6 +169,7 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup) befs_debug(sb, " num_blocks %llu", fs64_to_cpu(sb, sup->num_blocks)); befs_debug(sb, " used_blocks %llu", fs64_to_cpu(sb, sup->used_blocks)); + befs_debug(sb, " inode_size %u", fs32_to_cpu(sb, sup->inode_size)); befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2)); befs_debug(sb, " blocks_per_ag %u", diff --git a/fs/befs/io.c b/fs/befs/io.c index 523c8af2d770..b4a558126ee1 100644 --- a/fs/befs/io.c +++ b/fs/befs/io.c @@ -27,7 +27,7 @@ struct buffer_head * befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr) { struct buffer_head *bh; - befs_blocknr_t block = 0; + befs_blocknr_t block; struct befs_sb_info *befs_sb = BEFS_SB(sb); befs_debug(sb, "---> Enter %s " @@ -59,27 +59,3 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr) befs_debug(sb, "<--- %s ERROR", __func__); return NULL; } - -struct buffer_head * -befs_bread(struct super_block *sb, befs_blocknr_t block) -{ - struct buffer_head *bh; - - befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block); - - bh = sb_bread(sb, block); - - if (bh == NULL) { - befs_error(sb, "Failed to read block %lu", - (unsigned long)block); - goto error; - } - - befs_debug(sb, "<--- %s", __func__); - - return bh; - - error: - befs_debug(sb, "<--- %s ERROR", __func__); - return NULL; -} diff --git a/fs/befs/io.h b/fs/befs/io.h index 9b78266b6aa5..78d7bc6e60de 100644 --- a/fs/befs/io.h +++ b/fs/befs/io.h @@ -5,5 +5,3 @@ struct buffer_head *befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr); -struct buffer_head *befs_bread(struct super_block *sb, befs_blocknr_t block); - diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index bfe9f9994935..647a276eba56 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -120,7 +120,7 @@ befs_get_block(struct inode *inode, sector_t block, struct super_block *sb = inode->i_sb; befs_data_stream *ds = &BEFS_I(inode)->i_data.ds; befs_block_run run = BAD_IADDR; - int res = 0; + int res; ulong disk_off; befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld", @@ -179,15 +179,16 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) kfree(utfname); } else { - ret = befs_btree_find(sb, ds, dentry->d_name.name, &offset); + ret = befs_btree_find(sb, ds, name, &offset); } if (ret == BEFS_BT_NOT_FOUND) { befs_debug(sb, "<--- %s %pd not found", __func__, dentry); + d_add(dentry, NULL); return ERR_PTR(-ENOENT); } else if (ret != BEFS_OK || offset == 0) { - befs_warning(sb, "<--- %s Error", __func__); + befs_error(sb, "<--- %s Error", __func__); return ERR_PTR(-ENODATA); } @@ -211,56 +212,55 @@ befs_readdir(struct file *file, struct dir_context *ctx) befs_off_t value; int result; size_t keysize; - unsigned char d_type; char keybuf[BEFS_NAME_LEN + 1]; befs_debug(sb, "---> %s name %pD, inode %ld, ctx->pos %lld", __func__, file, inode->i_ino, ctx->pos); -more: - result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1, - keybuf, &keysize, &value); + while (1) { + result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1, + keybuf, &keysize, &value); - if (result == BEFS_ERR) { - befs_debug(sb, "<--- %s ERROR", __func__); - befs_error(sb, "IO error reading %pD (inode %lu)", - file, inode->i_ino); - return -EIO; - - } else if (result == BEFS_BT_END) { - befs_debug(sb, "<--- %s END", __func__); - return 0; - - } else if (result == BEFS_BT_EMPTY) { - befs_debug(sb, "<--- %s Empty directory", __func__); - return 0; - } + if (result == BEFS_ERR) { + befs_debug(sb, "<--- %s ERROR", __func__); + befs_error(sb, "IO error reading %pD (inode %lu)", + file, inode->i_ino); + return -EIO; - d_type = DT_UNKNOWN; + } else if (result == BEFS_BT_END) { + befs_debug(sb, "<--- %s END", __func__); + return 0; - /* Convert to NLS */ - if (BEFS_SB(sb)->nls) { - char *nlsname; - int nlsnamelen; - result = - befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen); - if (result < 0) { - befs_debug(sb, "<--- %s ERROR", __func__); - return result; + } else if (result == BEFS_BT_EMPTY) { + befs_debug(sb, "<--- %s Empty directory", __func__); + return 0; } - if (!dir_emit(ctx, nlsname, nlsnamelen, - (ino_t) value, d_type)) { + + /* Convert to NLS */ + if (BEFS_SB(sb)->nls) { + char *nlsname; + int nlsnamelen; + + result = + befs_utf2nls(sb, keybuf, keysize, &nlsname, + &nlsnamelen); + if (result < 0) { + befs_debug(sb, "<--- %s ERROR", __func__); + return result; + } + if (!dir_emit(ctx, nlsname, nlsnamelen, + (ino_t) value, DT_UNKNOWN)) { + kfree(nlsname); + return 0; + } kfree(nlsname); - return 0; + } else { + if (!dir_emit(ctx, keybuf, keysize, + (ino_t) value, DT_UNKNOWN)) + return 0; } - kfree(nlsname); - } else { - if (!dir_emit(ctx, keybuf, keysize, - (ino_t) value, d_type)) - return 0; + ctx->pos++; } - ctx->pos++; - goto more; } static struct inode * @@ -299,7 +299,6 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) struct befs_sb_info *befs_sb = BEFS_SB(sb); struct befs_inode_info *befs_ino; struct inode *inode; - long ret = -EIO; befs_debug(sb, "---> %s inode = %lu", __func__, ino); @@ -318,7 +317,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) befs_ino->i_inode_num.allocation_group, befs_ino->i_inode_num.start, befs_ino->i_inode_num.len); - bh = befs_bread(sb, inode->i_ino); + bh = sb_bread(sb, inode->i_ino); if (!bh) { befs_error(sb, "unable to read inode block - " "inode = %lu", inode->i_ino); @@ -421,7 +420,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) unacquire_none: iget_failed(inode); befs_debug(sb, "<--- %s - Bad inode", __func__); - return ERR_PTR(ret); + return ERR_PTR(-EIO); } /* Initialize the inode cache. Called at fs setup. @@ -436,10 +435,9 @@ befs_init_inodecache(void) 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); - if (befs_inode_cachep == NULL) { - pr_err("%s: Couldn't initialize inode slabcache\n", __func__); + if (befs_inode_cachep == NULL) return -ENOMEM; - } + return 0; } @@ -524,8 +522,6 @@ befs_utf2nls(struct super_block *sb, const char *in, *out = result = kmalloc(maxlen, GFP_NOFS); if (!*out) { - befs_error(sb, "%s cannot allocate memory", __func__); - *out_len = 0; return -ENOMEM; } @@ -604,7 +600,6 @@ befs_nls2utf(struct super_block *sb, const char *in, *out = result = kmalloc(maxlen, GFP_NOFS); if (!*out) { - befs_error(sb, "%s cannot allocate memory", __func__); *out_len = 0; return -ENOMEM; } @@ -637,10 +632,6 @@ befs_nls2utf(struct super_block *sb, const char *in, return -EILSEQ; } -/** - * Use the - * - */ enum { Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err, }; @@ -760,19 +751,19 @@ befs_fill_super(struct super_block *sb, void *data, int silent) long ret = -EINVAL; const unsigned long sb_block = 0; const off_t x86_sb_off = 512; + int blocksize; save_mount_options(sb, data); sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL); - if (sb->s_fs_info == NULL) { - pr_err("(%s): Unable to allocate memory for private " - "portion of superblock. Bailing.\n", sb->s_id); + if (sb->s_fs_info == NULL) goto unacquire_none; - } + befs_sb = BEFS_SB(sb); if (!parse_options((char *) data, &befs_sb->mount_opts)) { - befs_error(sb, "cannot parse mount options"); + if (!silent) + befs_error(sb, "cannot parse mount options"); goto unacquire_priv_sbp; } @@ -793,10 +784,16 @@ befs_fill_super(struct super_block *sb, void *data, int silent) * least 1k to get the second 512 bytes of the volume. * -WD 10-26-01 */ - sb_min_blocksize(sb, 1024); + blocksize = sb_min_blocksize(sb, 1024); + if (!blocksize) { + if (!silent) + befs_error(sb, "unable to set blocksize"); + goto unacquire_priv_sbp; + } if (!(bh = sb_bread(sb, sb_block))) { - befs_error(sb, "unable to read superblock"); + if (!silent) + befs_error(sb, "unable to read superblock"); goto unacquire_priv_sbp; } @@ -820,9 +817,9 @@ befs_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); if( befs_sb->num_blocks > ~((sector_t)0) ) { - befs_error(sb, "blocks count: %llu " - "is larger than the host can use", - befs_sb->num_blocks); + if (!silent) + befs_error(sb, "blocks count: %llu is larger than the host can use", + befs_sb->num_blocks); goto unacquire_priv_sbp; } @@ -841,7 +838,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent) } sb->s_root = d_make_root(root); if (!sb->s_root) { - befs_error(sb, "get root inode failed"); + if (!silent) + befs_error(sb, "get root inode failed"); goto unacquire_priv_sbp; } @@ -870,9 +868,9 @@ befs_fill_super(struct super_block *sb, void *data, int silent) unacquire_priv_sbp: kfree(befs_sb->mount_opts.iocharset); kfree(sb->s_fs_info); + sb->s_fs_info = NULL; unacquire_none: - sb->s_fs_info = NULL; return ret; } diff --git a/fs/befs/super.c b/fs/befs/super.c index aeafc4d84278..7c50025c99d8 100644 --- a/fs/befs/super.c +++ b/fs/befs/super.c @@ -13,24 +13,20 @@ #include "befs.h" #include "super.h" -/** - * load_befs_sb -- Read from disk and properly byteswap all the fields +/* + * befs_load_sb -- Read from disk and properly byteswap all the fields * of the befs superblock - * - * - * - * */ int -befs_load_sb(struct super_block *sb, befs_super_block * disk_sb) +befs_load_sb(struct super_block *sb, befs_super_block *disk_sb) { struct befs_sb_info *befs_sb = BEFS_SB(sb); /* Check the byte order of the filesystem */ if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_LE) - befs_sb->byte_order = BEFS_BYTESEX_LE; + befs_sb->byte_order = BEFS_BYTESEX_LE; else if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_BE) - befs_sb->byte_order = BEFS_BYTESEX_BE; + befs_sb->byte_order = BEFS_BYTESEX_BE; befs_sb->magic1 = fs32_to_cpu(sb, disk_sb->magic1); befs_sb->magic2 = fs32_to_cpu(sb, disk_sb->magic2); @@ -45,6 +41,8 @@ befs_load_sb(struct super_block *sb, befs_super_block * disk_sb) befs_sb->ag_shift = fs32_to_cpu(sb, disk_sb->ag_shift); befs_sb->num_ags = fs32_to_cpu(sb, disk_sb->num_ags); + befs_sb->flags = fs32_to_cpu(sb, disk_sb->flags); + befs_sb->log_blocks = fsrun_to_cpu(sb, disk_sb->log_blocks); befs_sb->log_start = fs64_to_cpu(sb, disk_sb->log_start); befs_sb->log_end = fs64_to_cpu(sb, disk_sb->log_end); @@ -84,15 +82,15 @@ befs_check_sb(struct super_block *sb) } if (befs_sb->block_size > PAGE_SIZE) { - befs_error(sb, "blocksize(%u) cannot be larger" + befs_error(sb, "blocksize(%u) cannot be larger " "than system pagesize(%lu)", befs_sb->block_size, PAGE_SIZE); return BEFS_ERR; } /* - * block_shift and block_size encode the same information - * in different ways as a consistency check. + * block_shift and block_size encode the same information + * in different ways as a consistency check. */ if ((1 << befs_sb->block_shift) != befs_sb->block_size) { @@ -101,10 +99,18 @@ befs_check_sb(struct super_block *sb) return BEFS_ERR; } - if (befs_sb->log_start != befs_sb->log_end) { + + /* ag_shift also encodes the same information as blocks_per_ag in a + * different way, non-fatal consistency check + */ + if ((1 << befs_sb->ag_shift) != befs_sb->blocks_per_ag) + befs_error(sb, "ag_shift disagrees with blocks_per_ag."); + + if (befs_sb->log_start != befs_sb->log_end || + befs_sb->flags == BEFS_DIRTY) { befs_error(sb, "Filesystem not clean! There are blocks in the " - "journal. You must boot into BeOS and mount this volume " - "to make it clean."); + "journal. You must boot into BeOS and mount this " + "volume to make it clean."); return BEFS_ERR; } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ccc70d96958d..d4d8b7e36b2f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -698,7 +698,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(root, comp_bio, mirror_num, 0); if (ret) { - bio->bi_error = ret; + comp_bio->bi_error = ret; bio_endio(comp_bio); } @@ -728,7 +728,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_map_bio(root, comp_bio, mirror_num, 0); if (ret) { - bio->bi_error = ret; + comp_bio->bi_error = ret; bio_endio(comp_bio); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6c21bad26a27..0b8ce2b9f7d0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -252,7 +252,8 @@ struct btrfs_super_block { #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP \ - (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE) + (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ + BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID) #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e720d3e6ec20..3a57f99d96aa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2586,6 +2586,7 @@ int open_ctree(struct super_block *sb, int num_backups_tried = 0; int backup_index = 0; int max_active; + int clear_free_space_tree = 0; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); @@ -3148,6 +3149,26 @@ retry_root_backup: if (sb->s_flags & MS_RDONLY) return 0; + if (btrfs_test_opt(fs_info, CLEAR_CACHE) && + btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + clear_free_space_tree = 1; + } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) { + btrfs_warn(fs_info, "free space tree is invalid"); + clear_free_space_tree = 1; + } + + if (clear_free_space_tree) { + btrfs_info(fs_info, "clearing free space tree"); + ret = btrfs_clear_free_space_tree(fs_info); + if (ret) { + btrfs_warn(fs_info, + "failed to clear free space tree: %d", ret); + close_ctree(tree_root); + return ret; + } + } + if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) && !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { btrfs_info(fs_info, "creating free space tree"); @@ -3185,18 +3206,6 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); - if (btrfs_test_opt(tree_root->fs_info, CLEAR_CACHE) && - btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - btrfs_info(fs_info, "clearing free space tree"); - ret = btrfs_clear_free_space_tree(fs_info); - if (ret) { - btrfs_warn(fs_info, - "failed to clear free space tree: %d", ret); - close_ctree(tree_root); - return ret; - } - } - if (!fs_info->uuid_root) { btrfs_info(fs_info, "creating UUID tree"); ret = btrfs_create_uuid_tree(fs_info); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ee40384c394d..66a755150056 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5558,17 +5558,45 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, } } -/* - * The extent buffer bitmap operations are done with byte granularity because - * bitmap items are not guaranteed to be aligned to a word and therefore a - * single word in a bitmap may straddle two pages in the extent buffer. - */ -#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE) -#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1) -#define BITMAP_FIRST_BYTE_MASK(start) \ - ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK) -#define BITMAP_LAST_BYTE_MASK(nbits) \ - (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) +void le_bitmap_set(u8 *map, unsigned int start, int len) +{ + u8 *p = map + BIT_BYTE(start); + const unsigned int size = start + len; + int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE); + u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start); + + while (len - bits_to_set >= 0) { + *p |= mask_to_set; + len -= bits_to_set; + bits_to_set = BITS_PER_BYTE; + mask_to_set = ~(u8)0; + p++; + } + if (len) { + mask_to_set &= BITMAP_LAST_BYTE_MASK(size); + *p |= mask_to_set; + } +} + +void le_bitmap_clear(u8 *map, unsigned int start, int len) +{ + u8 *p = map + BIT_BYTE(start); + const unsigned int size = start + len; + int bits_to_clear = BITS_PER_BYTE - (start % BITS_PER_BYTE); + u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(start); + + while (len - bits_to_clear >= 0) { + *p &= ~mask_to_clear; + len -= bits_to_clear; + bits_to_clear = BITS_PER_BYTE; + mask_to_clear = ~(u8)0; + p++; + } + if (len) { + mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); + *p &= ~mask_to_clear; + } +} /* * eb_bitmap_offset() - calculate the page and offset of the byte containing the @@ -5612,7 +5640,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb, int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, unsigned long nr) { - char *kaddr; + u8 *kaddr; struct page *page; unsigned long i; size_t offset; @@ -5634,13 +5662,13 @@ int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) { - char *kaddr; + u8 *kaddr; struct page *page; unsigned long i; size_t offset; const unsigned int size = pos + len; int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); - unsigned int mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); + u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); eb_bitmap_offset(eb, start, pos, &i, &offset); page = eb->pages[i]; @@ -5651,7 +5679,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, kaddr[offset] |= mask_to_set; len -= bits_to_set; bits_to_set = BITS_PER_BYTE; - mask_to_set = ~0U; + mask_to_set = ~(u8)0; if (++offset >= PAGE_SIZE && len > 0) { offset = 0; page = eb->pages[++i]; @@ -5676,13 +5704,13 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) { - char *kaddr; + u8 *kaddr; struct page *page; unsigned long i; size_t offset; const unsigned int size = pos + len; int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); - unsigned int mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); + u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); eb_bitmap_offset(eb, start, pos, &i, &offset); page = eb->pages[i]; @@ -5693,7 +5721,7 @@ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, kaddr[offset] &= ~mask_to_clear; len -= bits_to_clear; bits_to_clear = BITS_PER_BYTE; - mask_to_clear = ~0U; + mask_to_clear = ~(u8)0; if (++offset >= PAGE_SIZE && len > 0) { offset = 0; page = eb->pages[++i]; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4a094f1dc7ef..ab31d145227e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -59,6 +59,28 @@ */ #define EXTENT_PAGE_PRIVATE 1 +/* + * The extent buffer bitmap operations are done with byte granularity instead of + * word granularity for two reasons: + * 1. The bitmaps must be little-endian on disk. + * 2. Bitmap items are not guaranteed to be aligned to a word and therefore a + * single word in a bitmap may straddle two pages in the extent buffer. + */ +#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE) +#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1) +#define BITMAP_FIRST_BYTE_MASK(start) \ + ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK) +#define BITMAP_LAST_BYTE_MASK(nbits) \ + (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) + +static inline int le_test_bit(int nr, const u8 *addr) +{ + return 1U & (addr[BIT_BYTE(nr)] >> (nr & (BITS_PER_BYTE-1))); +} + +extern void le_bitmap_set(u8 *map, unsigned int start, int len); +extern void le_bitmap_clear(u8 *map, unsigned int start, int len); + struct extent_state; struct btrfs_root; struct btrfs_io_bio; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index e4a42a8e4f84..57401b474ec6 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -151,7 +151,7 @@ static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize) return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE); } -static unsigned long *alloc_bitmap(u32 bitmap_size) +static u8 *alloc_bitmap(u32 bitmap_size) { void *mem; @@ -180,8 +180,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, struct btrfs_free_space_info *info; struct btrfs_key key, found_key; struct extent_buffer *leaf; - unsigned long *bitmap; - char *bitmap_cursor; + u8 *bitmap, *bitmap_cursor; u64 start, end; u64 bitmap_range, i; u32 bitmap_size, flags, expected_extent_count; @@ -231,7 +230,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, block_group->sectorsize); last = div_u64(found_key.objectid + found_key.offset - start, block_group->sectorsize); - bitmap_set(bitmap, first, last - first); + le_bitmap_set(bitmap, first, last - first); extent_count++; nr++; @@ -270,7 +269,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, goto out; } - bitmap_cursor = (char *)bitmap; + bitmap_cursor = bitmap; bitmap_range = block_group->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; i = start; while (i < end) { @@ -319,7 +318,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, struct btrfs_free_space_info *info; struct btrfs_key key, found_key; struct extent_buffer *leaf; - unsigned long *bitmap; + u8 *bitmap; u64 start, end; /* Initialize to silence GCC. */ u64 extent_start = 0; @@ -363,7 +362,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, break; } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { unsigned long ptr; - char *bitmap_cursor; + u8 *bitmap_cursor; u32 bitmap_pos, data_size; ASSERT(found_key.objectid >= start); @@ -373,7 +372,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, bitmap_pos = div_u64(found_key.objectid - start, block_group->sectorsize * BITS_PER_BYTE); - bitmap_cursor = ((char *)bitmap) + bitmap_pos; + bitmap_cursor = bitmap + bitmap_pos; data_size = free_space_bitmap_size(found_key.offset, block_group->sectorsize); @@ -410,7 +409,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, offset = start; bitnr = 0; while (offset < end) { - bit = !!test_bit(bitnr, bitmap); + bit = !!le_test_bit(bitnr, bitmap); if (prev_bit == 0 && bit == 1) { extent_start = offset; } else if (prev_bit == 1 && bit == 0) { @@ -1185,6 +1184,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) } btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE); + btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); ret = btrfs_commit_transaction(trans, tree_root); @@ -1253,6 +1253,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) return PTR_ERR(trans); btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE); + btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); fs_info->free_space_root = NULL; ret = clear_free_space_tree(trans, free_space_root); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index d19ab0317283..caad80bb9bd0 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -273,20 +273,37 @@ out: return ret; } -/** - * test_bit_in_byte - Determine whether a bit is set in a byte - * @nr: bit number to test - * @addr: Address to start counting from - */ -static inline int test_bit_in_byte(int nr, const u8 *addr) +static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb, + unsigned long len) { - return 1UL & (addr[nr / BITS_PER_BYTE] >> (nr & (BITS_PER_BYTE - 1))); + unsigned long i; + + for (i = 0; i < len * BITS_PER_BYTE; i++) { + int bit, bit1; + + bit = !!test_bit(i, bitmap); + bit1 = !!extent_buffer_test_bit(eb, 0, i); + if (bit1 != bit) { + test_msg("Bits do not match\n"); + return -EINVAL; + } + + bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, + i % BITS_PER_BYTE); + if (bit1 != bit) { + test_msg("Offset bits do not match\n"); + return -EINVAL; + } + } + return 0; } static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, unsigned long len) { - unsigned long i, x; + unsigned long i, j; + u32 x; + int ret; memset(bitmap, 0, len); memset_extent_buffer(eb, 0, 0, len); @@ -297,16 +314,18 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, bitmap_set(bitmap, 0, len * BITS_PER_BYTE); extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + ret = check_eb_bitmap(bitmap, eb, len); + if (ret) { test_msg("Setting all bits failed\n"); - return -EINVAL; + return ret; } bitmap_clear(bitmap, 0, len * BITS_PER_BYTE); extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + ret = check_eb_bitmap(bitmap, eb, len); + if (ret) { test_msg("Clearing all bits failed\n"); - return -EINVAL; + return ret; } /* Straddling pages test */ @@ -316,9 +335,10 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, sizeof(long) * BITS_PER_BYTE); extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0, sizeof(long) * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + ret = check_eb_bitmap(bitmap, eb, len); + if (ret) { test_msg("Setting straddling pages failed\n"); - return -EINVAL; + return ret; } bitmap_set(bitmap, 0, len * BITS_PER_BYTE); @@ -328,9 +348,10 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0, sizeof(long) * BITS_PER_BYTE); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + ret = check_eb_bitmap(bitmap, eb, len); + if (ret) { test_msg("Clearing straddling pages failed\n"); - return -EINVAL; + return ret; } } @@ -339,28 +360,22 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, * something repetitive that could miss some hypothetical off-by-n bug. */ x = 0; - for (i = 0; i < len / sizeof(long); i++) { - x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffUL; - bitmap[i] = x; - } - write_extent_buffer(eb, bitmap, 0, len); - - for (i = 0; i < len * BITS_PER_BYTE; i++) { - int bit, bit1; - - bit = !!test_bit_in_byte(i, (u8 *)bitmap); - bit1 = !!extent_buffer_test_bit(eb, 0, i); - if (bit1 != bit) { - test_msg("Testing bit pattern failed\n"); - return -EINVAL; + bitmap_clear(bitmap, 0, len * BITS_PER_BYTE); + extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE); + for (i = 0; i < len * BITS_PER_BYTE / 32; i++) { + x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffU; + for (j = 0; j < 32; j++) { + if (x & (1U << j)) { + bitmap_set(bitmap, i * 32 + j, 1); + extent_buffer_bitmap_set(eb, 0, i * 32 + j, 1); + } } + } - bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, - i % BITS_PER_BYTE); - if (bit1 != bit) { - test_msg("Testing bit pattern with offset failed\n"); - return -EINVAL; - } + ret = check_eb_bitmap(bitmap, eb, len); + if (ret) { + test_msg("Random bit pattern failed\n"); + return ret; } return 0; diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 7508d3b42780..6e144048a72e 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -24,20 +24,15 @@ #include "../transaction.h" struct free_space_extent { - u64 start, length; + u64 start; + u64 length; }; -/* - * The test cases align their operations to this in order to hit some of the - * edge cases in the bitmap code. - */ -#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * PAGE_SIZE) - static int __check_free_space_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, struct btrfs_path *path, - struct free_space_extent *extents, + const struct free_space_extent * const extents, unsigned int num_extents) { struct btrfs_free_space_info *info; @@ -126,7 +121,7 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, struct btrfs_path *path, - struct free_space_extent *extents, + const struct free_space_extent * const extents, unsigned int num_extents) { struct btrfs_free_space_info *info; @@ -168,9 +163,10 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans, static int test_empty_block_group(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { + const struct free_space_extent extents[] = { {cache->key.objectid, cache->key.offset}, }; @@ -181,9 +177,10 @@ static int test_empty_block_group(struct btrfs_trans_handle *trans, static int test_remove_all(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = {}; + const struct free_space_extent extents[] = {}; int ret; ret = __remove_from_free_space_tree(trans, fs_info, cache, path, @@ -201,16 +198,17 @@ static int test_remove_all(struct btrfs_trans_handle *trans, static int test_remove_beginning(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid + BITMAP_RANGE, - cache->key.offset - BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid + alignment, + cache->key.offset - alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid, BITMAP_RANGE); + cache->key.objectid, alignment); if (ret) { test_msg("Could not remove free space\n"); return ret; @@ -224,17 +222,18 @@ static int test_remove_beginning(struct btrfs_trans_handle *trans, static int test_remove_end(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid, cache->key.offset - BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid, cache->key.offset - alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, fs_info, cache, path, cache->key.objectid + - cache->key.offset - BITMAP_RANGE, - BITMAP_RANGE); + cache->key.offset - alignment, + alignment); if (ret) { test_msg("Could not remove free space\n"); return ret; @@ -247,18 +246,19 @@ static int test_remove_end(struct btrfs_trans_handle *trans, static int test_remove_middle(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid, BITMAP_RANGE}, - {cache->key.objectid + 2 * BITMAP_RANGE, - cache->key.offset - 2 * BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid, alignment}, + {cache->key.objectid + 2 * alignment, + cache->key.offset - 2 * alignment}, }; int ret; ret = __remove_from_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + alignment, + alignment); if (ret) { test_msg("Could not remove free space\n"); return ret; @@ -271,10 +271,11 @@ static int test_remove_middle(struct btrfs_trans_handle *trans, static int test_merge_left(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid, 2 * BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid, 2 * alignment}, }; int ret; @@ -287,15 +288,15 @@ static int test_merge_left(struct btrfs_trans_handle *trans, } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid, BITMAP_RANGE); + cache->key.objectid, alignment); if (ret) { test_msg("Could not add free space\n"); return ret; } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; @@ -308,10 +309,11 @@ static int test_merge_left(struct btrfs_trans_handle *trans, static int test_merge_right(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid + BITMAP_RANGE, 2 * BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid + alignment, 2 * alignment}, }; int ret; @@ -324,16 +326,16 @@ static int test_merge_right(struct btrfs_trans_handle *trans, } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + 2 * BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + 2 * alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; @@ -346,10 +348,11 @@ static int test_merge_right(struct btrfs_trans_handle *trans, static int test_merge_both(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid, 3 * BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid, 3 * alignment}, }; int ret; @@ -362,23 +365,23 @@ static int test_merge_both(struct btrfs_trans_handle *trans, } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid, BITMAP_RANGE); + cache->key.objectid, alignment); if (ret) { test_msg("Could not add free space\n"); return ret; } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + 2 * BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + 2 * alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; @@ -391,12 +394,13 @@ static int test_merge_both(struct btrfs_trans_handle *trans, static int test_merge_none(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, - struct btrfs_path *path) + struct btrfs_path *path, + u32 alignment) { - struct free_space_extent extents[] = { - {cache->key.objectid, BITMAP_RANGE}, - {cache->key.objectid + 2 * BITMAP_RANGE, BITMAP_RANGE}, - {cache->key.objectid + 4 * BITMAP_RANGE, BITMAP_RANGE}, + const struct free_space_extent extents[] = { + {cache->key.objectid, alignment}, + {cache->key.objectid + 2 * alignment, alignment}, + {cache->key.objectid + 4 * alignment, alignment}, }; int ret; @@ -409,23 +413,23 @@ static int test_merge_none(struct btrfs_trans_handle *trans, } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid, BITMAP_RANGE); + cache->key.objectid, alignment); if (ret) { test_msg("Could not add free space\n"); return ret; } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + 4 * BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + 4 * alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; } ret = __add_to_free_space_tree(trans, fs_info, cache, path, - cache->key.objectid + 2 * BITMAP_RANGE, - BITMAP_RANGE); + cache->key.objectid + 2 * alignment, + alignment); if (ret) { test_msg("Could not add free space\n"); return ret; @@ -438,10 +442,11 @@ static int test_merge_none(struct btrfs_trans_handle *trans, typedef int (*test_func_t)(struct btrfs_trans_handle *, struct btrfs_fs_info *, struct btrfs_block_group_cache *, - struct btrfs_path *); + struct btrfs_path *, + u32 alignment); -static int run_test(test_func_t test_func, int bitmaps, - u32 sectorsize, u32 nodesize) +static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, + u32 nodesize, u32 alignment) { struct btrfs_fs_info *fs_info; struct btrfs_root *root = NULL; @@ -480,7 +485,7 @@ static int run_test(test_func_t test_func, int bitmaps, btrfs_set_header_nritems(root->node, 0); root->alloc_bytenr += 2 * nodesize; - cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE, sectorsize); + cache = btrfs_alloc_dummy_block_group(8 * alignment, sectorsize); if (!cache) { test_msg("Couldn't allocate dummy block group cache\n"); ret = -ENOMEM; @@ -514,7 +519,7 @@ static int run_test(test_func_t test_func, int bitmaps, } } - ret = test_func(&trans, root->fs_info, cache, path); + ret = test_func(&trans, root->fs_info, cache, path, alignment); if (ret) goto out; @@ -539,15 +544,27 @@ out: return ret; } -static int run_test_both_formats(test_func_t test_func, - u32 sectorsize, u32 nodesize) +static int run_test_both_formats(test_func_t test_func, u32 sectorsize, + u32 nodesize, u32 alignment) { + int test_ret = 0; int ret; - ret = run_test(test_func, 0, sectorsize, nodesize); - if (ret) - return ret; - return run_test(test_func, 1, sectorsize, nodesize); + ret = run_test(test_func, 0, sectorsize, nodesize, alignment); + if (ret) { + test_msg("%pf failed with extents, sectorsize=%u, nodesize=%u, alignment=%u\n", + test_func, sectorsize, nodesize, alignment); + test_ret = ret; + } + + ret = run_test(test_func, 1, sectorsize, nodesize, alignment); + if (ret) { + test_msg("%pf failed with bitmaps, sectorsize=%u, nodesize=%u, alignment=%u\n", + test_func, sectorsize, nodesize, alignment); + test_ret = ret; + } + + return test_ret; } int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize) @@ -563,18 +580,30 @@ int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize) test_merge_both, test_merge_none, }; + u32 bitmap_alignment; + int test_ret = 0; int i; + /* + * Align some operations to a page to flush out bugs in the extent + * buffer bitmap handling of highmem. + */ + bitmap_alignment = BTRFS_FREE_SPACE_BITMAP_BITS * PAGE_SIZE; + test_msg("Running free space tree tests\n"); for (i = 0; i < ARRAY_SIZE(tests); i++) { - int ret = run_test_both_formats(tests[i], sectorsize, - nodesize); - if (ret) { - test_msg("%pf : sectorsize %u failed\n", - tests[i], sectorsize); - return ret; - } + int ret; + + ret = run_test_both_formats(tests[i], sectorsize, nodesize, + sectorsize); + if (ret) + test_ret = ret; + + ret = run_test_both_formats(tests[i], sectorsize, nodesize, + bitmap_alignment); + if (ret) + test_ret = ret; } - return 0; + return test_ret; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 7bf08825cc11..18630e800208 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1272,7 +1272,8 @@ again: statret = __ceph_do_getattr(inode, page, CEPH_STAT_CAP_INLINE_DATA, !!page); if (statret < 0) { - __free_page(page); + if (page) + __free_page(page); if (statret == -ENODATA) { BUG_ON(retry_op != READ_INLINE); goto again; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index bca1b49c1c4b..ef4d04647325 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1511,7 +1511,8 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); } - if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { + if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2 && + !(rinfo->hash_order && req->r_path2)) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a29ffce98187..b382e5910eea 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -845,6 +845,8 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) err = ceph_fs_debugfs_init(fsc); if (err < 0) goto fail; + } else { + root = dget(fsc->sb->s_root); } fsc->mount_state = CEPH_MOUNT_MOUNTED; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 40b703217977..febc28f9e2c2 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -16,7 +16,7 @@ static int __remove_xattr(struct ceph_inode_info *ci, struct ceph_inode_xattr *xattr); -const struct xattr_handler ceph_other_xattr_handler; +static const struct xattr_handler ceph_other_xattr_handler; /* * List of handlers for synthetic system.* attributes. Other @@ -1086,7 +1086,7 @@ static int ceph_set_xattr_handler(const struct xattr_handler *handler, return __ceph_setxattr(inode, name, value, size, flags); } -const struct xattr_handler ceph_other_xattr_handler = { +static const struct xattr_handler ceph_other_xattr_handler = { .prefix = "", /* match any name => handlers called with full name */ .get = ceph_get_xattr_handler, .set = ceph_set_xattr_handler, diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 6c58e13fed2f..3d03e48a9213 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -152,6 +152,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) list_for_each(tmp1, &cifs_tcp_ses_list) { server = list_entry(tmp1, struct TCP_Server_Info, tcp_ses_list); + seq_printf(m, "\nNumber of credits: %d", server->credits); i++; list_for_each(tmp2, &server->smb_ses_list) { ses = list_entry(tmp2, struct cifs_ses, diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 1418daa03d95..07ed81cf1552 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -49,6 +49,7 @@ #define CIFS_MOUNT_USE_PREFIX_PATH 0x1000000 /* make subpath with unaccessible * root mountable */ +#define CIFS_MOUNT_UID_FROM_ACL 0x2000000 /* try to get UID via special SID */ struct cifs_sb_info { struct rb_root tlink_tree; diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index 0065256881d8..57ff0756e30c 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -36,7 +36,15 @@ struct smb_mnt_fs_info { __u64 cifs_posix_caps; } __packed; +struct smb_snapshot_array { + __u32 number_of_snapshots; + __u32 number_of_snapshots_returned; + __u32 snapshot_array_size; + /* snapshots[]; */ +} __packed; + #define CIFS_IOCTL_MAGIC 0xCF #define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) #define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) #define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info) +#define CIFS_ENUMERATE_SNAPSHOTS _IOR(CIFS_IOCTL_MAGIC, 6, struct smb_snapshot_array) diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 71e8a56e9479..15bac390dff9 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -42,6 +42,35 @@ static const struct cifs_sid sid_authusers = { /* group users */ static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; +/* S-1-22-1 Unmapped Unix users */ +static const struct cifs_sid sid_unix_users = {1, 1, {0, 0, 0, 0, 0, 22}, + {cpu_to_le32(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + +/* S-1-22-2 Unmapped Unix groups */ +static const struct cifs_sid sid_unix_groups = { 1, 1, {0, 0, 0, 0, 0, 22}, + {cpu_to_le32(2), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + +/* + * See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx + */ + +/* S-1-5-88 MS NFS and Apple style UID/GID/mode */ + +/* S-1-5-88-1 Unix uid */ +static const struct cifs_sid sid_unix_NFS_users = { 1, 2, {0, 0, 0, 0, 0, 5}, + {cpu_to_le32(88), + cpu_to_le32(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + +/* S-1-5-88-2 Unix gid */ +static const struct cifs_sid sid_unix_NFS_groups = { 1, 2, {0, 0, 0, 0, 0, 5}, + {cpu_to_le32(88), + cpu_to_le32(2), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + +/* S-1-5-88-3 Unix mode */ +static const struct cifs_sid sid_unix_NFS_mode = { 1, 2, {0, 0, 0, 0, 0, 5}, + {cpu_to_le32(88), + cpu_to_le32(3), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; + static const struct cred *root_cred; static int @@ -183,6 +212,62 @@ compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) return 0; /* sids compare/match */ } +static bool +is_well_known_sid(const struct cifs_sid *psid, uint32_t *puid, bool is_group) +{ + int i; + int num_subauth; + const struct cifs_sid *pwell_known_sid; + + if (!psid || (puid == NULL)) + return false; + + num_subauth = psid->num_subauth; + + /* check if Mac (or Windows NFS) vs. Samba format for Unix owner SID */ + if (num_subauth == 2) { + if (is_group) + pwell_known_sid = &sid_unix_groups; + else + pwell_known_sid = &sid_unix_users; + } else if (num_subauth == 3) { + if (is_group) + pwell_known_sid = &sid_unix_NFS_groups; + else + pwell_known_sid = &sid_unix_NFS_users; + } else + return false; + + /* compare the revision */ + if (psid->revision != pwell_known_sid->revision) + return false; + + /* compare all of the six auth values */ + for (i = 0; i < NUM_AUTHS; ++i) { + if (psid->authority[i] != pwell_known_sid->authority[i]) { + cifs_dbg(FYI, "auth %d did not match\n", i); + return false; + } + } + + if (num_subauth == 2) { + if (psid->sub_auth[0] != pwell_known_sid->sub_auth[0]) + return false; + + *puid = le32_to_cpu(psid->sub_auth[1]); + } else /* 3 subauths, ie Windows/Mac style */ { + *puid = le32_to_cpu(psid->sub_auth[0]); + if ((psid->sub_auth[0] != pwell_known_sid->sub_auth[0]) || + (psid->sub_auth[1] != pwell_known_sid->sub_auth[1])) + return false; + + *puid = le32_to_cpu(psid->sub_auth[2]); + } + + cifs_dbg(FYI, "Unix UID %d returned from SID\n", *puid); + return true; /* well known sid found, uid returned */ +} + static void cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) { @@ -276,6 +361,43 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, return -EIO; } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) { + uint32_t unix_id; + bool is_group; + + if (sidtype != SIDOWNER) + is_group = true; + else + is_group = false; + + if (is_well_known_sid(psid, &unix_id, is_group) == false) + goto try_upcall_to_get_id; + + if (is_group) { + kgid_t gid; + gid_t id; + + id = (gid_t)unix_id; + gid = make_kgid(&init_user_ns, id); + if (gid_valid(gid)) { + fgid = gid; + goto got_valid_id; + } + } else { + kuid_t uid; + uid_t id; + + id = (uid_t)unix_id; + uid = make_kuid(&init_user_ns, id); + if (uid_valid(uid)) { + fuid = uid; + goto got_valid_id; + } + } + /* If unable to find uid/gid easily from SID try via upcall */ + } + +try_upcall_to_get_id: sidstr = sid_to_key_str(psid, sidtype); if (!sidstr) return -ENOMEM; @@ -329,6 +451,7 @@ out_revert_creds: * Note that we return 0 here unconditionally. If the mapping * fails then we just fall back to using the mnt_uid/mnt_gid. */ +got_valid_id: if (sidtype == SIDOWNER) fattr->cf_uid = fuid; else diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index cca04e710421..15261ba464c5 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -64,15 +64,15 @@ unsigned int global_secflags = CIFSSEC_DEF; unsigned int sign_CIFS_PDUs = 1; static const struct super_operations cifs_super_ops; unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; -module_param(CIFSMaxBufSize, uint, 0); +module_param(CIFSMaxBufSize, uint, 0444); MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). " "Default: 16384 Range: 8192 to 130048"); unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL; -module_param(cifs_min_rcv, uint, 0); +module_param(cifs_min_rcv, uint, 0444); MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: " "1 to 64"); unsigned int cifs_min_small = 30; -module_param(cifs_min_small, uint, 0); +module_param(cifs_min_small, uint, 0444); MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " "Range: 2 to 256"); unsigned int cifs_max_pending = CIFS_MAX_REQ; @@ -271,7 +271,7 @@ cifs_alloc_inode(struct super_block *sb) cifs_inode->createtime = 0; cifs_inode->epoch = 0; #ifdef CONFIG_CIFS_SMB2 - get_random_bytes(cifs_inode->lease_key, SMB2_LEASE_KEY_SIZE); + generate_random_uuid(cifs_inode->lease_key); #endif /* * Can not set i_flags here - they get immediately overwritten to zero @@ -469,6 +469,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",posixpaths"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) seq_puts(s, ",setuids"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) + seq_puts(s, ",idsfromsid"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) seq_puts(s, ",serverino"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) @@ -1262,7 +1264,6 @@ init_cifs(void) GlobalTotalActiveXid = 0; GlobalMaxActiveXid = 0; spin_lock_init(&cifs_tcp_ses_lock); - spin_lock_init(&cifs_file_list_lock); spin_lock_init(&GlobalMid_Lock); get_random_bytes(&cifs_lock_secret, sizeof(cifs_lock_secret)); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 8f1d8c1e72be..1f17f6bd7a60 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -75,6 +75,18 @@ #define SMB_ECHO_INTERVAL_MAX 600 #define SMB_ECHO_INTERVAL_DEFAULT 60 +/* + * Default number of credits to keep available for SMB3. + * This value is chosen somewhat arbitrarily. The Windows client + * defaults to 128 credits, the Windows server allows clients up to + * 512 credits (or 8K for later versions), and the NetApp server + * does not limit clients at all. Choose a high enough default value + * such that the client shouldn't limit performance, but allow mount + * to override (until you approach 64K, where we limit credits to 65000 + * to reduce possibility of seeing more server credit overflow bugs. + */ +#define SMB2_MAX_CREDITS_AVAILABLE 32000 + #include "cifspdu.h" #ifndef XATTR_DOS_ATTRIB @@ -376,6 +388,8 @@ struct smb_version_operations { int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *); int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon, struct cifsFileInfo *src_file); + int (*enum_snapshots)(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *src_file, void __user *); int (*query_mf_symlink)(unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const unsigned char *, char *, unsigned int *); @@ -464,6 +478,7 @@ struct smb_vol { bool retry:1; bool intr:1; bool setuids:1; + bool setuidfromacl:1; bool override_uid:1; bool override_gid:1; bool dynperm:1; @@ -510,6 +525,7 @@ struct smb_vol { struct sockaddr_storage srcaddr; /* allow binding to a local IP */ struct nls_table *local_nls; unsigned int echo_interval; /* echo interval in secs */ + unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */ }; #define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \ @@ -567,7 +583,8 @@ struct TCP_Server_Info { bool noblocksnd; /* use blocking sendmsg */ bool noautotune; /* do not autotune send buf sizes */ bool tcp_nodelay; - int credits; /* send no more requests at once */ + unsigned int credits; /* send no more requests at once */ + unsigned int max_credits; /* can override large 32000 default at mnt */ unsigned int in_flight; /* number of requests on the wire to server */ spinlock_t req_lock; /* protect the two values above */ struct mutex srv_mutex; @@ -833,6 +850,7 @@ struct cifs_tcon { struct list_head tcon_list; int tc_count; struct list_head openFileList; + spinlock_t open_file_lock; /* protects list above */ struct cifs_ses *ses; /* pointer to session associated with */ char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */ char *nativeFileSystem; @@ -889,7 +907,7 @@ struct cifs_tcon { #endif /* CONFIG_CIFS_STATS2 */ __u64 bytes_read; __u64 bytes_written; - spinlock_t stat_lock; + spinlock_t stat_lock; /* protects the two fields above */ #endif /* CONFIG_CIFS_STATS */ FILE_SYSTEM_DEVICE_INFO fsDevInfo; FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */ @@ -1040,20 +1058,24 @@ struct cifs_fid_locks { }; struct cifsFileInfo { + /* following two lists are protected by tcon->open_file_lock */ struct list_head tlist; /* pointer to next fid owned by tcon */ struct list_head flist; /* next fid (file instance) for this inode */ + /* lock list below protected by cifsi->lock_sem */ struct cifs_fid_locks *llist; /* brlocks held by this fid */ kuid_t uid; /* allows finding which FileInfo structure */ __u32 pid; /* process id who opened file */ struct cifs_fid fid; /* file id from remote */ + struct list_head rlist; /* reconnect list */ /* BB add lock scope info here if needed */ ; /* lock scope id (0 if none) */ struct dentry *dentry; - unsigned int f_flags; struct tcon_link *tlink; + unsigned int f_flags; bool invalidHandle:1; /* file closed via session abend */ bool oplock_break_cancelled:1; - int count; /* refcount protected by cifs_file_list_lock */ + int count; + spinlock_t file_info_lock; /* protects four flag/count fields above */ struct mutex fh_mutex; /* prevents reopen race after dead ses*/ struct cifs_search_info srch_inf; struct work_struct oplock_break; /* work for oplock breaks */ @@ -1120,7 +1142,7 @@ struct cifs_writedata { /* * Take a reference on the file private data. Must be called with - * cifs_file_list_lock held. + * cfile->file_info_lock held. */ static inline void cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file) @@ -1514,8 +1536,10 @@ require use of the stronger protocol */ * GlobalMid_Lock protects: * list operations on pending_mid_q and oplockQ * updates to XID counters, multiplex id and SMB sequence numbers - * cifs_file_list_lock protects: - * list operations on tcp and SMB session lists and tCon lists + * tcp_ses_lock protects: + * list operations on tcp and SMB session lists + * tcon->open_file_lock protects the list of open files hanging off the tcon + * cfile->file_info_lock protects counters and fields in cifs file struct * f_owner.lock protects certain per file struct operations * mapping->page_lock protects certain per page operations * @@ -1547,18 +1571,12 @@ GLOBAL_EXTERN struct list_head cifs_tcp_ses_list; * tcp session, and the list of tcon's per smb session. It also protects * the reference counters for the server, smb session, and tcon. Finally, * changes to the tcon->tidStatus should be done while holding this lock. + * generally the locks should be taken in order tcp_ses_lock before + * tcon->open_file_lock and that before file->file_info_lock since the + * structure order is cifs_socket-->cifs_ses-->cifs_tcon-->cifs_file */ GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock; -/* - * This lock protects the cifs_file->llist and cifs_file->flist - * list operations, and updates to some flags (cifs_file->invalidHandle) - * It will be moved to either use the tcon->stat_lock or equivalent later. - * If cifs_tcp_ses_lock and the lock below are both needed to be held, then - * the cifs_tcp_ses_lock must be grabbed first and released last. - */ -GLOBAL_EXTERN spinlock_t cifs_file_list_lock; - #ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ /* Outstanding dir notify requests */ GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 4ead72a001f9..ced0e42ce460 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -193,6 +193,8 @@ extern struct smb_vol *cifs_get_volume_info(char *mount_data, extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *); extern void cifs_umount(struct cifs_sb_info *); extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon); +extern void cifs_reopen_persistent_handles(struct cifs_tcon *tcon); + extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, __u8 type, struct cifsLockInfo **conf_lock, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index f82d2823622f..3f3185febc58 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -98,13 +98,13 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) struct list_head *tmp1; /* list all files open on tree connection and mark them invalid */ - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_for_each_safe(tmp, tmp1, &tcon->openFileList) { open_file = list_entry(tmp, struct cifsFileInfo, tlist); open_file->invalidHandle = true; open_file->oplock_break_cancelled = true; } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); /* * BB Add call to invalidate_inodes(sb) for all superblocks mounted * to this tcon. diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2e4f4bad8b1e..aab5227979e2 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -63,7 +63,6 @@ extern mempool_t *cifs_req_poolp; #define TLINK_IDLE_EXPIRE (600 * HZ) enum { - /* Mount options that take no arguments */ Opt_user_xattr, Opt_nouser_xattr, Opt_forceuid, Opt_noforceuid, @@ -76,7 +75,7 @@ enum { Opt_noposixpaths, Opt_nounix, Opt_nocase, Opt_brl, Opt_nobrl, - Opt_forcemandatorylock, Opt_setuids, + Opt_forcemandatorylock, Opt_setuidfromacl, Opt_setuids, Opt_nosetuids, Opt_dynperm, Opt_nodynperm, Opt_nohard, Opt_nosoft, Opt_nointr, Opt_intr, @@ -95,7 +94,7 @@ enum { Opt_cruid, Opt_gid, Opt_file_mode, Opt_dirmode, Opt_port, Opt_rsize, Opt_wsize, Opt_actimeo, - Opt_echo_interval, + Opt_echo_interval, Opt_max_credits, /* Mount options which take string value */ Opt_user, Opt_pass, Opt_ip, @@ -148,6 +147,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_forcemandatorylock, "forcemand" }, { Opt_setuids, "setuids" }, { Opt_nosetuids, "nosetuids" }, + { Opt_setuidfromacl, "idsfromsid" }, { Opt_dynperm, "dynperm" }, { Opt_nodynperm, "nodynperm" }, { Opt_nohard, "nohard" }, @@ -190,6 +190,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_wsize, "wsize=%s" }, { Opt_actimeo, "actimeo=%s" }, { Opt_echo_interval, "echo_interval=%s" }, + { Opt_max_credits, "max_credits=%s" }, { Opt_blank_user, "user=" }, { Opt_blank_user, "username=" }, @@ -1376,6 +1377,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_nosetuids: vol->setuids = 0; break; + case Opt_setuidfromacl: + vol->setuidfromacl = 1; + break; case Opt_dynperm: vol->dynperm = true; break; @@ -1586,6 +1590,15 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } vol->echo_interval = option; break; + case Opt_max_credits: + if (get_option_ul(args, &option) || (option < 20) || + (option > 60000)) { + cifs_dbg(VFS, "%s: Invalid max_credits value\n", + __func__); + goto cifs_parse_mount_err; + } + vol->max_credits = option; + break; /* String Arguments */ @@ -2163,7 +2176,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr, sizeof(tcp_ses->dstaddr)); #ifdef CONFIG_CIFS_SMB2 - get_random_bytes(tcp_ses->client_guid, SMB2_CLIENT_GUID_SIZE); + generate_random_uuid(tcp_ses->client_guid); #endif /* * at this point we are the only ones with the pointer @@ -3270,6 +3283,8 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; if (pvolume_info->setuids) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID; + if (pvolume_info->setuidfromacl) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UID_FROM_ACL; if (pvolume_info->server_ino) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM; if (pvolume_info->remap) @@ -3598,7 +3613,11 @@ try_mount_again: bdi_destroy(&cifs_sb->bdi); goto out; } - + if ((volume_info->max_credits < 20) || + (volume_info->max_credits > 60000)) + server->max_credits = SMB2_MAX_CREDITS_AVAILABLE; + else + server->max_credits = volume_info->max_credits; /* get a reference to a SMB session */ ses = cifs_get_smb_ses(server, volume_info); if (IS_ERR(ses)) { @@ -3688,14 +3707,16 @@ remote_path_check: goto mount_fail_check; } - rc = cifs_are_all_path_components_accessible(server, + if (rc != -EREMOTE) { + rc = cifs_are_all_path_components_accessible(server, xid, tcon, cifs_sb, full_path); - if (rc != 0) { - cifs_dbg(VFS, "cannot query dirs between root and final path, " - "enabling CIFS_MOUNT_USE_PREFIX_PATH\n"); - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; - rc = 0; + if (rc != 0) { + cifs_dbg(VFS, "cannot query dirs between root and final path, " + "enabling CIFS_MOUNT_USE_PREFIX_PATH\n"); + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH; + rc = 0; + } } kfree(full_path); } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index a95fe8b1afe9..7f5f6176c6f1 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -305,6 +305,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, cfile->tlink = cifs_get_tlink(tlink); INIT_WORK(&cfile->oplock_break, cifs_oplock_break); mutex_init(&cfile->fh_mutex); + spin_lock_init(&cfile->file_info_lock); cifs_sb_active(inode->i_sb); @@ -317,7 +318,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, oplock = 0; } - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock) oplock = fid->pending_open->oplock; list_del(&fid->pending_open->olist); @@ -326,12 +327,13 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, server->ops->set_fid(cfile, fid, oplock); list_add(&cfile->tlist, &tcon->openFileList); + /* if readable file instance put first in list*/ if (file->f_mode & FMODE_READ) list_add(&cfile->flist, &cinode->openFileList); else list_add_tail(&cfile->flist, &cinode->openFileList); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); if (fid->purge_cache) cifs_zap_mapping(inode); @@ -343,16 +345,16 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct cifsFileInfo * cifsFileInfo_get(struct cifsFileInfo *cifs_file) { - spin_lock(&cifs_file_list_lock); + spin_lock(&cifs_file->file_info_lock); cifsFileInfo_get_locked(cifs_file); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_file->file_info_lock); return cifs_file; } /* * Release a reference on the file private data. This may involve closing * the filehandle out on the server. Must be called without holding - * cifs_file_list_lock. + * tcon->open_file_lock and cifs_file->file_info_lock. */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { @@ -367,11 +369,15 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) struct cifs_pending_open open; bool oplock_break_cancelled; - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); + + spin_lock(&cifs_file->file_info_lock); if (--cifs_file->count > 0) { - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cifs_file->file_info_lock); + spin_unlock(&tcon->open_file_lock); return; } + spin_unlock(&cifs_file->file_info_lock); if (server->ops->get_lease_key) server->ops->get_lease_key(inode, &fid); @@ -395,7 +401,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) set_bit(CIFS_INO_INVALID_MAPPING, &cifsi->flags); cifs_set_oplock_level(cifsi, 0); } - spin_unlock(&cifs_file_list_lock); + + spin_unlock(&tcon->open_file_lock); oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break); @@ -732,6 +739,15 @@ reopen_success: * to the server to get the new inode info. */ + /* + * If the server returned a read oplock and we have mandatory brlocks, + * set oplock level to None. + */ + if (server->ops->is_read_op(oplock) && cifs_has_mand_locks(cinode)) { + cifs_dbg(FYI, "Reset oplock val from read to None due to mand locks\n"); + oplock = 0; + } + server->ops->set_fid(cfile, &cfile->fid, oplock); if (oparms.reconnect) cifs_relock_file(cfile); @@ -753,6 +769,36 @@ int cifs_close(struct inode *inode, struct file *file) return 0; } +void +cifs_reopen_persistent_handles(struct cifs_tcon *tcon) +{ + struct cifsFileInfo *open_file; + struct list_head *tmp; + struct list_head *tmp1; + struct list_head tmp_list; + + cifs_dbg(FYI, "Reopen persistent handles"); + INIT_LIST_HEAD(&tmp_list); + + /* list all files open on tree connection, reopen resilient handles */ + spin_lock(&tcon->open_file_lock); + list_for_each(tmp, &tcon->openFileList) { + open_file = list_entry(tmp, struct cifsFileInfo, tlist); + if (!open_file->invalidHandle) + continue; + cifsFileInfo_get(open_file); + list_add_tail(&open_file->rlist, &tmp_list); + } + spin_unlock(&tcon->open_file_lock); + + list_for_each_safe(tmp, tmp1, &tmp_list) { + open_file = list_entry(tmp, struct cifsFileInfo, rlist); + cifs_reopen_file(open_file, false /* do not flush */); + list_del_init(&open_file->rlist); + cifsFileInfo_put(open_file); + } +} + int cifs_closedir(struct inode *inode, struct file *file) { int rc = 0; @@ -772,10 +818,10 @@ int cifs_closedir(struct inode *inode, struct file *file) server = tcon->ses->server; cifs_dbg(FYI, "Freeing private data in close dir\n"); - spin_lock(&cifs_file_list_lock); + spin_lock(&cfile->file_info_lock); if (server->ops->dir_needs_close(cfile)) { cfile->invalidHandle = true; - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cfile->file_info_lock); if (server->ops->close_dir) rc = server->ops->close_dir(xid, tcon, &cfile->fid); else @@ -784,7 +830,7 @@ int cifs_closedir(struct inode *inode, struct file *file) /* not much we can do if it fails anyway, ignore rc */ rc = 0; } else - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cfile->file_info_lock); buf = cfile->srch_inf.ntwrk_buf_start; if (buf) { @@ -1728,12 +1774,13 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, { struct cifsFileInfo *open_file = NULL; struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); /* we could simply get the first_list_entry since write-only entries are always at the end of the list but since the first entry might have a close pending, we go through the whole list */ @@ -1744,8 +1791,8 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, if (!open_file->invalidHandle) { /* found a good file */ /* lock it so it will not be closed on us */ - cifsFileInfo_get_locked(open_file); - spin_unlock(&cifs_file_list_lock); + cifsFileInfo_get(open_file); + spin_unlock(&tcon->open_file_lock); return open_file; } /* else might as well continue, and look for another, or simply have the caller reopen it @@ -1753,7 +1800,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, } else /* write only file */ break; /* write only files are last so must be done */ } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); return NULL; } @@ -1762,6 +1809,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, { struct cifsFileInfo *open_file, *inv_file = NULL; struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; bool any_available = false; int rc; unsigned int refind = 0; @@ -1777,15 +1825,16 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, } cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + tcon = cifs_sb_master_tcon(cifs_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); refind_writable: if (refind > MAX_REOPEN_ATT) { - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); return NULL; } list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { @@ -1796,8 +1845,8 @@ refind_writable: if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { if (!open_file->invalidHandle) { /* found a good writable file */ - cifsFileInfo_get_locked(open_file); - spin_unlock(&cifs_file_list_lock); + cifsFileInfo_get(open_file); + spin_unlock(&tcon->open_file_lock); return open_file; } else { if (!inv_file) @@ -1813,24 +1862,24 @@ refind_writable: if (inv_file) { any_available = false; - cifsFileInfo_get_locked(inv_file); + cifsFileInfo_get(inv_file); } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); if (inv_file) { rc = cifs_reopen_file(inv_file, false); if (!rc) return inv_file; else { - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_move_tail(&inv_file->flist, &cifs_inode->openFileList); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); cifsFileInfo_put(inv_file); - spin_lock(&cifs_file_list_lock); ++refind; inv_file = NULL; + spin_lock(&tcon->open_file_lock); goto refind_writable; } } @@ -3612,15 +3661,17 @@ static int cifs_readpage(struct file *file, struct page *page) static int is_inode_writable(struct cifsInodeInfo *cifs_inode) { struct cifsFileInfo *open_file; + struct cifs_tcon *tcon = + cifs_sb_master_tcon(CIFS_SB(cifs_inode->vfs_inode.i_sb)); - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); return 1; } } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); return 0; } diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 7a3b84e300f8..9f51b81119f2 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -189,7 +189,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) xid = get_xid(); cifs_sb = CIFS_SB(inode->i_sb); - + cifs_dbg(VFS, "cifs ioctl 0x%x\n", command); switch (command) { case FS_IOC_GETFLAGS: if (pSMBFile == NULL) @@ -267,11 +267,23 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) tcon = tlink_tcon(pSMBFile->tlink); rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg); break; + case CIFS_ENUMERATE_SNAPSHOTS: + if (arg == 0) { + rc = -EINVAL; + goto cifs_ioc_exit; + } + tcon = tlink_tcon(pSMBFile->tlink); + if (tcon->ses->server->ops->enum_snapshots) + rc = tcon->ses->server->ops->enum_snapshots(xid, tcon, + pSMBFile, (void __user *)arg); + else + rc = -EOPNOTSUPP; + break; default: cifs_dbg(FYI, "unsupported ioctl\n"); break; } - +cifs_ioc_exit: free_xid(xid); return rc; } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 813fe13c2ae1..c6729156f9a0 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -120,6 +120,7 @@ tconInfoAlloc(void) ++ret_buf->tc_count; INIT_LIST_HEAD(&ret_buf->openFileList); INIT_LIST_HEAD(&ret_buf->tcon_list); + spin_lock_init(&ret_buf->open_file_lock); #ifdef CONFIG_CIFS_STATS spin_lock_init(&ret_buf->stat_lock); #endif @@ -465,7 +466,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) continue; cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks); - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_for_each(tmp2, &tcon->openFileList) { netfile = list_entry(tmp2, struct cifsFileInfo, tlist); @@ -495,11 +496,11 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) &netfile->oplock_break); netfile->oplock_break_cancelled = false; - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); return true; } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "No matching file for oplock break\n"); return true; @@ -613,9 +614,9 @@ backup_cred(struct cifs_sb_info *cifs_sb) void cifs_del_pending_open(struct cifs_pending_open *open) { - spin_lock(&cifs_file_list_lock); + spin_lock(&tlink_tcon(open->tlink)->open_file_lock); list_del(&open->olist); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tlink_tcon(open->tlink)->open_file_lock); } void @@ -635,7 +636,7 @@ void cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink, struct cifs_pending_open *open) { - spin_lock(&cifs_file_list_lock); + spin_lock(&tlink_tcon(tlink)->open_file_lock); cifs_add_pending_open_locked(fid, tlink, open); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tlink_tcon(open->tlink)->open_file_lock); } diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 65cf85dcda09..8f6a2a5863b9 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -597,14 +597,14 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, is_dir_changed(file)) || (index_to_find < first_entry_in_buffer)) { /* close and restart search */ cifs_dbg(FYI, "search backing up - close and restart search\n"); - spin_lock(&cifs_file_list_lock); + spin_lock(&cfile->file_info_lock); if (server->ops->dir_needs_close(cfile)) { cfile->invalidHandle = true; - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cfile->file_info_lock); if (server->ops->close_dir) server->ops->close_dir(xid, tcon, &cfile->fid); } else - spin_unlock(&cifs_file_list_lock); + spin_unlock(&cfile->file_info_lock); if (cfile->srch_inf.ntwrk_buf_start) { cifs_dbg(FYI, "freeing SMB ff cache buf on search rewind\n"); if (cfile->srch_inf.smallBuf) diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 4f0231e685a9..1238cd3552f9 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -266,9 +266,15 @@ smb2_set_file_info(struct inode *inode, const char *full_path, struct tcon_link *tlink; int rc; + if ((buf->CreationTime == 0) && (buf->LastAccessTime == 0) && + (buf->LastWriteTime == 0) && (buf->ChangeTime) && + (buf->Attributes == 0)) + return 0; /* would be a no op, no sense sending this */ + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); + rc = smb2_open_op_close(xid, tlink_tcon(tlink), cifs_sb, full_path, FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf, SMB2_OP_SET_INFO); diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 389fb9f8c84e..3d383489b9cf 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -549,19 +549,19 @@ smb2_is_valid_lease_break(char *buffer) list_for_each(tmp1, &server->smb_ses_list) { ses = list_entry(tmp1, struct cifs_ses, smb_ses_list); - spin_lock(&cifs_file_list_lock); list_for_each(tmp2, &ses->tcon_list) { tcon = list_entry(tmp2, struct cifs_tcon, tcon_list); + spin_lock(&tcon->open_file_lock); cifs_stats_inc( &tcon->stats.cifs_stats.num_oplock_brks); if (smb2_tcon_has_lease(tcon, rsp, lw)) { - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); return true; } + spin_unlock(&tcon->open_file_lock); } - spin_unlock(&cifs_file_list_lock); } } spin_unlock(&cifs_tcp_ses_lock); @@ -603,7 +603,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks); - spin_lock(&cifs_file_list_lock); + spin_lock(&tcon->open_file_lock); list_for_each(tmp2, &tcon->openFileList) { cfile = list_entry(tmp2, struct cifsFileInfo, tlist); @@ -615,7 +615,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) cifs_dbg(FYI, "file id match, oplock break\n"); cinode = CIFS_I(d_inode(cfile->dentry)); - + spin_lock(&cfile->file_info_lock); if (!CIFS_CACHE_WRITE(cinode) && rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE) cfile->oplock_break_cancelled = true; @@ -637,14 +637,14 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) clear_bit( CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags); - + spin_unlock(&cfile->file_info_lock); queue_work(cifsiod_wq, &cfile->oplock_break); - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); return true; } - spin_unlock(&cifs_file_list_lock); + spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); cifs_dbg(FYI, "No matching file for oplock break\n"); return true; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index d203c0329626..5d456ebb3813 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -28,6 +28,7 @@ #include "cifs_unicode.h" #include "smb2status.h" #include "smb2glob.h" +#include "cifs_ioctl.h" static int change_conf(struct TCP_Server_Info *server) @@ -70,6 +71,10 @@ smb2_add_credits(struct TCP_Server_Info *server, const unsigned int add, spin_lock(&server->req_lock); val = server->ops->get_credits_field(server, optype); *val += add; + if (*val > 65000) { + *val = 65000; /* Don't get near 64K credits, avoid srv bugs */ + printk_once(KERN_WARNING "server overflowed SMB3 credits\n"); + } server->in_flight--; if (server->in_flight == 0 && (optype & CIFS_OP_MASK) != CIFS_NEG_OP) rc = change_conf(server); @@ -287,7 +292,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) cifs_dbg(FYI, "Link Speed %lld\n", le64_to_cpu(out_buf->LinkSpeed)); } - + kfree(out_buf); return rc; } #endif /* STATS2 */ @@ -541,6 +546,7 @@ smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) server->ops->set_oplock_level(cinode, oplock, fid->epoch, &fid->purge_cache); cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode); + memcpy(cfile->fid.create_guid, fid->create_guid, 16); } static void @@ -699,6 +705,7 @@ smb2_clone_range(const unsigned int xid, cchunk_out: kfree(pcchunk); + kfree(retbuf); return rc; } @@ -823,7 +830,6 @@ smb2_duplicate_extents(const unsigned int xid, { int rc; unsigned int ret_data_len; - char *retbuf = NULL; struct duplicate_extents_to_file dup_ext_buf; struct cifs_tcon *tcon = tlink_tcon(trgtfile->tlink); @@ -849,7 +855,7 @@ smb2_duplicate_extents(const unsigned int xid, FSCTL_DUPLICATE_EXTENTS_TO_FILE, true /* is_fsctl */, (char *)&dup_ext_buf, sizeof(struct duplicate_extents_to_file), - (char **)&retbuf, + NULL, &ret_data_len); if (ret_data_len > 0) @@ -872,7 +878,6 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *cfile) { struct fsctl_set_integrity_information_req integr_info; - char *retbuf = NULL; unsigned int ret_data_len; integr_info.ChecksumAlgorithm = cpu_to_le16(CHECKSUM_TYPE_UNCHANGED); @@ -884,9 +889,53 @@ smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, FSCTL_SET_INTEGRITY_INFORMATION, true /* is_fsctl */, (char *)&integr_info, sizeof(struct fsctl_set_integrity_information_req), + NULL, + &ret_data_len); + +} + +static int +smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, void __user *ioc_buf) +{ + char *retbuf = NULL; + unsigned int ret_data_len = 0; + int rc; + struct smb_snapshot_array snapshot_in; + + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + FSCTL_SRV_ENUMERATE_SNAPSHOTS, + true /* is_fsctl */, NULL, 0 /* no input data */, (char **)&retbuf, &ret_data_len); + cifs_dbg(FYI, "enum snaphots ioctl returned %d and ret buflen is %d\n", + rc, ret_data_len); + if (rc) + return rc; + if (ret_data_len && (ioc_buf != NULL) && (retbuf != NULL)) { + /* Fixup buffer */ + if (copy_from_user(&snapshot_in, ioc_buf, + sizeof(struct smb_snapshot_array))) { + rc = -EFAULT; + kfree(retbuf); + return rc; + } + if (snapshot_in.snapshot_array_size < sizeof(struct smb_snapshot_array)) { + rc = -ERANGE; + return rc; + } + + if (ret_data_len > snapshot_in.snapshot_array_size) + ret_data_len = snapshot_in.snapshot_array_size; + + if (copy_to_user(ioc_buf, retbuf, ret_data_len)) + rc = -EFAULT; + } + + kfree(retbuf); + return rc; } static int @@ -1041,7 +1090,7 @@ smb2_set_lease_key(struct inode *inode, struct cifs_fid *fid) static void smb2_new_lease_key(struct cifs_fid *fid) { - get_random_bytes(fid->lease_key, SMB2_LEASE_KEY_SIZE); + generate_random_uuid(fid->lease_key); } #define SMB2_SYMLINK_STRUCT_SIZE \ @@ -1654,6 +1703,7 @@ struct smb_version_operations smb21_operations = { .clone_range = smb2_clone_range, .wp_retry_size = smb2_wp_retry_size, .dir_needs_close = smb2_dir_needs_close, + .enum_snapshots = smb3_enum_snapshots, }; struct smb_version_operations smb30_operations = { @@ -1740,6 +1790,7 @@ struct smb_version_operations smb30_operations = { .wp_retry_size = smb2_wp_retry_size, .dir_needs_close = smb2_dir_needs_close, .fallocate = smb3_fallocate, + .enum_snapshots = smb3_enum_snapshots, }; #ifdef CONFIG_CIFS_SMB311 @@ -1827,6 +1878,7 @@ struct smb_version_operations smb311_operations = { .wp_retry_size = smb2_wp_retry_size, .dir_needs_close = smb2_dir_needs_close, .fallocate = smb3_fallocate, + .enum_snapshots = smb3_enum_snapshots, }; #endif /* CIFS_SMB311 */ diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 29e06db5f187..5ca5ea4668a1 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -100,7 +100,21 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , hdr->ProtocolId = SMB2_PROTO_NUMBER; hdr->StructureSize = cpu_to_le16(64); hdr->Command = smb2_cmd; - hdr->CreditRequest = cpu_to_le16(2); /* BB make this dynamic */ + if (tcon && tcon->ses && tcon->ses->server) { + struct TCP_Server_Info *server = tcon->ses->server; + + spin_lock(&server->req_lock); + /* Request up to 2 credits but don't go over the limit. */ + if (server->credits >= server->max_credits) + hdr->CreditRequest = cpu_to_le16(0); + else + hdr->CreditRequest = cpu_to_le16( + min_t(int, server->max_credits - + server->credits, 2)); + spin_unlock(&server->req_lock); + } else { + hdr->CreditRequest = cpu_to_le16(2); + } hdr->ProcessId = cpu_to_le32((__u16)current->tgid); if (!tcon) @@ -236,8 +250,13 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) } cifs_mark_open_files_invalid(tcon); + rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nls_codepage); mutex_unlock(&tcon->ses->session_mutex); + + if (tcon->use_persistent) + cifs_reopen_persistent_handles(tcon); + cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); if (rc) goto out; @@ -574,59 +593,42 @@ vneg_out: return -EIO; } -int -SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_cp) +struct SMB2_sess_data { + unsigned int xid; + struct cifs_ses *ses; + struct nls_table *nls_cp; + void (*func)(struct SMB2_sess_data *); + int result; + u64 previous_session; + + /* we will send the SMB in three pieces: + * a fixed length beginning part, an optional + * SPNEGO blob (which can be zero length), and a + * last part which will include the strings + * and rest of bcc area. This allows us to avoid + * a large buffer 17K allocation + */ + int buf0_type; + struct kvec iov[2]; +}; + +static int +SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) { + int rc; + struct cifs_ses *ses = sess_data->ses; struct smb2_sess_setup_req *req; - struct smb2_sess_setup_rsp *rsp = NULL; - struct kvec iov[2]; - int rc = 0; - int resp_buftype = CIFS_NO_BUFFER; - __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ struct TCP_Server_Info *server = ses->server; - u16 blob_length = 0; - struct key *spnego_key = NULL; - char *security_blob = NULL; - unsigned char *ntlmssp_blob = NULL; - bool use_spnego = false; /* else use raw ntlmssp */ - - cifs_dbg(FYI, "Session Setup\n"); - - if (!server) { - WARN(1, "%s: server is NULL!\n", __func__); - return -EIO; - } - - /* - * If we are here due to reconnect, free per-smb session key - * in case signing was required. - */ - kfree(ses->auth_key.response); - ses->auth_key.response = NULL; - - /* - * If memory allocation is successful, caller of this function - * frees it. - */ - ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); - if (!ses->ntlmssp) - return -ENOMEM; - ses->ntlmssp->sesskey_per_smbsess = true; - - /* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */ - if (ses->sectype != Kerberos && ses->sectype != RawNTLMSSP) - ses->sectype = RawNTLMSSP; - -ssetup_ntlmssp_authenticate: - if (phase == NtLmChallenge) - phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ rc = small_smb2_init(SMB2_SESSION_SETUP, NULL, (void **) &req); if (rc) return rc; req->hdr.SessionId = 0; /* First session, not a reauthenticate */ + + /* if reconnect, we need to send previous sess id, otherwise it is 0 */ + req->PreviousSessionId = sess_data->previous_session; + req->Flags = 0; /* MBZ */ /* to enable echos and oplocks */ req->hdr.CreditRequest = cpu_to_le16(3); @@ -642,199 +644,368 @@ ssetup_ntlmssp_authenticate: req->Capabilities = 0; req->Channel = 0; /* MBZ */ - iov[0].iov_base = (char *)req; + sess_data->iov[0].iov_base = (char *)req; /* 4 for rfc1002 length field and 1 for pad */ - iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + sess_data->iov[0].iov_len = get_rfc1002_length(req) + 4 - 1; + /* + * This variable will be used to clear the buffer + * allocated above in case of any error in the calling function. + */ + sess_data->buf0_type = CIFS_SMALL_BUFFER; - if (ses->sectype == Kerberos) { -#ifdef CONFIG_CIFS_UPCALL - struct cifs_spnego_msg *msg; + return 0; +} - spnego_key = cifs_get_spnego_key(ses); - if (IS_ERR(spnego_key)) { - rc = PTR_ERR(spnego_key); - spnego_key = NULL; - goto ssetup_exit; - } +static void +SMB2_sess_free_buffer(struct SMB2_sess_data *sess_data) +{ + free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base); + sess_data->buf0_type = CIFS_NO_BUFFER; +} - msg = spnego_key->payload.data[0]; - /* - * check version field to make sure that cifs.upcall is - * sending us a response in an expected form - */ - if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { - cifs_dbg(VFS, - "bad cifs.upcall version. Expected %d got %d", - CIFS_SPNEGO_UPCALL_VERSION, msg->version); - rc = -EKEYREJECTED; - goto ssetup_exit; - } - ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, - GFP_KERNEL); - if (!ses->auth_key.response) { - cifs_dbg(VFS, - "Kerberos can't allocate (%u bytes) memory", - msg->sesskey_len); - rc = -ENOMEM; - goto ssetup_exit; - } - ses->auth_key.len = msg->sesskey_len; - blob_length = msg->secblob_len; - iov[1].iov_base = msg->data + msg->sesskey_len; - iov[1].iov_len = blob_length; -#else - rc = -EOPNOTSUPP; - goto ssetup_exit; -#endif /* CONFIG_CIFS_UPCALL */ - } else if (phase == NtLmNegotiate) { /* if not krb5 must be ntlmssp */ - ntlmssp_blob = kmalloc(sizeof(struct _NEGOTIATE_MESSAGE), - GFP_KERNEL); - if (ntlmssp_blob == NULL) { - rc = -ENOMEM; - goto ssetup_exit; - } - build_ntlmssp_negotiate_blob(ntlmssp_blob, ses); - if (use_spnego) { - /* blob_length = build_spnego_ntlmssp_blob( - &security_blob, - sizeof(struct _NEGOTIATE_MESSAGE), - ntlmssp_blob); */ - /* BB eventually need to add this */ - cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); - rc = -EOPNOTSUPP; - kfree(ntlmssp_blob); - goto ssetup_exit; - } else { - blob_length = sizeof(struct _NEGOTIATE_MESSAGE); - /* with raw NTLMSSP we don't encapsulate in SPNEGO */ - security_blob = ntlmssp_blob; - } - iov[1].iov_base = security_blob; - iov[1].iov_len = blob_length; - } else if (phase == NtLmAuthenticate) { - req->hdr.SessionId = ses->Suid; - rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses, - nls_cp); - if (rc) { - cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n", - rc); - goto ssetup_exit; /* BB double check error handling */ - } - if (use_spnego) { - /* blob_length = build_spnego_ntlmssp_blob( - &security_blob, - blob_length, - ntlmssp_blob); */ - cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); - rc = -EOPNOTSUPP; - kfree(ntlmssp_blob); - goto ssetup_exit; - } else { - security_blob = ntlmssp_blob; - } - iov[1].iov_base = security_blob; - iov[1].iov_len = blob_length; - } else { - cifs_dbg(VFS, "illegal ntlmssp phase\n"); - rc = -EIO; - goto ssetup_exit; - } +static int +SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) +{ + int rc; + struct smb2_sess_setup_req *req = sess_data->iov[0].iov_base; /* Testing shows that buffer offset must be at location of Buffer[0] */ req->SecurityBufferOffset = - cpu_to_le16(sizeof(struct smb2_sess_setup_req) - - 1 /* pad */ - 4 /* rfc1001 len */); - req->SecurityBufferLength = cpu_to_le16(blob_length); + cpu_to_le16(sizeof(struct smb2_sess_setup_req) - + 1 /* pad */ - 4 /* rfc1001 len */); + req->SecurityBufferLength = cpu_to_le16(sess_data->iov[1].iov_len); - inc_rfc1001_len(req, blob_length - 1 /* pad */); + inc_rfc1001_len(req, sess_data->iov[1].iov_len - 1 /* pad */); /* BB add code to build os and lm fields */ - rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, - CIFS_LOG_ERROR | CIFS_NEG_OP); + rc = SendReceive2(sess_data->xid, sess_data->ses, + sess_data->iov, 2, + &sess_data->buf0_type, + CIFS_LOG_ERROR | CIFS_NEG_OP); - kfree(security_blob); - rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base; - ses->Suid = rsp->hdr.SessionId; - if (resp_buftype != CIFS_NO_BUFFER && - rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) { - if (phase != NtLmNegotiate) { - cifs_dbg(VFS, "Unexpected more processing error\n"); - goto ssetup_exit; - } - if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 != - le16_to_cpu(rsp->SecurityBufferOffset)) { - cifs_dbg(VFS, "Invalid security buffer offset %d\n", - le16_to_cpu(rsp->SecurityBufferOffset)); - rc = -EIO; - goto ssetup_exit; + return rc; +} + +static int +SMB2_sess_establish_session(struct SMB2_sess_data *sess_data) +{ + int rc = 0; + struct cifs_ses *ses = sess_data->ses; + + mutex_lock(&ses->server->srv_mutex); + if (ses->server->sign && ses->server->ops->generate_signingkey) { + rc = ses->server->ops->generate_signingkey(ses); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + if (rc) { + cifs_dbg(FYI, + "SMB3 session key generation failed\n"); + mutex_unlock(&ses->server->srv_mutex); + goto keygen_exit; } + } + if (!ses->server->session_estab) { + ses->server->sequence_number = 0x2; + ses->server->session_estab = true; + } + mutex_unlock(&ses->server->srv_mutex); + + cifs_dbg(FYI, "SMB2/3 session established successfully\n"); + spin_lock(&GlobalMid_Lock); + ses->status = CifsGood; + ses->need_reconnect = false; + spin_unlock(&GlobalMid_Lock); - /* NTLMSSP Negotiate sent now processing challenge (response) */ - phase = NtLmChallenge; /* process ntlmssp challenge */ - rc = 0; /* MORE_PROCESSING is not an error here but expected */ - rc = decode_ntlmssp_challenge(rsp->Buffer, - le16_to_cpu(rsp->SecurityBufferLength), ses); +keygen_exit: + if (!ses->server->sign) { + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + } + return rc; +} + +#ifdef CONFIG_CIFS_UPCALL +static void +SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) +{ + int rc; + struct cifs_ses *ses = sess_data->ses; + struct cifs_spnego_msg *msg; + struct key *spnego_key = NULL; + struct smb2_sess_setup_rsp *rsp = NULL; + + rc = SMB2_sess_alloc_buffer(sess_data); + if (rc) + goto out; + + spnego_key = cifs_get_spnego_key(ses); + if (IS_ERR(spnego_key)) { + rc = PTR_ERR(spnego_key); + spnego_key = NULL; + goto out; } + msg = spnego_key->payload.data[0]; /* - * BB eventually add code for SPNEGO decoding of NtlmChallenge blob, - * but at least the raw NTLMSSP case works. + * check version field to make sure that cifs.upcall is + * sending us a response in an expected form */ + if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { + cifs_dbg(VFS, + "bad cifs.upcall version. Expected %d got %d", + CIFS_SPNEGO_UPCALL_VERSION, msg->version); + rc = -EKEYREJECTED; + goto out_put_spnego_key; + } + + ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, + GFP_KERNEL); + if (!ses->auth_key.response) { + cifs_dbg(VFS, + "Kerberos can't allocate (%u bytes) memory", + msg->sesskey_len); + rc = -ENOMEM; + goto out_put_spnego_key; + } + ses->auth_key.len = msg->sesskey_len; + + sess_data->iov[1].iov_base = msg->data + msg->sesskey_len; + sess_data->iov[1].iov_len = msg->secblob_len; + + rc = SMB2_sess_sendreceive(sess_data); + if (rc) + goto out_put_spnego_key; + + rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; + ses->Suid = rsp->hdr.SessionId; + + ses->session_flags = le16_to_cpu(rsp->SessionFlags); + if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) + cifs_dbg(VFS, "SMB3 encryption not supported yet\n"); + + rc = SMB2_sess_establish_session(sess_data); +out_put_spnego_key: + key_invalidate(spnego_key); + key_put(spnego_key); +out: + sess_data->result = rc; + sess_data->func = NULL; + SMB2_sess_free_buffer(sess_data); +} +#else +static void +SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) +{ + cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); + sess_data->result = -EOPNOTSUPP; + sess_data->func = NULL; +} +#endif + +static void +SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data); + +static void +SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) +{ + int rc; + struct cifs_ses *ses = sess_data->ses; + struct smb2_sess_setup_rsp *rsp = NULL; + char *ntlmssp_blob = NULL; + bool use_spnego = false; /* else use raw ntlmssp */ + u16 blob_length = 0; + /* - * No tcon so can't do - * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]); + * If memory allocation is successful, caller of this function + * frees it. */ - if (rc != 0) - goto ssetup_exit; + ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); + if (!ses->ntlmssp) { + rc = -ENOMEM; + goto out_err; + } + ses->ntlmssp->sesskey_per_smbsess = true; + + rc = SMB2_sess_alloc_buffer(sess_data); + if (rc) + goto out_err; + + ntlmssp_blob = kmalloc(sizeof(struct _NEGOTIATE_MESSAGE), + GFP_KERNEL); + if (ntlmssp_blob == NULL) { + rc = -ENOMEM; + goto out; + } + + build_ntlmssp_negotiate_blob(ntlmssp_blob, ses); + if (use_spnego) { + /* BB eventually need to add this */ + cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); + rc = -EOPNOTSUPP; + goto out; + } else { + blob_length = sizeof(struct _NEGOTIATE_MESSAGE); + /* with raw NTLMSSP we don't encapsulate in SPNEGO */ + } + sess_data->iov[1].iov_base = ntlmssp_blob; + sess_data->iov[1].iov_len = blob_length; + + rc = SMB2_sess_sendreceive(sess_data); + rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; + + /* If true, rc here is expected and not an error */ + if (sess_data->buf0_type != CIFS_NO_BUFFER && + rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) + rc = 0; + + if (rc) + goto out; + + if (offsetof(struct smb2_sess_setup_rsp, Buffer) - 4 != + le16_to_cpu(rsp->SecurityBufferOffset)) { + cifs_dbg(VFS, "Invalid security buffer offset %d\n", + le16_to_cpu(rsp->SecurityBufferOffset)); + rc = -EIO; + goto out; + } + rc = decode_ntlmssp_challenge(rsp->Buffer, + le16_to_cpu(rsp->SecurityBufferLength), ses); + if (rc) + goto out; + + cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n"); + + ses->Suid = rsp->hdr.SessionId; ses->session_flags = le16_to_cpu(rsp->SessionFlags); if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) cifs_dbg(VFS, "SMB3 encryption not supported yet\n"); -ssetup_exit: - free_rsp_buf(resp_buftype, rsp); - - /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */ - if ((phase == NtLmChallenge) && (rc == 0)) - goto ssetup_ntlmssp_authenticate; +out: + kfree(ntlmssp_blob); + SMB2_sess_free_buffer(sess_data); if (!rc) { - mutex_lock(&server->srv_mutex); - if (server->sign && server->ops->generate_signingkey) { - rc = server->ops->generate_signingkey(ses); - kfree(ses->auth_key.response); - ses->auth_key.response = NULL; - if (rc) { - cifs_dbg(FYI, - "SMB3 session key generation failed\n"); - mutex_unlock(&server->srv_mutex); - goto keygen_exit; - } - } - if (!server->session_estab) { - server->sequence_number = 0x2; - server->session_estab = true; - } - mutex_unlock(&server->srv_mutex); - - cifs_dbg(FYI, "SMB2/3 session established successfully\n"); - spin_lock(&GlobalMid_Lock); - ses->status = CifsGood; - ses->need_reconnect = false; - spin_unlock(&GlobalMid_Lock); + sess_data->result = 0; + sess_data->func = SMB2_sess_auth_rawntlmssp_authenticate; + return; } +out_err: + kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + sess_data->result = rc; + sess_data->func = NULL; +} -keygen_exit: - if (!server->sign) { - kfree(ses->auth_key.response); - ses->auth_key.response = NULL; +static void +SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) +{ + int rc; + struct cifs_ses *ses = sess_data->ses; + struct smb2_sess_setup_req *req; + struct smb2_sess_setup_rsp *rsp = NULL; + unsigned char *ntlmssp_blob = NULL; + bool use_spnego = false; /* else use raw ntlmssp */ + u16 blob_length = 0; + + rc = SMB2_sess_alloc_buffer(sess_data); + if (rc) + goto out; + + req = (struct smb2_sess_setup_req *) sess_data->iov[0].iov_base; + req->hdr.SessionId = ses->Suid; + + rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses, + sess_data->nls_cp); + if (rc) { + cifs_dbg(FYI, "build_ntlmssp_auth_blob failed %d\n", rc); + goto out; } - if (spnego_key) { - key_invalidate(spnego_key); - key_put(spnego_key); + + if (use_spnego) { + /* BB eventually need to add this */ + cifs_dbg(VFS, "spnego not supported for SMB2 yet\n"); + rc = -EOPNOTSUPP; + goto out; } + sess_data->iov[1].iov_base = ntlmssp_blob; + sess_data->iov[1].iov_len = blob_length; + + rc = SMB2_sess_sendreceive(sess_data); + if (rc) + goto out; + + rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; + + ses->Suid = rsp->hdr.SessionId; + ses->session_flags = le16_to_cpu(rsp->SessionFlags); + if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) + cifs_dbg(VFS, "SMB3 encryption not supported yet\n"); + + rc = SMB2_sess_establish_session(sess_data); +out: + kfree(ntlmssp_blob); + SMB2_sess_free_buffer(sess_data); kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + sess_data->result = rc; + sess_data->func = NULL; +} +static int +SMB2_select_sec(struct cifs_ses *ses, struct SMB2_sess_data *sess_data) +{ + if (ses->sectype != Kerberos && ses->sectype != RawNTLMSSP) + ses->sectype = RawNTLMSSP; + + switch (ses->sectype) { + case Kerberos: + sess_data->func = SMB2_auth_kerberos; + break; + case RawNTLMSSP: + sess_data->func = SMB2_sess_auth_rawntlmssp_negotiate; + break; + default: + cifs_dbg(VFS, "secType %d not supported!\n", ses->sectype); + return -EOPNOTSUPP; + } + + return 0; +} + +int +SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + int rc = 0; + struct TCP_Server_Info *server = ses->server; + struct SMB2_sess_data *sess_data; + + cifs_dbg(FYI, "Session Setup\n"); + + if (!server) { + WARN(1, "%s: server is NULL!\n", __func__); + return -EIO; + } + + sess_data = kzalloc(sizeof(struct SMB2_sess_data), GFP_KERNEL); + if (!sess_data) + return -ENOMEM; + + rc = SMB2_select_sec(ses, sess_data); + if (rc) + goto out; + sess_data->xid = xid; + sess_data->ses = ses; + sess_data->buf0_type = CIFS_NO_BUFFER; + sess_data->nls_cp = (struct nls_table *) nls_cp; + + while (sess_data->func) + sess_data->func(sess_data); + + rc = sess_data->result; +out: + kfree(sess_data); return rc; } @@ -1164,7 +1335,7 @@ create_durable_v2_buf(struct cifs_fid *pfid) buf->dcontext.Timeout = 0; /* Should this be configurable by workload */ buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT); - get_random_bytes(buf->dcontext.CreateGuid, 16); + generate_random_uuid(buf->dcontext.CreateGuid); memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16); /* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DH2Q" */ @@ -2057,6 +2228,7 @@ smb2_async_readv(struct cifs_readdata *rdata) if (rdata->credits) { buf->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, SMB2_MAX_BUFFER_SIZE)); + buf->CreditRequest = buf->CreditCharge; spin_lock(&server->req_lock); server->credits += rdata->credits - le16_to_cpu(buf->CreditCharge); @@ -2243,6 +2415,7 @@ smb2_async_writev(struct cifs_writedata *wdata, if (wdata->credits) { req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes, SMB2_MAX_BUFFER_SIZE)); + req->hdr.CreditRequest = req->hdr.CreditCharge; spin_lock(&server->req_lock); server->credits += wdata->credits - le16_to_cpu(req->hdr.CreditCharge); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index ff88d9feb01e..fd3709e8de33 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -276,7 +276,7 @@ struct smb2_sess_setup_req { __le32 Channel; __le16 SecurityBufferOffset; __le16 SecurityBufferLength; - __le64 PreviousSessionId; + __u64 PreviousSessionId; __u8 Buffer[1]; /* variable length GSS security buffer */ } __packed; diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 5e23f64c0804..20af5187ba63 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -33,7 +33,8 @@ #define MAX_EA_VALUE_SIZE 65535 #define CIFS_XATTR_CIFS_ACL "system.cifs_acl" - +#define CIFS_XATTR_ATTRIB "cifs.dosattrib" /* full name: user.cifs.dosattrib */ +#define CIFS_XATTR_CREATETIME "cifs.creationtime" /* user.cifs.creationtime */ /* BB need to add server (Samba e.g) support for security and trusted prefix */ enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT }; @@ -144,6 +145,54 @@ out: return rc; } +static int cifs_attrib_get(struct dentry *dentry, + struct inode *inode, void *value, + size_t size) +{ + ssize_t rc; + __u32 *pattribute; + + rc = cifs_revalidate_dentry_attr(dentry); + + if (rc) + return rc; + + if ((value == NULL) || (size == 0)) + return sizeof(__u32); + else if (size < sizeof(__u32)) + return -ERANGE; + + /* return dos attributes as pseudo xattr */ + pattribute = (__u32 *)value; + *pattribute = CIFS_I(inode)->cifsAttrs; + + return sizeof(__u32); +} + +static int cifs_creation_time_get(struct dentry *dentry, struct inode *inode, + void *value, size_t size) +{ + ssize_t rc; + __u64 * pcreatetime; + + rc = cifs_revalidate_dentry_attr(dentry); + if (rc) + return rc; + + if ((value == NULL) || (size == 0)) + return sizeof(__u64); + else if (size < sizeof(__u64)) + return -ERANGE; + + /* return dos attributes as pseudo xattr */ + pcreatetime = (__u64 *)value; + *pcreatetime = CIFS_I(inode)->createtime; + return sizeof(__u64); + + return rc; +} + + static int cifs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) @@ -168,10 +217,19 @@ static int cifs_xattr_get(const struct xattr_handler *handler, rc = -ENOMEM; goto out; } - /* return dos attributes as pseudo xattr */ + /* return alt name if available as pseudo attr */ switch (handler->flags) { case XATTR_USER: + cifs_dbg(FYI, "%s:querying user xattr %s\n", __func__, name); + if (strcmp(name, CIFS_XATTR_ATTRIB) == 0) { + rc = cifs_attrib_get(dentry, inode, value, size); + break; + } else if (strcmp(name, CIFS_XATTR_CREATETIME) == 0) { + rc = cifs_creation_time_get(dentry, inode, value, size); + break; + } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto out; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index c1e9f29c924c..f2d7402abe02 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1209,6 +1209,8 @@ COMPATIBLE_IOCTL(WDIOC_SETOPTIONS) COMPATIBLE_IOCTL(WDIOC_KEEPALIVE) COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT) COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT) +COMPATIBLE_IOCTL(WDIOC_SETPRETIMEOUT) +COMPATIBLE_IOCTL(WDIOC_GETPRETIMEOUT) /* Big R */ COMPATIBLE_IOCTL(RNDGETENTCNT) COMPATIBLE_IOCTL(RNDADDTOENTCNT) diff --git a/fs/exec.c b/fs/exec.c index 6fcfb3f7b137..4e497b9ee71e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -191,6 +191,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, { struct page *page; int ret; + unsigned int gup_flags = FOLL_FORCE; #ifdef CONFIG_STACK_GROWSUP if (write) { @@ -199,12 +200,16 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, return NULL; } #endif + + if (write) + gup_flags |= FOLL_WRITE; + /* * We are doing an exec(). 'current' is the process * doing the exec and bprm->mm is the new process's mm. */ - ret = get_user_pages_remote(current, bprm->mm, pos, 1, write, - 1, &page, NULL); + ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, + &page, NULL); if (ret <= 0) return NULL; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 207ba8d627ca..a4b531be9168 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -428,10 +428,10 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, if (!nop || !nop->fh_to_dentry) return ERR_PTR(-ESTALE); result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); - if (!result) - result = ERR_PTR(-ESTALE); - if (IS_ERR(result)) - return result; + if (PTR_ERR(result) == -ENOMEM) + return ERR_CAST(result); + if (IS_ERR_OR_NULL(result)) + return ERR_PTR(-ESTALE); if (d_is_dir(result)) { /* @@ -541,6 +541,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, err_result: dput(result); + if (err != -ENOMEM) + err = -ESTALE; return ERR_PTR(err); } EXPORT_SYMBOL_GPL(exportfs_decode_fh); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index d831e24dc885..41b8b44a391c 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -622,7 +622,7 @@ static int ext2_get_blocks(struct inode *inode, u32 *bno, bool *new, bool *boundary, int create) { - int err = -EIO; + int err; int offsets[4]; Indirect chain[4]; Indirect *partial; @@ -639,7 +639,7 @@ static int ext2_get_blocks(struct inode *inode, depth = ext2_block_to_path(inode,iblock,offsets,&blocks_to_boundary); if (depth == 0) - return (err); + return -EIO; partial = ext2_get_branch(inode, depth, offsets, chain, &err); /* Simplest case - block found, no allocation needed */ @@ -761,7 +761,6 @@ static int ext2_get_blocks(struct inode *inode, ext2_splice_branch(inode, iblock, partial, indirect_blks, count); mutex_unlock(&ei->truncate_mutex); got_it: - *bno = le32_to_cpu(chain[depth-1].key); if (count > blocks_to_boundary) *boundary = true; err = count; @@ -772,6 +771,8 @@ cleanup: brelse(partial->bh); partial--; } + if (err > 0) + *bno = le32_to_cpu(chain[depth-1].key); return err; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 93985c64d8a8..6f14ee923acd 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -852,16 +852,16 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, for (segno = start_segno; segno < end_segno; segno++) { - if (get_valid_blocks(sbi, segno, 1) == 0 || - unlikely(f2fs_cp_error(sbi))) - goto next; - /* find segment summary of victim */ sum_page = find_get_page(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); - f2fs_bug_on(sbi, !PageUptodate(sum_page)); f2fs_put_page(sum_page, 0); + if (get_valid_blocks(sbi, segno, 1) == 0 || + !PageUptodate(sum_page) || + unlikely(f2fs_cp_error(sbi))) + goto next; + sum = page_address(sum_page); f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer))); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index ad0c745ebad7..871c8b392099 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -687,6 +687,11 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) pri_bh = NULL; root_found: + /* We don't support read-write mounts */ + if (!(s->s_flags & MS_RDONLY)) { + error = -EACCES; + goto out_freebh; + } if (joliet_level && (pri == NULL || !opt.rock)) { /* This is the case of Joliet with the norock mount flag. @@ -1501,9 +1506,6 @@ struct inode *__isofs_iget(struct super_block *sb, static struct dentry *isofs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - /* We don't support read-write mounts */ - if (!(flags & MS_RDONLY)) - return ERR_PTR(-EACCES); return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super); } diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index dcd96aac02f5..cf4c636ff4da 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -110,8 +110,9 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, * kn_to: /n1/n2/n3 [depth=3] * result: /../.. * - * return value: length of the string. If greater than buflen, - * then contents of buf are undefined. On error, -1 is returned. + * Returns the length of the full path. If the full length is equal to or + * greater than @buflen, @buf contains the truncated path with the trailing + * '\0'. On error, -errno is returned. */ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, struct kernfs_node *kn_from, @@ -119,9 +120,8 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, { struct kernfs_node *kn, *common; const char parent_str[] = "/.."; - size_t depth_from, depth_to, len = 0, nlen = 0; - char *p; - int i; + size_t depth_from, depth_to, len = 0; + int i, j; if (!kn_from) kn_from = kernfs_root(kn_to)->kn; @@ -131,7 +131,7 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, common = kernfs_common_ancestor(kn_from, kn_to); if (WARN_ON(!common)) - return -1; + return -EINVAL; depth_to = kernfs_depth(common, kn_to); depth_from = kernfs_depth(common, kn_from); @@ -144,22 +144,16 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, len < buflen ? buflen - len : 0); /* Calculate how many bytes we need for the rest */ - for (kn = kn_to; kn != common; kn = kn->parent) - nlen += strlen(kn->name) + 1; - - if (len + nlen >= buflen) - return len + nlen; - - p = buf + len + nlen; - *p = '\0'; - for (kn = kn_to; kn != common; kn = kn->parent) { - size_t tmp = strlen(kn->name); - p -= tmp; - memcpy(p, kn->name, tmp); - *(--p) = '/'; + for (i = depth_to - 1; i >= 0; i--) { + for (kn = kn_to, j = 0; j < i; j++) + kn = kn->parent; + len += strlcpy(buf + len, "/", + len < buflen ? buflen - len : 0); + len += strlcpy(buf + len, kn->name, + len < buflen ? buflen - len : 0); } - return len + nlen; + return len; } /** @@ -186,29 +180,6 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) } /** - * kernfs_path_len - determine the length of the full path of a given node - * @kn: kernfs_node of interest - * - * The returned length doesn't include the space for the terminating '\0'. - */ -size_t kernfs_path_len(struct kernfs_node *kn) -{ - size_t len = 0; - unsigned long flags; - - spin_lock_irqsave(&kernfs_rename_lock, flags); - - do { - len += strlen(kn->name) + 1; - kn = kn->parent; - } while (kn && kn->parent); - - spin_unlock_irqrestore(&kernfs_rename_lock, flags); - - return len; -} - -/** * kernfs_path_from_node - build path of node @to relative to @from. * @from: parent kernfs_node relative to which we need to build the path * @to: kernfs_node of interest @@ -220,8 +191,9 @@ size_t kernfs_path_len(struct kernfs_node *kn) * path (which includes '..'s) as needed to reach from @from to @to is * returned. * - * If @buf isn't long enough, the return value will be greater than @buflen - * and @buf contents are undefined. + * Returns the length of the full path. If the full length is equal to or + * greater than @buflen, @buf contains the truncated path with the trailing + * '\0'. On error, -errno is returned. */ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, char *buf, size_t buflen) @@ -237,28 +209,6 @@ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, EXPORT_SYMBOL_GPL(kernfs_path_from_node); /** - * kernfs_path - build full path of a given node - * @kn: kernfs_node of interest - * @buf: buffer to copy @kn's name into - * @buflen: size of @buf - * - * Builds and returns the full path of @kn in @buf of @buflen bytes. The - * path is built from the end of @buf so the returned pointer usually - * doesn't match @buf. If @buf isn't long enough, @buf is nul terminated - * and %NULL is returned. - */ -char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) -{ - int ret; - - ret = kernfs_path_from_node(kn, NULL, buf, buflen); - if (ret < 0 || ret >= buflen) - return NULL; - return buf; -} -EXPORT_SYMBOL_GPL(kernfs_path); - -/** * pr_cont_kernfs_name - pr_cont name of a kernfs_node * @kn: kernfs_node of interest * diff --git a/fs/locks.c b/fs/locks.c index ce93b416b490..22c5b4aa4961 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1609,6 +1609,7 @@ int fcntl_getlease(struct file *filp) ctx = smp_load_acquire(&inode->i_flctx); if (ctx && !list_empty_careful(&ctx->flc_lease)) { + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); list_for_each_entry(fl, &ctx->flc_lease, fl_list) { @@ -1618,6 +1619,8 @@ int fcntl_getlease(struct file *filp) break; } spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); + locks_dispose_list(&dispose); } return type; @@ -2529,11 +2532,14 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx) if (list_empty(&ctx->flc_lease)) return; + percpu_down_read_preempt_disable(&file_rwsem); spin_lock(&ctx->flc_lock); list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) if (filp == fl->fl_file) lease_modify(fl, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); + percpu_up_read_preempt_enable(&file_rwsem); + locks_dispose_list(&dispose); } diff --git a/fs/namei.c b/fs/namei.c index a7f601cd521a..5b4eed221530 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4668,6 +4668,31 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) } EXPORT_SYMBOL(generic_readlink); +/** + * vfs_get_link - get symlink body + * @dentry: dentry on which to get symbolic link + * @done: caller needs to free returned data with this + * + * Calls security hook and i_op->get_link() on the supplied inode. + * + * It does not touch atime. That's up to the caller if necessary. + * + * Does not work on "special" symlinks like /proc/$$/fd/N + */ +const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) +{ + const char *res = ERR_PTR(-EINVAL); + struct inode *inode = d_inode(dentry); + + if (d_is_symlink(dentry)) { + res = ERR_PTR(security_inode_readlink(dentry)); + if (!res) + res = inode->i_op->get_link(dentry, inode, done); + } + return res; +} +EXPORT_SYMBOL(vfs_get_link); + /* get the link contents into pagecache */ const char *page_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) diff --git a/fs/namespace.c b/fs/namespace.c index 58aca9c931ac..e6c234b1a645 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2824,6 +2824,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) return new_ns; } +__latent_entropy struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 217847679f0e..2905479f214a 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -344,9 +344,10 @@ static void bl_write_cleanup(struct work_struct *work) u64 start = hdr->args.offset & (loff_t)PAGE_MASK; u64 end = (hdr->args.offset + hdr->args.count + PAGE_SIZE - 1) & (loff_t)PAGE_MASK; + u64 lwb = hdr->args.offset + hdr->args.count; ext_tree_mark_written(bl, start >> SECTOR_SHIFT, - (end - start) >> SECTOR_SHIFT, end); + (end - start) >> SECTOR_SHIFT, lwb); } pnfs_ld_write_done(hdr); diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c index 5f7b053720ee..6de15709d024 100644 --- a/fs/nfs/cache_lib.c +++ b/fs/nfs/cache_lib.c @@ -76,7 +76,7 @@ static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany) dreq = container_of(d, struct nfs_cache_defer_req, deferred_req); - complete_all(&dreq->completion); + complete(&dreq->completion); nfs_cache_defer_req_put(dreq); } diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 52a28311e2a4..532d8e242d4d 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -31,8 +31,6 @@ struct nfs_callback_data { unsigned int users; struct svc_serv *serv; - struct svc_rqst *rqst; - struct task_struct *task; }; static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1]; @@ -89,15 +87,6 @@ nfs4_callback_svc(void *vrqstp) return 0; } -/* - * Prepare to bring up the NFSv4 callback service - */ -static struct svc_rqst * -nfs4_callback_up(struct svc_serv *serv) -{ - return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); -} - #if defined(CONFIG_NFS_V4_1) /* * The callback service for NFSv4.1 callbacks @@ -139,29 +128,6 @@ nfs41_callback_svc(void *vrqstp) return 0; } -/* - * Bring up the NFSv4.1 callback service - */ -static struct svc_rqst * -nfs41_callback_up(struct svc_serv *serv) -{ - struct svc_rqst *rqstp; - - INIT_LIST_HEAD(&serv->sv_cb_list); - spin_lock_init(&serv->sv_cb_lock); - init_waitqueue_head(&serv->sv_cb_waitq); - rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE); - dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp)); - return rqstp; -} - -static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv, - struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) -{ - *rqstpp = nfs41_callback_up(serv); - *callback_svc = nfs41_callback_svc; -} - static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, struct svc_serv *serv) { @@ -173,13 +139,6 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, xprt->bc_serv = serv; } #else -static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv, - struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp)) -{ - *rqstpp = ERR_PTR(-ENOTSUPP); - *callback_svc = ERR_PTR(-ENOTSUPP); -} - static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, struct svc_serv *serv) { @@ -189,45 +148,22 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, struct svc_serv *serv) { - struct svc_rqst *rqstp; - int (*callback_svc)(void *vrqstp); - struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + int nrservs = nfs_callback_nr_threads; int ret; nfs_callback_bc_serv(minorversion, xprt, serv); - if (cb_info->task) - return 0; + if (nrservs < NFS4_MIN_NR_CALLBACK_THREADS) + nrservs = NFS4_MIN_NR_CALLBACK_THREADS; - switch (minorversion) { - case 0: - /* v4.0 callback setup */ - rqstp = nfs4_callback_up(serv); - callback_svc = nfs4_callback_svc; - break; - default: - nfs_minorversion_callback_svc_setup(serv, - &rqstp, &callback_svc); - } - - if (IS_ERR(rqstp)) - return PTR_ERR(rqstp); - - svc_sock_update_bufs(serv); + if (serv->sv_nrthreads-1 == nrservs) + return 0; - cb_info->serv = serv; - cb_info->rqst = rqstp; - cb_info->task = kthread_create(callback_svc, cb_info->rqst, - "nfsv4.%u-svc", minorversion); - if (IS_ERR(cb_info->task)) { - ret = PTR_ERR(cb_info->task); - svc_exit_thread(cb_info->rqst); - cb_info->rqst = NULL; - cb_info->task = NULL; + ret = serv->sv_ops->svo_setup(serv, NULL, nrservs); + if (ret) { + serv->sv_ops->svo_setup(serv, NULL, 0); return ret; } - rqstp->rq_task = cb_info->task; - wake_up_process(cb_info->task); dprintk("nfs_callback_up: service started\n"); return 0; } @@ -281,19 +217,41 @@ err_bind: return ret; } -static struct svc_serv_ops nfs_cb_sv_ops = { +static struct svc_serv_ops nfs40_cb_sv_ops = { + .svo_function = nfs4_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, + .svo_setup = svc_set_num_threads, + .svo_module = THIS_MODULE, +}; +#if defined(CONFIG_NFS_V4_1) +static struct svc_serv_ops nfs41_cb_sv_ops = { + .svo_function = nfs41_callback_svc, + .svo_enqueue_xprt = svc_xprt_do_enqueue, + .svo_setup = svc_set_num_threads, + .svo_module = THIS_MODULE, +}; + +struct svc_serv_ops *nfs4_cb_sv_ops[] = { + [0] = &nfs40_cb_sv_ops, + [1] = &nfs41_cb_sv_ops, +}; +#else +struct svc_serv_ops *nfs4_cb_sv_ops[] = { + [0] = &nfs40_cb_sv_ops, + [1] = NULL, }; +#endif static struct svc_serv *nfs_callback_create_svc(int minorversion) { struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; struct svc_serv *serv; + struct svc_serv_ops *sv_ops; /* * Check whether we're already up and running. */ - if (cb_info->task) { + if (cb_info->serv) { /* * Note: increase service usage, because later in case of error * svc_destroy() will be called. @@ -302,6 +260,17 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) return cb_info->serv; } + switch (minorversion) { + case 0: + sv_ops = nfs4_cb_sv_ops[0]; + break; + default: + sv_ops = nfs4_cb_sv_ops[1]; + } + + if (sv_ops == NULL) + return ERR_PTR(-ENOTSUPP); + /* * Sanity check: if there's no task, * we should be the first user ... @@ -310,11 +279,12 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", cb_info->users); - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, &nfs_cb_sv_ops); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); if (!serv) { printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); return ERR_PTR(-ENOMEM); } + cb_info->serv = serv; /* As there is only one thread we need to over-ride the * default maximum of 80 connections */ @@ -357,6 +327,8 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt) * thread exits. */ err_net: + if (!cb_info->users) + cb_info->serv = NULL; svc_destroy(serv); err_create: mutex_unlock(&nfs_callback_mutex); @@ -374,18 +346,18 @@ err_start: void nfs_callback_down(int minorversion, struct net *net) { struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + struct svc_serv *serv; mutex_lock(&nfs_callback_mutex); - nfs_callback_down_net(minorversion, cb_info->serv, net); + serv = cb_info->serv; + nfs_callback_down_net(minorversion, serv, net); cb_info->users--; - if (cb_info->users == 0 && cb_info->task != NULL) { - kthread_stop(cb_info->task); - dprintk("nfs_callback_down: service stopped\n"); - svc_exit_thread(cb_info->rqst); + if (cb_info->users == 0) { + svc_get(serv); + serv->sv_ops->svo_setup(serv, NULL, 0); + svc_destroy(serv); dprintk("nfs_callback_down: service destroyed\n"); cb_info->serv = NULL; - cb_info->rqst = NULL; - cb_info->task = NULL; } mutex_unlock(&nfs_callback_mutex); } diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 5fe1cecbf9f0..c701c308fac5 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -179,6 +179,15 @@ extern __be32 nfs4_callback_devicenotify( struct cb_devicenotifyargs *args, void *dummy, struct cb_process_state *cps); +struct cb_notify_lock_args { + struct nfs_fh cbnl_fh; + struct nfs_lowner cbnl_owner; + bool cbnl_valid; +}; + +extern __be32 nfs4_callback_notify_lock(struct cb_notify_lock_args *args, + void *dummy, + struct cb_process_state *cps); #endif /* CONFIG_NFS_V4_1 */ extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *); extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, @@ -198,6 +207,9 @@ extern void nfs_callback_down(int minorversion, struct net *net); #define NFS41_BC_MIN_CALLBACKS 1 #define NFS41_BC_MAX_CALLBACKS 1 +#define NFS4_MIN_NR_CALLBACK_THREADS 1 + extern unsigned int nfs_callback_set_tcpport; +extern unsigned short nfs_callback_nr_threads; #endif /* __LINUX_FS_NFS_CALLBACK_H */ diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index f953ef6b2f2e..e9aa235e9d10 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -628,4 +628,20 @@ out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; } + +__be32 nfs4_callback_notify_lock(struct cb_notify_lock_args *args, void *dummy, + struct cb_process_state *cps) +{ + if (!cps->clp) /* set in cb_sequence */ + return htonl(NFS4ERR_OP_NOT_IN_SESSION); + + dprintk_rcu("NFS: CB_NOTIFY_LOCK request from %s\n", + rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR)); + + /* Don't wake anybody if the string looked bogus */ + if (args->cbnl_valid) + __wake_up(&cps->clp->cl_lock_waitq, TASK_NORMAL, 0, args); + + return htonl(NFS4_OK); +} #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 656f68f7fe53..eb094c6011d8 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -35,6 +35,7 @@ (1 + 3) * 4) // seqid, 3 slotids #define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) +#define CB_OP_NOTIFY_LOCK_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) #endif /* CONFIG_NFS_V4_1 */ #define NFSDBG_FACILITY NFSDBG_CALLBACK @@ -72,7 +73,7 @@ static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy) return xdr_ressize_check(rqstp, p); } -static __be32 *read_buf(struct xdr_stream *xdr, int nbytes) +static __be32 *read_buf(struct xdr_stream *xdr, size_t nbytes) { __be32 *p; @@ -534,6 +535,49 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp, return 0; } +static __be32 decode_lockowner(struct xdr_stream *xdr, struct cb_notify_lock_args *args) +{ + __be32 *p; + unsigned int len; + + p = read_buf(xdr, 12); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + + p = xdr_decode_hyper(p, &args->cbnl_owner.clientid); + len = be32_to_cpu(*p); + + p = read_buf(xdr, len); + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); + + /* Only try to decode if the length is right */ + if (len == 20) { + p += 2; /* skip "lock id:" */ + args->cbnl_owner.s_dev = be32_to_cpu(*p++); + xdr_decode_hyper(p, &args->cbnl_owner.id); + args->cbnl_valid = true; + } else { + args->cbnl_owner.s_dev = 0; + args->cbnl_owner.id = 0; + args->cbnl_valid = false; + } + return 0; +} + +static __be32 decode_notify_lock_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_notify_lock_args *args) +{ + __be32 status; + + status = decode_fh(xdr, &args->cbnl_fh); + if (unlikely(status != 0)) + goto out; + status = decode_lockowner(xdr, args); +out: + dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); + return status; +} + #endif /* CONFIG_NFS_V4_1 */ static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) @@ -746,6 +790,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) case OP_CB_RECALL_SLOT: case OP_CB_LAYOUTRECALL: case OP_CB_NOTIFY_DEVICEID: + case OP_CB_NOTIFY_LOCK: *op = &callback_ops[op_nr]; break; @@ -753,7 +798,6 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) case OP_CB_PUSH_DELEG: case OP_CB_RECALLABLE_OBJ_AVAIL: case OP_CB_WANTS_CANCELLED: - case OP_CB_NOTIFY_LOCK: return htonl(NFS4ERR_NOTSUPP); default: @@ -1006,6 +1050,11 @@ static struct callback_op callback_ops[] = { .decode_args = (callback_decode_arg_t)decode_recallslot_args, .res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ, }, + [OP_CB_NOTIFY_LOCK] = { + .process_op = (callback_process_op_t)nfs4_callback_notify_lock, + .decode_args = (callback_decode_arg_t)decode_notify_lock_args, + .res_maxsize = CB_OP_NOTIFY_LOCK_RES_MAXSZ, + }, #endif /* CONFIG_NFS_V4_1 */ }; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 1e106780a237..7555ba889d1f 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -313,7 +313,10 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat continue; /* Match the full socket address */ if (!rpc_cmp_addr_port(sap, clap)) - continue; + /* Match all xprt_switch full socket addresses */ + if (!rpc_clnt_xprt_switch_has_addr(clp->cl_rpcclient, + sap)) + continue; atomic_inc(&clp->cl_count); return clp; @@ -785,7 +788,8 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs } fsinfo.fattr = fattr; - fsinfo.layouttype = 0; + fsinfo.nlayouttypes = 0; + memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype)); error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); if (error < 0) goto out_error; @@ -1078,7 +1082,7 @@ void nfs_clients_init(struct net *net) idr_init(&nn->cb_ident_idr); #endif spin_lock_init(&nn->nfs_client_lock); - nn->boot_time = CURRENT_TIME; + nn->boot_time = ktime_get_real(); } #ifdef CONFIG_PROC_FS diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 322c2585bc34..dff600ae0d74 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -41,6 +41,17 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); } +static bool +nfs4_is_valid_delegation(const struct nfs_delegation *delegation, + fmode_t flags) +{ + if (delegation != NULL && (delegation->type & flags) == flags && + !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) && + !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + return true; + return false; +} + static int nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) { @@ -50,8 +61,7 @@ nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) flags &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation != NULL && (delegation->type & flags) == flags && - !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + if (nfs4_is_valid_delegation(delegation, flags)) { if (mark) nfs_mark_delegation_referenced(delegation); ret = 1; @@ -185,15 +195,13 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, rcu_read_unlock(); put_rpccred(oldcred); trace_nfs4_reclaim_delegation(inode, res->delegation_type); - } else { - /* We appear to have raced with a delegation return. */ - spin_unlock(&delegation->lock); - rcu_read_unlock(); - nfs_inode_set_delegation(inode, cred, res); + return; } - } else { - rcu_read_unlock(); + /* We appear to have raced with a delegation return. */ + spin_unlock(&delegation->lock); } + rcu_read_unlock(); + nfs_inode_set_delegation(inode, cred, res); } static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) @@ -642,28 +650,49 @@ static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *cl rcu_read_unlock(); } -static void nfs_revoke_delegation(struct inode *inode) +static void nfs_mark_delegation_revoked(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_REVOKED, &delegation->flags); + delegation->stateid.type = NFS4_INVALID_STATEID_TYPE; + nfs_mark_return_delegation(server, delegation); +} + +static bool nfs_revoke_delegation(struct inode *inode, + const nfs4_stateid *stateid) { struct nfs_delegation *delegation; + nfs4_stateid tmp; + bool ret = false; + rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation != NULL) { - set_bit(NFS_DELEGATION_REVOKED, &delegation->flags); - nfs_mark_return_delegation(NFS_SERVER(inode), delegation); - } + if (delegation == NULL) + goto out; + if (stateid == NULL) { + nfs4_stateid_copy(&tmp, &delegation->stateid); + stateid = &tmp; + } else if (!nfs4_stateid_match(stateid, &delegation->stateid)) + goto out; + nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation); + ret = true; +out: rcu_read_unlock(); + if (ret) + nfs_inode_find_state_and_recover(inode, stateid); + return ret; } -void nfs_remove_bad_delegation(struct inode *inode) +void nfs_remove_bad_delegation(struct inode *inode, + const nfs4_stateid *stateid) { struct nfs_delegation *delegation; - nfs_revoke_delegation(inode); + if (!nfs_revoke_delegation(inode, stateid)) + return; delegation = nfs_inode_detach_delegation(inode); - if (delegation) { - nfs_inode_find_state_and_recover(inode, &delegation->stateid); + if (delegation) nfs_free_delegation(delegation); - } } EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); @@ -786,8 +815,15 @@ static void nfs_delegation_mark_reclaim_server(struct nfs_server *server) { struct nfs_delegation *delegation; - list_for_each_entry_rcu(delegation, &server->delegations, super_list) + list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + /* + * If the delegation may have been admin revoked, then we + * cannot reclaim it. + */ + if (test_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags)) + continue; set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); + } } /** @@ -851,6 +887,141 @@ restart: rcu_read_unlock(); } +static inline bool nfs4_server_rebooted(const struct nfs_client *clp) +{ + return (clp->cl_state & (BIT(NFS4CLNT_CHECK_LEASE) | + BIT(NFS4CLNT_LEASE_EXPIRED) | + BIT(NFS4CLNT_SESSION_RESET))) != 0; +} + +static void nfs_mark_test_expired_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + if (delegation->stateid.type == NFS4_INVALID_STATEID_TYPE) + return; + clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); + set_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags); + set_bit(NFS4CLNT_DELEGATION_EXPIRED, &server->nfs_client->cl_state); +} + +static void nfs_inode_mark_test_expired_delegation(struct nfs_server *server, + struct inode *inode) +{ + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation) + nfs_mark_test_expired_delegation(server, delegation); + rcu_read_unlock(); + +} + +static void nfs_delegation_mark_test_expired_server(struct nfs_server *server) +{ + struct nfs_delegation *delegation; + + list_for_each_entry_rcu(delegation, &server->delegations, super_list) + nfs_mark_test_expired_delegation(server, delegation); +} + +/** + * nfs_mark_test_expired_all_delegations - mark all delegations for testing + * @clp: nfs_client to process + * + * Iterates through all the delegations associated with this server and + * marks them as needing to be checked for validity. + */ +void nfs_mark_test_expired_all_delegations(struct nfs_client *clp) +{ + struct nfs_server *server; + + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) + nfs_delegation_mark_test_expired_server(server); + rcu_read_unlock(); +} + +/** + * nfs_reap_expired_delegations - reap expired delegations + * @clp: nfs_client to process + * + * Iterates through all the delegations associated with this server and + * checks if they have may have been revoked. This function is usually + * expected to be called in cases where the server may have lost its + * lease. + */ +void nfs_reap_expired_delegations(struct nfs_client *clp) +{ + const struct nfs4_minor_version_ops *ops = clp->cl_mvops; + struct nfs_delegation *delegation; + struct nfs_server *server; + struct inode *inode; + struct rpc_cred *cred; + nfs4_stateid stateid; + +restart: + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry_rcu(delegation, &server->delegations, + super_list) { + if (test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags)) + continue; + if (test_bit(NFS_DELEGATION_TEST_EXPIRED, + &delegation->flags) == 0) + continue; + if (!nfs_sb_active(server->super)) + continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) { + rcu_read_unlock(); + nfs_sb_deactive(server->super); + goto restart; + } + cred = get_rpccred_rcu(delegation->cred); + nfs4_stateid_copy(&stateid, &delegation->stateid); + clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags); + rcu_read_unlock(); + if (cred != NULL && + ops->test_and_free_expired(server, &stateid, cred) < 0) { + nfs_revoke_delegation(inode, &stateid); + nfs_inode_find_state_and_recover(inode, &stateid); + } + put_rpccred(cred); + if (nfs4_server_rebooted(clp)) { + nfs_inode_mark_test_expired_delegation(server,inode); + iput(inode); + nfs_sb_deactive(server->super); + return; + } + iput(inode); + nfs_sb_deactive(server->super); + goto restart; + } + } + rcu_read_unlock(); +} + +void nfs_inode_find_delegation_state_and_recover(struct inode *inode, + const nfs4_stateid *stateid) +{ + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + struct nfs_delegation *delegation; + bool found = false; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation && + nfs4_stateid_match_other(&delegation->stateid, stateid)) { + nfs_mark_test_expired_delegation(NFS_SERVER(inode), delegation); + found = true; + } + rcu_read_unlock(); + if (found) + nfs4_schedule_state_manager(clp); +} + /** * nfs_delegations_present - check for existence of delegations * @clp: client state handle @@ -893,7 +1064,7 @@ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, flags &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); - ret = (delegation != NULL && (delegation->type & flags) == flags); + ret = nfs4_is_valid_delegation(delegation, flags); if (ret) { nfs4_stateid_copy(dst, &delegation->stateid); nfs_mark_delegation_referenced(delegation); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 64724d252a79..e9d555796873 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -32,6 +32,7 @@ enum { NFS_DELEGATION_REFERENCED, NFS_DELEGATION_RETURNING, NFS_DELEGATION_REVOKED, + NFS_DELEGATION_TEST_EXPIRED, }; int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); @@ -47,11 +48,14 @@ void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags); void nfs_expire_unreferenced_delegations(struct nfs_client *clp); int nfs_client_return_marked_delegations(struct nfs_client *clp); int nfs_delegations_present(struct nfs_client *clp); -void nfs_remove_bad_delegation(struct inode *inode); +void nfs_remove_bad_delegation(struct inode *inode, const nfs4_stateid *stateid); void nfs_delegation_mark_reclaim(struct nfs_client *clp); void nfs_delegation_reap_unclaimed(struct nfs_client *clp); +void nfs_mark_test_expired_all_delegations(struct nfs_client *clp); +void nfs_reap_expired_delegations(struct nfs_client *clp); + /* NFSv4 delegation-related procedures */ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync); int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type); @@ -62,6 +66,8 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); int nfs4_have_delegation(struct inode *inode, fmode_t flags); int nfs4_check_delegation(struct inode *inode, fmode_t flags); bool nfs4_delegation_flush_on_close(const struct inode *inode); +void nfs_inode_find_delegation_state_and_recover(struct inode *inode, + const nfs4_stateid *stateid); #endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 06e0bf092ba9..5f1af4cd1a33 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -435,11 +435,11 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) return 0; nfsi = NFS_I(inode); - if (entry->fattr->fileid == nfsi->fileid) - return 1; - if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0) - return 1; - return 0; + if (entry->fattr->fileid != nfsi->fileid) + return 0; + if (entry->fh->size && nfs_compare_fh(entry->fh, &nfsi->fh) != 0) + return 0; + return 1; } static @@ -496,6 +496,14 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) return; if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID)) return; + if (filename.len == 0) + return; + /* Validate that the name doesn't contain any illegal '\0' */ + if (strnlen(filename.name, filename.len) != filename.len) + return; + /* ...or '/' */ + if (strnchr(filename.name, filename.len, '/')) + return; if (filename.name[0] == '.') { if (filename.len == 1) return; @@ -517,6 +525,8 @@ again: &entry->fattr->fsid)) goto out; if (nfs_same_file(dentry, entry)) { + if (!entry->fh->size) + goto out; nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); status = nfs_refresh_inode(d_inode(dentry), entry->fattr); if (!status) @@ -529,6 +539,10 @@ again: goto again; } } + if (!entry->fh->size) { + d_lookup_done(dentry); + goto out; + } inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label); alias = d_splice_alias(inode, dentry); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 72b7d13ee3c6..bd81bcf3ffcf 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -387,7 +387,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) dreq->iocb->ki_complete(dreq->iocb, res, 0); } - complete_all(&dreq->completion); + complete(&dreq->completion); nfs_direct_req_release(dreq); } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2efbdde36c3e..9ea85ae23c32 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -520,7 +520,9 @@ const struct address_space_operations nfs_file_aops = { .invalidatepage = nfs_invalidate_page, .releasepage = nfs_release_page, .direct_IO = nfs_direct_IO, +#ifdef CONFIG_MIGRATION .migratepage = nfs_migrate_page, +#endif .launder_page = nfs_launder_page, .is_dirty_writeback = nfs_check_dirty_writeback, .error_remove_page = generic_error_remove_page, @@ -685,11 +687,6 @@ out_noconflict: goto out; } -static int do_vfs_lock(struct file *file, struct file_lock *fl) -{ - return locks_lock_file_wait(file, fl); -} - static int do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { @@ -722,7 +719,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) if (!is_local) status = NFS_PROTO(inode)->lock(filp, cmd, fl); else - status = do_vfs_lock(filp, fl); + status = locks_lock_file_wait(filp, fl); return status; } @@ -747,7 +744,7 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) if (!is_local) status = NFS_PROTO(inode)->lock(filp, cmd, fl); else - status = do_vfs_lock(filp, fl); + status = locks_lock_file_wait(filp, fl); if (status < 0) goto out; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 51b51369704c..98ace127bf86 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1080,7 +1080,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, case -NFS4ERR_BAD_STATEID: if (state == NULL) break; - nfs_remove_bad_delegation(state->inode); + nfs_remove_bad_delegation(state->inode, NULL); case -NFS4ERR_OPENMODE: if (state == NULL) break; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index a6acce663219..80bcc0befb07 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -534,12 +534,9 @@ void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) } #endif - #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); -#else -#define nfs_migrate_page NULL #endif static inline int @@ -562,7 +559,6 @@ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); /* nfs4proc.c */ -extern void __nfs4_read_done_cb(struct nfs_pgio_header *); extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct nfs_client_initdata *); extern int nfs40_walk_client_list(struct nfs_client *clp, @@ -571,6 +567,9 @@ extern int nfs40_walk_client_list(struct nfs_client *clp, extern int nfs41_walk_client_list(struct nfs_client *clp, struct nfs_client **result, struct rpc_cred *cred); +extern int nfs4_test_session_trunk(struct rpc_clnt *, + struct rpc_xprt *, + void *); static inline struct inode *nfs_igrab_and_active(struct inode *inode) { diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index f0e06e4acbef..fbce0d885d4c 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -29,7 +29,7 @@ struct nfs_net { int cb_users[NFS4_MAX_MINOR_VERSION + 1]; #endif spinlock_t nfs_client_lock; - struct timespec boot_time; + ktime_t boot_time; #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc_nfsfs; #endif diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 64b43b4ad9dd..608501971fe0 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -443,6 +443,7 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server, task = rpc_run_task(&task_setup); if (IS_ERR(task)) return PTR_ERR(task); + rpc_put_task(task); return 0; } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 9bf64eacba5b..9b3a82abab07 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -39,6 +39,7 @@ enum nfs4_client_state { NFS4CLNT_BIND_CONN_TO_SESSION, NFS4CLNT_MOVED, NFS4CLNT_LEASE_MOVED, + NFS4CLNT_DELEGATION_EXPIRED, }; #define NFS4_RENEW_TIMEOUT 0x01 @@ -57,8 +58,11 @@ struct nfs4_minor_version_ops { struct nfs_fsinfo *); void (*free_lock_state)(struct nfs_server *, struct nfs4_lock_state *); + int (*test_and_free_expired)(struct nfs_server *, + nfs4_stateid *, struct rpc_cred *); struct nfs_seqid * (*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); + int (*session_trunk)(struct rpc_clnt *, struct rpc_xprt *, void *); const struct rpc_call_ops *call_sync_ops; const struct nfs4_state_recovery_ops *reboot_recovery_ops; const struct nfs4_state_recovery_ops *nograce_recovery_ops; @@ -156,6 +160,7 @@ enum { NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */ NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */ NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */ + NFS_STATE_MAY_NOTIFY_LOCK, /* server may CB_NOTIFY_LOCK */ }; struct nfs4_state { @@ -203,6 +208,11 @@ struct nfs4_state_recovery_ops { struct rpc_cred *); }; +struct nfs4_add_xprt_data { + struct nfs_client *clp; + struct rpc_cred *cred; +}; + struct nfs4_state_maintenance_ops { int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned); struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *); @@ -278,6 +288,8 @@ extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync); +extern int nfs4_detect_session_trunking(struct nfs_client *clp, + struct nfs41_exchange_id_res *res, struct rpc_xprt *xprt); static inline bool is_ds_only_client(struct nfs_client *clp) @@ -439,7 +451,7 @@ extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp); extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); extern int nfs4_schedule_migration_recovery(const struct nfs_server *); extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *); -extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); +extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags, bool); extern void nfs41_handle_server_scope(struct nfs_client *, struct nfs41_server_scope **); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); @@ -471,6 +483,7 @@ extern struct nfs_subversion nfs_v4; struct dentry *nfs4_try_mount(int, const char *, struct nfs_mount_info *, struct nfs_subversion *); extern bool nfs4_disable_idmapping; extern unsigned short max_session_slots; +extern unsigned short max_session_cb_slots; extern unsigned short send_implementation_id; extern bool recover_lost_locks; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index cd3b7cfdde16..074ac7131459 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -199,6 +199,9 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_minorversion = cl_init->minorversion; clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; clp->cl_mig_gen = 1; +#if IS_ENABLED(CONFIG_NFS_V4_1) + init_waitqueue_head(&clp->cl_lock_waitq); +#endif return clp; error: @@ -562,15 +565,15 @@ out: /* * Returns true if the client IDs match */ -static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b) +static bool nfs4_match_clientids(u64 a, u64 b) { - if (a->cl_clientid != b->cl_clientid) { + if (a != b) { dprintk("NFS: --> %s client ID %llx does not match %llx\n", - __func__, a->cl_clientid, b->cl_clientid); + __func__, a, b); return false; } dprintk("NFS: --> %s client ID %llx matches %llx\n", - __func__, a->cl_clientid, b->cl_clientid); + __func__, a, b); return true; } @@ -578,17 +581,15 @@ static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b) * Returns true if the server major ids match */ static bool -nfs4_check_clientid_trunking(struct nfs_client *a, struct nfs_client *b) +nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1, + struct nfs41_server_owner *o2) { - struct nfs41_server_owner *o1 = a->cl_serverowner; - struct nfs41_server_owner *o2 = b->cl_serverowner; - if (o1->major_id_sz != o2->major_id_sz) goto out_major_mismatch; if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0) goto out_major_mismatch; - dprintk("NFS: --> %s server owners match\n", __func__); + dprintk("NFS: --> %s server owner major IDs match\n", __func__); return true; out_major_mismatch: @@ -597,6 +598,100 @@ out_major_mismatch: return false; } +/* + * Returns true if server minor ids match + */ +static bool +nfs4_check_serverowner_minor_id(struct nfs41_server_owner *o1, + struct nfs41_server_owner *o2) +{ + /* Check eir_server_owner so_minor_id */ + if (o1->minor_id != o2->minor_id) + goto out_minor_mismatch; + + dprintk("NFS: --> %s server owner minor IDs match\n", __func__); + return true; + +out_minor_mismatch: + dprintk("NFS: --> %s server owner minor IDs do not match\n", __func__); + return false; +} + +/* + * Returns true if the server scopes match + */ +static bool +nfs4_check_server_scope(struct nfs41_server_scope *s1, + struct nfs41_server_scope *s2) +{ + if (s1->server_scope_sz != s2->server_scope_sz) + goto out_scope_mismatch; + if (memcmp(s1->server_scope, s2->server_scope, + s1->server_scope_sz) != 0) + goto out_scope_mismatch; + + dprintk("NFS: --> %s server scopes match\n", __func__); + return true; + +out_scope_mismatch: + dprintk("NFS: --> %s server scopes do not match\n", + __func__); + return false; +} + +/** + * nfs4_detect_session_trunking - Checks for session trunking. + * + * Called after a successful EXCHANGE_ID on a multi-addr connection. + * Upon success, add the transport. + * + * @clp: original mount nfs_client + * @res: result structure from an exchange_id using the original mount + * nfs_client with a new multi_addr transport + * + * Returns zero on success, otherwise -EINVAL + * + * Note: since the exchange_id for the new multi_addr transport uses the + * same nfs_client from the original mount, the cl_owner_id is reused, + * so eir_clientowner is the same. + */ +int nfs4_detect_session_trunking(struct nfs_client *clp, + struct nfs41_exchange_id_res *res, + struct rpc_xprt *xprt) +{ + /* Check eir_clientid */ + if (!nfs4_match_clientids(clp->cl_clientid, res->clientid)) + goto out_err; + + /* Check eir_server_owner so_major_id */ + if (!nfs4_check_serverowner_major_id(clp->cl_serverowner, + res->server_owner)) + goto out_err; + + /* Check eir_server_owner so_minor_id */ + if (!nfs4_check_serverowner_minor_id(clp->cl_serverowner, + res->server_owner)) + goto out_err; + + /* Check eir_server_scope */ + if (!nfs4_check_server_scope(clp->cl_serverscope, res->server_scope)) + goto out_err; + + /* Session trunking passed, add the xprt */ + rpc_clnt_xprt_switch_add_xprt(clp->cl_rpcclient, xprt); + + pr_info("NFS: %s: Session trunking succeeded for %s\n", + clp->cl_hostname, + xprt->address_strings[RPC_DISPLAY_ADDR]); + + return 0; +out_err: + pr_info("NFS: %s: Session trunking failed for %s\n", clp->cl_hostname, + xprt->address_strings[RPC_DISPLAY_ADDR]); + + return -EINVAL; +} + /** * nfs41_walk_client_list - Find nfs_client that matches a client/server owner * @@ -650,7 +745,7 @@ int nfs41_walk_client_list(struct nfs_client *new, if (pos->cl_cons_state != NFS_CS_READY) continue; - if (!nfs4_match_clientids(pos, new)) + if (!nfs4_match_clientids(pos->cl_clientid, new->cl_clientid)) continue; /* @@ -658,7 +753,8 @@ int nfs41_walk_client_list(struct nfs_client *new, * client id trunking. In either case, we want to fall back * to using the existing nfs_client. */ - if (!nfs4_check_clientid_trunking(pos, new)) + if (!nfs4_check_serverowner_major_id(pos->cl_serverowner, + new->cl_serverowner)) continue; /* Unlike NFSv4.0, we know that NFSv4.1 always uses the diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0e327528a3ce..7897826d7c51 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -99,8 +99,8 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, #ifdef CONFIG_NFS_V4_1 static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, struct rpc_cred *); -static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *, - struct rpc_cred *); +static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *, + struct rpc_cred *, bool); #endif #ifdef CONFIG_NFS_V4_SECURITY_LABEL @@ -328,6 +328,33 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start); } +static void nfs4_test_and_free_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + const struct nfs4_minor_version_ops *ops = server->nfs_client->cl_mvops; + + ops->test_and_free_expired(server, stateid, cred); +} + +static void __nfs4_free_revoked_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + stateid->type = NFS4_REVOKED_STATEID_TYPE; + nfs4_test_and_free_stateid(server, stateid, cred); +} + +static void nfs4_free_revoked_stateid(struct nfs_server *server, + const nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + nfs4_stateid tmp; + + nfs4_stateid_copy(&tmp, stateid); + __nfs4_free_revoked_stateid(server, &tmp, cred); +} + static long nfs4_update_delay(long *timeout) { long ret; @@ -370,13 +397,23 @@ static int nfs4_do_handle_exception(struct nfs_server *server, exception->delay = 0; exception->recovering = 0; exception->retry = 0; + + if (stateid == NULL && state != NULL) + stateid = &state->stateid; + switch(errorcode) { case 0: return 0; - case -NFS4ERR_OPENMODE: case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: + if (inode != NULL && stateid != NULL) { + nfs_inode_find_state_and_recover(inode, + stateid); + goto wait_on_recovery; + } + case -NFS4ERR_OPENMODE: if (inode) { int err; @@ -395,12 +432,6 @@ static int nfs4_do_handle_exception(struct nfs_server *server, if (ret < 0) break; goto wait_on_recovery; - case -NFS4ERR_EXPIRED: - if (state != NULL) { - ret = nfs4_schedule_stateid_recovery(server, state); - if (ret < 0) - break; - } case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: nfs4_schedule_lease_recovery(clp); @@ -616,6 +647,7 @@ int nfs40_setup_sequence(struct nfs4_slot_table *tbl, } spin_unlock(&tbl->slot_tbl_lock); + slot->privileged = args->sa_privileged ? 1 : 0; args->sa_slot = slot; res->sr_slot = slot; @@ -723,12 +755,20 @@ static int nfs41_sequence_process(struct rpc_task *task, /* Check the SEQUENCE operation status */ switch (res->sr_status) { case 0: + /* If previous op on slot was interrupted and we reused + * the seq# and got a reply from the cache, then retry + */ + if (task->tk_status == -EREMOTEIO && interrupted) { + ++slot->seq_nr; + goto retry_nowait; + } /* Update the slot's sequence and clientid lease timer */ slot->seq_done = 1; clp = session->clp; do_renew_lease(clp, res->sr_timestamp); /* Check sequence flags */ - nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags, + !!slot->privileged); nfs41_update_target_slotid(slot->table, slot, res); break; case 1: @@ -875,6 +915,7 @@ int nfs41_setup_sequence(struct nfs4_session *session, } spin_unlock(&tbl->slot_tbl_lock); + slot->privileged = args->sa_privileged ? 1 : 0; args->sa_slot = slot; dprintk("<-- %s slotid=%u seqid=%u\n", __func__, @@ -1353,6 +1394,19 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode) nfs4_state_set_mode_locked(state, state->state | fmode); } +#ifdef CONFIG_NFS_V4_1 +static bool nfs_open_stateid_recover_openmode(struct nfs4_state *state) +{ + if (state->n_rdonly && !test_bit(NFS_O_RDONLY_STATE, &state->flags)) + return true; + if (state->n_wronly && !test_bit(NFS_O_WRONLY_STATE, &state->flags)) + return true; + if (state->n_rdwr && !test_bit(NFS_O_RDWR_STATE, &state->flags)) + return true; + return false; +} +#endif /* CONFIG_NFS_V4_1 */ + static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) { struct nfs_client *clp = state->owner->so_server->nfs_client; @@ -1369,11 +1423,12 @@ static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) } static bool nfs_need_update_open_stateid(struct nfs4_state *state, - nfs4_stateid *stateid) + const nfs4_stateid *stateid, nfs4_stateid *freeme) { if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0) return true; if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) { + nfs4_stateid_copy(freeme, &state->open_stateid); nfs_test_and_clear_all_open_stateid(state); return true; } @@ -1437,7 +1492,9 @@ static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_schedule_state_manager(state->owner->so_server->nfs_client); } -static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) +static void nfs_set_open_stateid_locked(struct nfs4_state *state, + const nfs4_stateid *stateid, fmode_t fmode, + nfs4_stateid *freeme) { switch (fmode) { case FMODE_READ: @@ -1449,14 +1506,18 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid * case FMODE_READ|FMODE_WRITE: set_bit(NFS_O_RDWR_STATE, &state->flags); } - if (!nfs_need_update_open_stateid(state, stateid)) + if (!nfs_need_update_open_stateid(state, stateid, freeme)) return; if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) nfs4_stateid_copy(&state->stateid, stateid); nfs4_stateid_copy(&state->open_stateid, stateid); } -static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode) +static void __update_open_stateid(struct nfs4_state *state, + const nfs4_stateid *open_stateid, + const nfs4_stateid *deleg_stateid, + fmode_t fmode, + nfs4_stateid *freeme) { /* * Protect the call to nfs4_state_set_mode_locked and @@ -1469,16 +1530,22 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s set_bit(NFS_DELEGATED_STATE, &state->flags); } if (open_stateid != NULL) - nfs_set_open_stateid_locked(state, open_stateid, fmode); + nfs_set_open_stateid_locked(state, open_stateid, fmode, freeme); write_sequnlock(&state->seqlock); update_open_stateflags(state, fmode); spin_unlock(&state->owner->so_lock); } -static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, fmode_t fmode) +static int update_open_stateid(struct nfs4_state *state, + const nfs4_stateid *open_stateid, + const nfs4_stateid *delegation, + fmode_t fmode) { + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs_client *clp = server->nfs_client; struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_delegation *deleg_cur; + nfs4_stateid freeme = { }; int ret = 0; fmode &= (FMODE_READ|FMODE_WRITE); @@ -1500,7 +1567,8 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat goto no_delegation_unlock; nfs_mark_delegation_referenced(deleg_cur); - __update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode); + __update_open_stateid(state, open_stateid, &deleg_cur->stateid, + fmode, &freeme); ret = 1; no_delegation_unlock: spin_unlock(&deleg_cur->lock); @@ -1508,11 +1576,14 @@ no_delegation: rcu_read_unlock(); if (!ret && open_stateid != NULL) { - __update_open_stateid(state, open_stateid, NULL, fmode); + __update_open_stateid(state, open_stateid, NULL, fmode, &freeme); ret = 1; } if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) - nfs4_schedule_state_manager(state->owner->so_server->nfs_client); + nfs4_schedule_state_manager(clp); + if (freeme.type != 0) + nfs4_test_and_free_stateid(server, &freeme, + state->owner->so_cred); return ret; } @@ -1889,7 +1960,6 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: set_bit(NFS_DELEGATED_STATE, &state->flags); - case -NFS4ERR_EXPIRED: /* Don't recall a delegation if it was lost */ nfs4_schedule_lease_recovery(server->nfs_client); return -EAGAIN; @@ -1901,6 +1971,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct return -EAGAIN; case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OPENMODE: nfs_inode_find_state_and_recover(state->inode, @@ -2382,9 +2453,10 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta return ret; } -static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state) +static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state, + const nfs4_stateid *stateid) { - nfs_remove_bad_delegation(state->inode); + nfs_remove_bad_delegation(state->inode, stateid); write_seqlock(&state->seqlock); nfs4_stateid_copy(&state->stateid, &state->open_stateid); write_sequnlock(&state->seqlock); @@ -2394,7 +2466,7 @@ static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state) static void nfs40_clear_delegation_stateid(struct nfs4_state *state) { if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL) - nfs_finish_clear_delegation_stateid(state); + nfs_finish_clear_delegation_stateid(state, NULL); } static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) @@ -2404,7 +2476,45 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st return nfs4_open_expired(sp, state); } +static int nfs40_test_and_free_expired_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + return -NFS4ERR_BAD_STATEID; +} + #if defined(CONFIG_NFS_V4_1) +static int nfs41_test_and_free_expired_stateid(struct nfs_server *server, + nfs4_stateid *stateid, + struct rpc_cred *cred) +{ + int status; + + switch (stateid->type) { + default: + break; + case NFS4_INVALID_STATEID_TYPE: + case NFS4_SPECIAL_STATEID_TYPE: + return -NFS4ERR_BAD_STATEID; + case NFS4_REVOKED_STATEID_TYPE: + goto out_free; + } + + status = nfs41_test_stateid(server, stateid, cred); + switch (status) { + case -NFS4ERR_EXPIRED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: + break; + default: + return status; + } +out_free: + /* Ack the revoked state to the server */ + nfs41_free_stateid(server, stateid, cred, true); + return -NFS4ERR_EXPIRED; +} + static void nfs41_check_delegation_stateid(struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(state->inode); @@ -2422,23 +2532,68 @@ static void nfs41_check_delegation_stateid(struct nfs4_state *state) } nfs4_stateid_copy(&stateid, &delegation->stateid); + if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { + rcu_read_unlock(); + nfs_finish_clear_delegation_stateid(state, &stateid); + return; + } + + if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags)) { + rcu_read_unlock(); + return; + } + cred = get_rpccred(delegation->cred); rcu_read_unlock(); - status = nfs41_test_stateid(server, &stateid, cred); + status = nfs41_test_and_free_expired_stateid(server, &stateid, cred); trace_nfs4_test_delegation_stateid(state, NULL, status); - - if (status != NFS_OK) { - /* Free the stateid unless the server explicitly - * informs us the stateid is unrecognized. */ - if (status != -NFS4ERR_BAD_STATEID) - nfs41_free_stateid(server, &stateid, cred); - nfs_finish_clear_delegation_stateid(state); - } + if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) + nfs_finish_clear_delegation_stateid(state, &stateid); put_rpccred(cred); } /** + * nfs41_check_expired_locks - possibly free a lock stateid + * + * @state: NFSv4 state for an inode + * + * Returns NFS_OK if recovery for this stateid is now finished. + * Otherwise a negative NFS4ERR value is returned. + */ +static int nfs41_check_expired_locks(struct nfs4_state *state) +{ + int status, ret = NFS_OK; + struct nfs4_lock_state *lsp; + struct nfs_server *server = NFS_SERVER(state->inode); + + if (!test_bit(LK_STATE_IN_USE, &state->flags)) + goto out; + list_for_each_entry(lsp, &state->lock_states, ls_locks) { + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { + struct rpc_cred *cred = lsp->ls_state->owner->so_cred; + + status = nfs41_test_and_free_expired_stateid(server, + &lsp->ls_stateid, + cred); + trace_nfs4_test_lock_stateid(state, lsp, status); + if (status == -NFS4ERR_EXPIRED || + status == -NFS4ERR_BAD_STATEID) { + clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); + lsp->ls_stateid.type = NFS4_INVALID_STATEID_TYPE; + if (!recover_lost_locks) + set_bit(NFS_LOCK_LOST, &lsp->ls_flags); + } else if (status != NFS_OK) { + ret = status; + break; + } + } + }; +out: + return ret; +} + +/** * nfs41_check_open_stateid - possibly free an open stateid * * @state: NFSv4 state for an inode @@ -2453,26 +2608,28 @@ static int nfs41_check_open_stateid(struct nfs4_state *state) struct rpc_cred *cred = state->owner->so_cred; int status; - /* If a state reset has been done, test_stateid is unneeded */ - if ((test_bit(NFS_O_RDONLY_STATE, &state->flags) == 0) && - (test_bit(NFS_O_WRONLY_STATE, &state->flags) == 0) && - (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0)) + if (test_bit(NFS_OPEN_STATE, &state->flags) == 0) { + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) { + if (nfs4_have_delegation(state->inode, state->state)) + return NFS_OK; + return -NFS4ERR_OPENMODE; + } return -NFS4ERR_BAD_STATEID; - - status = nfs41_test_stateid(server, stateid, cred); + } + status = nfs41_test_and_free_expired_stateid(server, stateid, cred); trace_nfs4_test_open_stateid(state, NULL, status); - if (status != NFS_OK) { - /* Free the stateid unless the server explicitly - * informs us the stateid is unrecognized. */ - if (status != -NFS4ERR_BAD_STATEID) - nfs41_free_stateid(server, stateid, cred); - + if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) { clear_bit(NFS_O_RDONLY_STATE, &state->flags); clear_bit(NFS_O_WRONLY_STATE, &state->flags); clear_bit(NFS_O_RDWR_STATE, &state->flags); clear_bit(NFS_OPEN_STATE, &state->flags); + stateid->type = NFS4_INVALID_STATEID_TYPE; } - return status; + if (status != NFS_OK) + return status; + if (nfs_open_stateid_recover_openmode(state)) + return -NFS4ERR_OPENMODE; + return NFS_OK; } static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) @@ -2480,6 +2637,9 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st int status; nfs41_check_delegation_stateid(state); + status = nfs41_check_expired_locks(state); + if (status != NFS_OK) + return status; status = nfs41_check_open_stateid(state); if (status != NFS_OK) status = nfs4_open_expired(sp, state); @@ -2537,6 +2697,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, goto out; if (server->caps & NFS_CAP_POSIX_LOCK) set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); + if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK) + set_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags); dentry = opendata->dentry; if (d_really_is_negative(dentry)) { @@ -2899,9 +3061,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data) break; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + nfs4_free_revoked_stateid(server, + &calldata->arg.stateid, + task->tk_msg.rpc_cred); case -NFS4ERR_OLD_STATEID: case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_EXPIRED: if (!nfs4_stateid_match(&calldata->arg.stateid, &state->open_stateid)) { rpc_restart_call_prepare(task); @@ -4312,7 +4477,7 @@ static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, s if (error == 0) { /* block layout checks this! */ server->pnfs_blksize = fsinfo->blksize; - set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype); + set_pnfs_layoutdriver(server, fhandle, fsinfo); } return error; @@ -4399,24 +4564,25 @@ static bool nfs4_error_stateid_expired(int err) return false; } -void __nfs4_read_done_cb(struct nfs_pgio_header *hdr) -{ - nfs_invalidate_atime(hdr->inode); -} - static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { struct nfs_server *server = NFS_SERVER(hdr->inode); trace_nfs4_read(hdr, task->tk_status); - if (nfs4_async_handle_error(task, server, - hdr->args.context->state, - NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - return -EAGAIN; + if (task->tk_status < 0) { + struct nfs4_exception exception = { + .inode = hdr->inode, + .state = hdr->args.context->state, + .stateid = &hdr->args.stateid, + }; + task->tk_status = nfs4_async_handle_exception(task, + server, task->tk_status, &exception); + if (exception.retry) { + rpc_restart_call_prepare(task); + return -EAGAIN; + } } - __nfs4_read_done_cb(hdr); if (task->tk_status > 0) renew_lease(server, hdr->timestamp); return 0; @@ -4445,6 +4611,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) return -EAGAIN; if (nfs4_read_stateid_changed(task, &hdr->args)) return -EAGAIN; + if (task->tk_status > 0) + nfs_invalidate_atime(hdr->inode); return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) : nfs4_read_done_cb(task, hdr); } @@ -4482,11 +4650,19 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct inode *inode = hdr->inode; trace_nfs4_write(hdr, task->tk_status); - if (nfs4_async_handle_error(task, NFS_SERVER(inode), - hdr->args.context->state, - NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - return -EAGAIN; + if (task->tk_status < 0) { + struct nfs4_exception exception = { + .inode = hdr->inode, + .state = hdr->args.context->state, + .stateid = &hdr->args.stateid, + }; + task->tk_status = nfs4_async_handle_exception(task, + NFS_SERVER(inode), task->tk_status, + &exception); + if (exception.retry) { + rpc_restart_call_prepare(task); + return -EAGAIN; + } } if (task->tk_status >= 0) { renew_lease(NFS_SERVER(inode), hdr->timestamp); @@ -5123,12 +5299,14 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp, if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) { /* An impossible timestamp guarantees this value * will never match a generated boot time. */ - verf[0] = 0; - verf[1] = cpu_to_be32(NSEC_PER_SEC + 1); + verf[0] = cpu_to_be32(U32_MAX); + verf[1] = cpu_to_be32(U32_MAX); } else { struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); - verf[0] = cpu_to_be32(nn->boot_time.tv_sec); - verf[1] = cpu_to_be32(nn->boot_time.tv_nsec); + u64 ns = ktime_to_ns(nn->boot_time); + + verf[0] = cpu_to_be32(ns >> 32); + verf[1] = cpu_to_be32(ns); } memcpy(bootverf->data, verf, sizeof(bootverf->data)); } @@ -5393,10 +5571,13 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) renew_lease(data->res.server, data->timestamp); case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_EXPIRED: + nfs4_free_revoked_stateid(data->res.server, + data->args.stateid, + task->tk_msg.rpc_cred); case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_EXPIRED: task->tk_status = 0; if (data->roc) pnfs_roc_set_barrier(data->inode, data->roc_barrier); @@ -5528,22 +5709,6 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 return err; } -#define NFS4_LOCK_MINTIMEOUT (1 * HZ) -#define NFS4_LOCK_MAXTIMEOUT (30 * HZ) - -/* - * sleep, with exponential backoff, and retry the LOCK operation. - */ -static unsigned long -nfs4_set_lock_task_retry(unsigned long timeout) -{ - freezable_schedule_timeout_killable_unsafe(timeout); - timeout <<= 1; - if (timeout > NFS4_LOCK_MAXTIMEOUT) - return NFS4_LOCK_MAXTIMEOUT; - return timeout; -} - static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request) { struct inode *inode = state->inode; @@ -5600,11 +5765,6 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock * return err; } -static int do_vfs_lock(struct inode *inode, struct file_lock *fl) -{ - return locks_lock_inode_wait(inode, fl); -} - struct nfs4_unlockdata { struct nfs_locku_args arg; struct nfs_locku_res res; @@ -5657,14 +5817,18 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) switch (task->tk_status) { case 0: renew_lease(calldata->server, calldata->timestamp); - do_vfs_lock(calldata->lsp->ls_state->inode, &calldata->fl); + locks_lock_inode_wait(calldata->lsp->ls_state->inode, &calldata->fl); if (nfs4_update_lock_stateid(calldata->lsp, &calldata->res.stateid)) break; + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_EXPIRED: + nfs4_free_revoked_stateid(calldata->server, + &calldata->arg.stateid, + task->tk_msg.rpc_cred); case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_EXPIRED: if (!nfs4_stateid_match(&calldata->arg.stateid, &calldata->lsp->ls_stateid)) rpc_restart_call_prepare(task); @@ -5765,7 +5929,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * mutex_lock(&sp->so_delegreturn_mutex); /* Exclude nfs4_reclaim_open_stateid() - note nesting! */ down_read(&nfsi->rwsem); - if (do_vfs_lock(inode, request) == -ENOENT) { + if (locks_lock_inode_wait(inode, request) == -ENOENT) { up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); goto out; @@ -5906,7 +6070,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) data->timestamp); if (data->arg.new_lock) { data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); - if (do_vfs_lock(lsp->ls_state->inode, &data->fl) < 0) { + if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) { rpc_restart_call_prepare(task); break; } @@ -5965,6 +6129,7 @@ static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_ { switch (error) { case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; if (new_lock_owner != 0 || @@ -5973,7 +6138,6 @@ static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_ break; case -NFS4ERR_STALE_STATEID: lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; - case -NFS4ERR_EXPIRED: nfs4_schedule_lease_recovery(server->nfs_client); }; } @@ -6083,52 +6247,19 @@ out: } #if defined(CONFIG_NFS_V4_1) -/** - * nfs41_check_expired_locks - possibly free a lock stateid - * - * @state: NFSv4 state for an inode - * - * Returns NFS_OK if recovery for this stateid is now finished. - * Otherwise a negative NFS4ERR value is returned. - */ -static int nfs41_check_expired_locks(struct nfs4_state *state) -{ - int status, ret = -NFS4ERR_BAD_STATEID; - struct nfs4_lock_state *lsp; - struct nfs_server *server = NFS_SERVER(state->inode); - - list_for_each_entry(lsp, &state->lock_states, ls_locks) { - if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { - struct rpc_cred *cred = lsp->ls_state->owner->so_cred; - - status = nfs41_test_stateid(server, - &lsp->ls_stateid, - cred); - trace_nfs4_test_lock_stateid(state, lsp, status); - if (status != NFS_OK) { - /* Free the stateid unless the server - * informs us the stateid is unrecognized. */ - if (status != -NFS4ERR_BAD_STATEID) - nfs41_free_stateid(server, - &lsp->ls_stateid, - cred); - clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); - ret = status; - } - } - }; - - return ret; -} - static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request) { - int status = NFS_OK; + struct nfs4_lock_state *lsp; + int status; - if (test_bit(LK_STATE_IN_USE, &state->flags)) - status = nfs41_check_expired_locks(state); - if (status != NFS_OK) - status = nfs4_lock_expired(state, request); + status = nfs4_set_lock_state(state, request); + if (status != 0) + return status; + lsp = request->fl_u.nfs4_fl.owner; + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) || + test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) + return 0; + status = nfs4_lock_expired(state, request); return status; } #endif @@ -6138,17 +6269,10 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs4_state_owner *sp = state->owner; unsigned char fl_flags = request->fl_flags; - int status = -ENOLCK; + int status; - if ((fl_flags & FL_POSIX) && - !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) - goto out; - /* Is this a delegated open? */ - status = nfs4_set_lock_state(state, request); - if (status != 0) - goto out; request->fl_flags |= FL_ACCESS; - status = do_vfs_lock(state->inode, request); + status = locks_lock_inode_wait(state->inode, request); if (status < 0) goto out; mutex_lock(&sp->so_delegreturn_mutex); @@ -6157,7 +6281,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock /* Yes: cache locks! */ /* ...but avoid races with delegation recall... */ request->fl_flags = fl_flags & ~FL_SLEEP; - status = do_vfs_lock(state->inode, request); + status = locks_lock_inode_wait(state->inode, request); up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); goto out; @@ -6188,12 +6312,124 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock * return err; } +#define NFS4_LOCK_MINTIMEOUT (1 * HZ) +#define NFS4_LOCK_MAXTIMEOUT (30 * HZ) + +static int +nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd, + struct file_lock *request) +{ + int status = -ERESTARTSYS; + unsigned long timeout = NFS4_LOCK_MINTIMEOUT; + + while(!signalled()) { + status = nfs4_proc_setlk(state, cmd, request); + if ((status != -EAGAIN) || IS_SETLK(cmd)) + break; + freezable_schedule_timeout_interruptible(timeout); + timeout *= 2; + timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout); + status = -ERESTARTSYS; + } + return status; +} + +#ifdef CONFIG_NFS_V4_1 +struct nfs4_lock_waiter { + struct task_struct *task; + struct inode *inode; + struct nfs_lowner *owner; + bool notified; +}; + +static int +nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key) +{ + int ret; + struct cb_notify_lock_args *cbnl = key; + struct nfs4_lock_waiter *waiter = wait->private; + struct nfs_lowner *lowner = &cbnl->cbnl_owner, + *wowner = waiter->owner; + + /* Only wake if the callback was for the same owner */ + if (lowner->clientid != wowner->clientid || + lowner->id != wowner->id || + lowner->s_dev != wowner->s_dev) + return 0; + + /* Make sure it's for the right inode */ + if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh)) + return 0; + + waiter->notified = true; + + /* override "private" so we can use default_wake_function */ + wait->private = waiter->task; + ret = autoremove_wake_function(wait, mode, flags, key); + wait->private = waiter; + return ret; +} + +static int +nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + int status = -ERESTARTSYS; + unsigned long flags; + struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner; + struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs_client *clp = server->nfs_client; + wait_queue_head_t *q = &clp->cl_lock_waitq; + struct nfs_lowner owner = { .clientid = clp->cl_clientid, + .id = lsp->ls_seqid.owner_id, + .s_dev = server->s_dev }; + struct nfs4_lock_waiter waiter = { .task = current, + .inode = state->inode, + .owner = &owner, + .notified = false }; + wait_queue_t wait; + + /* Don't bother with waitqueue if we don't expect a callback */ + if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags)) + return nfs4_retry_setlk_simple(state, cmd, request); + + init_wait(&wait); + wait.private = &waiter; + wait.func = nfs4_wake_lock_waiter; + add_wait_queue(q, &wait); + + while(!signalled()) { + status = nfs4_proc_setlk(state, cmd, request); + if ((status != -EAGAIN) || IS_SETLK(cmd)) + break; + + status = -ERESTARTSYS; + spin_lock_irqsave(&q->lock, flags); + if (waiter.notified) { + spin_unlock_irqrestore(&q->lock, flags); + continue; + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&q->lock, flags); + + freezable_schedule_timeout_interruptible(NFS4_LOCK_MAXTIMEOUT); + } + + finish_wait(q, &wait); + return status; +} +#else /* !CONFIG_NFS_V4_1 */ +static inline int +nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) +{ + return nfs4_retry_setlk_simple(state, cmd, request); +} +#endif + static int nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) { struct nfs_open_context *ctx; struct nfs4_state *state; - unsigned long timeout = NFS4_LOCK_MINTIMEOUT; int status; /* verify open state */ @@ -6220,6 +6456,11 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) if (state == NULL) return -ENOLCK; + + if ((request->fl_flags & FL_POSIX) && + !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) + return -ENOLCK; + /* * Don't rely on the VFS having checked the file open mode, * since it won't do this for flock() locks. @@ -6234,16 +6475,11 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) return -EBADF; } - do { - status = nfs4_proc_setlk(state, cmd, request); - if ((status != -EAGAIN) || IS_SETLK(cmd)) - break; - timeout = nfs4_set_lock_task_retry(timeout); - status = -ERESTARTSYS; - if (signalled()) - break; - } while(status < 0); - return status; + status = nfs4_set_lock_state(state, request); + if (status != 0) + return status; + + return nfs4_retry_setlk(state, cmd, request); } int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid) @@ -7104,75 +7340,161 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp, return 0; } +struct nfs41_exchange_id_data { + struct nfs41_exchange_id_res res; + struct nfs41_exchange_id_args args; + struct rpc_xprt *xprt; + int rpc_status; +}; + +static void nfs4_exchange_id_done(struct rpc_task *task, void *data) +{ + struct nfs41_exchange_id_data *cdata = + (struct nfs41_exchange_id_data *)data; + struct nfs_client *clp = cdata->args.client; + int status = task->tk_status; + + trace_nfs4_exchange_id(clp, status); + + if (status == 0) + status = nfs4_check_cl_exchange_flags(cdata->res.flags); + + if (cdata->xprt && status == 0) { + status = nfs4_detect_session_trunking(clp, &cdata->res, + cdata->xprt); + goto out; + } + + if (status == 0) + status = nfs4_sp4_select_mode(clp, &cdata->res.state_protect); + + if (status == 0) { + clp->cl_clientid = cdata->res.clientid; + clp->cl_exchange_flags = cdata->res.flags; + /* Client ID is not confirmed */ + if (!(cdata->res.flags & EXCHGID4_FLAG_CONFIRMED_R)) { + clear_bit(NFS4_SESSION_ESTABLISHED, + &clp->cl_session->session_state); + clp->cl_seqid = cdata->res.seqid; + } + + kfree(clp->cl_serverowner); + clp->cl_serverowner = cdata->res.server_owner; + cdata->res.server_owner = NULL; + + /* use the most recent implementation id */ + kfree(clp->cl_implid); + clp->cl_implid = cdata->res.impl_id; + cdata->res.impl_id = NULL; + + if (clp->cl_serverscope != NULL && + !nfs41_same_server_scope(clp->cl_serverscope, + cdata->res.server_scope)) { + dprintk("%s: server_scope mismatch detected\n", + __func__); + set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); + kfree(clp->cl_serverscope); + clp->cl_serverscope = NULL; + } + + if (clp->cl_serverscope == NULL) { + clp->cl_serverscope = cdata->res.server_scope; + cdata->res.server_scope = NULL; + } + /* Save the EXCHANGE_ID verifier session trunk tests */ + memcpy(clp->cl_confirm.data, cdata->args.verifier->data, + sizeof(clp->cl_confirm.data)); + } +out: + cdata->rpc_status = status; + return; +} + +static void nfs4_exchange_id_release(void *data) +{ + struct nfs41_exchange_id_data *cdata = + (struct nfs41_exchange_id_data *)data; + + nfs_put_client(cdata->args.client); + if (cdata->xprt) { + xprt_put(cdata->xprt); + rpc_clnt_xprt_switch_put(cdata->args.client->cl_rpcclient); + } + kfree(cdata->res.impl_id); + kfree(cdata->res.server_scope); + kfree(cdata->res.server_owner); + kfree(cdata); +} + +static const struct rpc_call_ops nfs4_exchange_id_call_ops = { + .rpc_call_done = nfs4_exchange_id_done, + .rpc_release = nfs4_exchange_id_release, +}; + /* * _nfs4_proc_exchange_id() * * Wrapper for EXCHANGE_ID operation. */ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, - u32 sp4_how) + u32 sp4_how, struct rpc_xprt *xprt) { nfs4_verifier verifier; - struct nfs41_exchange_id_args args = { - .verifier = &verifier, - .client = clp, -#ifdef CONFIG_NFS_V4_1_MIGRATION - .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | - EXCHGID4_FLAG_BIND_PRINC_STATEID | - EXCHGID4_FLAG_SUPP_MOVED_MIGR, -#else - .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | - EXCHGID4_FLAG_BIND_PRINC_STATEID, -#endif - }; - struct nfs41_exchange_id_res res = { - 0 - }; - int status; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID], - .rpc_argp = &args, - .rpc_resp = &res, .rpc_cred = cred, }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clp->cl_rpcclient, + .callback_ops = &nfs4_exchange_id_call_ops, + .rpc_message = &msg, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, + }; + struct nfs41_exchange_id_data *calldata; + struct rpc_task *task; + int status = -EIO; + + if (!atomic_inc_not_zero(&clp->cl_count)) + goto out; + + status = -ENOMEM; + calldata = kzalloc(sizeof(*calldata), GFP_NOFS); + if (!calldata) + goto out; - nfs4_init_boot_verifier(clp, &verifier); + if (!xprt) + nfs4_init_boot_verifier(clp, &verifier); status = nfs4_init_uniform_client_string(clp); if (status) - goto out; + goto out_calldata; dprintk("NFS call exchange_id auth=%s, '%s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, clp->cl_owner_id); - res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), - GFP_NOFS); - if (unlikely(res.server_owner == NULL)) { - status = -ENOMEM; - goto out; - } + calldata->res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), + GFP_NOFS); + status = -ENOMEM; + if (unlikely(calldata->res.server_owner == NULL)) + goto out_calldata; - res.server_scope = kzalloc(sizeof(struct nfs41_server_scope), + calldata->res.server_scope = kzalloc(sizeof(struct nfs41_server_scope), GFP_NOFS); - if (unlikely(res.server_scope == NULL)) { - status = -ENOMEM; + if (unlikely(calldata->res.server_scope == NULL)) goto out_server_owner; - } - res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS); - if (unlikely(res.impl_id == NULL)) { - status = -ENOMEM; + calldata->res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS); + if (unlikely(calldata->res.impl_id == NULL)) goto out_server_scope; - } switch (sp4_how) { case SP4_NONE: - args.state_protect.how = SP4_NONE; + calldata->args.state_protect.how = SP4_NONE; break; case SP4_MACH_CRED: - args.state_protect = nfs4_sp4_mach_cred_request; + calldata->args.state_protect = nfs4_sp4_mach_cred_request; break; default: @@ -7181,56 +7503,42 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, status = -EINVAL; goto out_impl_id; } + if (xprt) { + calldata->xprt = xprt; + task_setup_data.rpc_xprt = xprt; + task_setup_data.flags = + RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC; + calldata->args.verifier = &clp->cl_confirm; + } else { + calldata->args.verifier = &verifier; + } + calldata->args.client = clp; +#ifdef CONFIG_NFS_V4_1_MIGRATION + calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | + EXCHGID4_FLAG_BIND_PRINC_STATEID | + EXCHGID4_FLAG_SUPP_MOVED_MIGR, +#else + calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | + EXCHGID4_FLAG_BIND_PRINC_STATEID, +#endif + msg.rpc_argp = &calldata->args; + msg.rpc_resp = &calldata->res; + task_setup_data.callback_data = calldata; - status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); - trace_nfs4_exchange_id(clp, status); - if (status == 0) - status = nfs4_check_cl_exchange_flags(res.flags); - - if (status == 0) - status = nfs4_sp4_select_mode(clp, &res.state_protect); - - if (status == 0) { - clp->cl_clientid = res.clientid; - clp->cl_exchange_flags = res.flags; - /* Client ID is not confirmed */ - if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) { - clear_bit(NFS4_SESSION_ESTABLISHED, - &clp->cl_session->session_state); - clp->cl_seqid = res.seqid; - } - - kfree(clp->cl_serverowner); - clp->cl_serverowner = res.server_owner; - res.server_owner = NULL; - - /* use the most recent implementation id */ - kfree(clp->cl_implid); - clp->cl_implid = res.impl_id; - res.impl_id = NULL; - - if (clp->cl_serverscope != NULL && - !nfs41_same_server_scope(clp->cl_serverscope, - res.server_scope)) { - dprintk("%s: server_scope mismatch detected\n", - __func__); - set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); - kfree(clp->cl_serverscope); - clp->cl_serverscope = NULL; - } - - if (clp->cl_serverscope == NULL) { - clp->cl_serverscope = res.server_scope; - res.server_scope = NULL; - } + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + status = PTR_ERR(task); + goto out_impl_id; } -out_impl_id: - kfree(res.impl_id); -out_server_scope: - kfree(res.server_scope); -out_server_owner: - kfree(res.server_owner); + if (!xprt) { + status = rpc_wait_for_completion_task(task); + if (!status) + status = calldata->rpc_status; + } else /* session trunking test */ + status = calldata->rpc_status; + + rpc_put_task(task); out: if (clp->cl_implid != NULL) dprintk("NFS reply exchange_id: Server Implementation ID: " @@ -7240,6 +7548,16 @@ out: clp->cl_implid->date.nseconds); dprintk("NFS reply exchange_id: %d\n", status); return status; + +out_impl_id: + kfree(calldata->res.impl_id); +out_server_scope: + kfree(calldata->res.server_scope); +out_server_owner: + kfree(calldata->res.server_owner); +out_calldata: + kfree(calldata); + goto out; } /* @@ -7262,14 +7580,45 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) /* try SP4_MACH_CRED if krb5i/p */ if (authflavor == RPC_AUTH_GSS_KRB5I || authflavor == RPC_AUTH_GSS_KRB5P) { - status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED); + status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED, NULL); if (!status) return 0; } /* try SP4_NONE */ - return _nfs4_proc_exchange_id(clp, cred, SP4_NONE); + return _nfs4_proc_exchange_id(clp, cred, SP4_NONE, NULL); +} + +/** + * nfs4_test_session_trunk + * + * This is an add_xprt_test() test function called from + * rpc_clnt_setup_test_and_add_xprt. + * + * The rpc_xprt_switch is referrenced by rpc_clnt_setup_test_and_add_xprt + * and is dereferrenced in nfs4_exchange_id_release + * + * Upon success, add the new transport to the rpc_clnt + * + * @clnt: struct rpc_clnt to get new transport + * @xprt: the rpc_xprt to test + * @data: call data for _nfs4_proc_exchange_id. + */ +int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, + void *data) +{ + struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data; + u32 sp4_how; + + dprintk("--> %s try %s\n", __func__, + xprt->address_strings[RPC_DISPLAY_ADDR]); + + sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED); + + /* Test connection for session trunking. Async exchange_id call */ + return _nfs4_proc_exchange_id(adata->clp, adata->cred, sp4_how, xprt); } +EXPORT_SYMBOL_GPL(nfs4_test_session_trunk); static int _nfs4_proc_destroy_clientid(struct nfs_client *clp, struct rpc_cred *cred) @@ -7463,7 +7812,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args, args->bc_attrs.max_resp_sz = max_bc_payload; args->bc_attrs.max_resp_sz_cached = 0; args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS; - args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS; + args->bc_attrs.max_reqs = min_t(unsigned short, max_session_cb_slots, 1); dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u " "max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n", @@ -7510,10 +7859,9 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args return -EINVAL; if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached) return -EINVAL; - /* These would render the backchannel useless: */ - if (rcvd->max_ops != sent->max_ops) + if (rcvd->max_ops > sent->max_ops) return -EINVAL; - if (rcvd->max_reqs != sent->max_reqs) + if (rcvd->max_reqs > sent->max_reqs) return -EINVAL; out: return 0; @@ -7982,6 +8330,8 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, case -NFS4ERR_RECALLCONFLICT: status = -ERECALLCONFLICT; break; + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: exception->timeout = 0; @@ -7993,6 +8343,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, &lgp->args.ctx->state->stateid)) { spin_unlock(&inode->i_lock); exception->state = lgp->args.ctx->state; + exception->stateid = &lgp->args.stateid; break; } @@ -8591,6 +8942,24 @@ static int _nfs41_test_stateid(struct nfs_server *server, return -res.status; } +static void nfs4_handle_delay_or_session_error(struct nfs_server *server, + int err, struct nfs4_exception *exception) +{ + exception->retry = 0; + switch(err) { + case -NFS4ERR_DELAY: + case -NFS4ERR_RETRY_UNCACHED_REP: + nfs4_handle_exception(server, err, exception); + break; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_DEADSESSION: + nfs4_do_handle_exception(server, err, exception); + } +} + /** * nfs41_test_stateid - perform a TEST_STATEID operation * @@ -8610,9 +8979,7 @@ static int nfs41_test_stateid(struct nfs_server *server, int err; do { err = _nfs41_test_stateid(server, stateid, cred); - if (err != -NFS4ERR_DELAY) - break; - nfs4_handle_exception(server, err, &exception); + nfs4_handle_delay_or_session_error(server, err, &exception); } while (exception.retry); return err; } @@ -8657,7 +9024,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = { }; static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, - nfs4_stateid *stateid, + const nfs4_stateid *stateid, struct rpc_cred *cred, bool privileged) { @@ -8687,7 +9054,7 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, msg.rpc_argp = &data->args; msg.rpc_resp = &data->res; - nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); if (privileged) nfs4_set_sequence_privileged(&data->args.seq_args); @@ -8700,38 +9067,31 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server, * @server: server / transport on which to perform the operation * @stateid: state ID to release * @cred: credential + * @is_recovery: set to true if this call needs to be privileged * - * Returns NFS_OK if the server freed "stateid". Otherwise a - * negative NFS4ERR value is returned. + * Note: this function is always asynchronous. */ static int nfs41_free_stateid(struct nfs_server *server, - nfs4_stateid *stateid, - struct rpc_cred *cred) + const nfs4_stateid *stateid, + struct rpc_cred *cred, + bool is_recovery) { struct rpc_task *task; - int ret; - task = _nfs41_free_stateid(server, stateid, cred, true); + task = _nfs41_free_stateid(server, stateid, cred, is_recovery); if (IS_ERR(task)) return PTR_ERR(task); - ret = rpc_wait_for_completion_task(task); - if (!ret) - ret = task->tk_status; rpc_put_task(task); - return ret; + return 0; } static void nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) { - struct rpc_task *task; struct rpc_cred *cred = lsp->ls_state->owner->so_cred; - task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false); + nfs41_free_stateid(server, &lsp->ls_stateid, cred, false); nfs4_free_lock_state(server, lsp); - if (IS_ERR(task)) - return; - rpc_put_task(task); } static bool nfs41_match_stateid(const nfs4_stateid *s1, @@ -8835,6 +9195,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .match_stateid = nfs4_match_stateid, .find_root_sec = nfs4_find_root_sec, .free_lock_state = nfs4_release_lockowner, + .test_and_free_expired = nfs40_test_and_free_expired_stateid, .alloc_seqid = nfs_alloc_seqid, .call_sync_ops = &nfs40_call_sync_ops, .reboot_recovery_ops = &nfs40_reboot_recovery_ops, @@ -8862,7 +9223,9 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .match_stateid = nfs41_match_stateid, .find_root_sec = nfs41_find_root_sec, .free_lock_state = nfs41_free_lock_state, + .test_and_free_expired = nfs41_test_and_free_expired_stateid, .alloc_seqid = nfs_alloc_no_seqid, + .session_trunk = nfs4_test_session_trunk, .call_sync_ops = &nfs41_call_sync_ops, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, @@ -8891,7 +9254,9 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { .find_root_sec = nfs41_find_root_sec, .free_lock_state = nfs41_free_lock_state, .call_sync_ops = &nfs41_call_sync_ops, + .test_and_free_expired = nfs41_test_and_free_expired_stateid, .alloc_seqid = nfs_alloc_no_seqid, + .session_trunk = nfs4_test_session_trunk, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, .state_renewal_ops = &nfs41_state_renewal_ops, diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index f703b755351b..dae385500005 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -9,6 +9,7 @@ /* maximum number of slots to use */ #define NFS4_DEF_SLOT_TABLE_SIZE (64U) +#define NFS4_DEF_CB_SLOT_TABLE_SIZE (1U) #define NFS4_MAX_SLOT_TABLE (1024U) #define NFS4_NO_SLOT ((u32)-1) @@ -22,6 +23,7 @@ struct nfs4_slot { u32 slot_nr; u32 seq_nr; unsigned int interrupted : 1, + privileged : 1, seq_done : 1; }; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index cada00aa5096..5f4281ec5f72 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -991,6 +991,8 @@ int nfs4_select_rw_stateid(struct nfs4_state *state, { int ret; + if (!nfs4_valid_open_stateid(state)) + return -EIO; if (cred != NULL) *cred = NULL; ret = nfs4_copy_lock_stateid(dst, state, lockowner); @@ -1303,6 +1305,8 @@ void nfs4_schedule_path_down_recovery(struct nfs_client *clp) static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) { + if (!nfs4_valid_open_stateid(state)) + return 0; set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); /* Don't recover state that expired before the reboot */ if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) { @@ -1316,6 +1320,8 @@ static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_st int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) { + if (!nfs4_valid_open_stateid(state)) + return 0; set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags); @@ -1327,9 +1333,8 @@ int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_ { struct nfs_client *clp = server->nfs_client; - if (!nfs4_valid_open_stateid(state)) + if (!nfs4_state_mark_reclaim_nograce(clp, state)) return -EBADF; - nfs4_state_mark_reclaim_nograce(clp, state); dprintk("%s: scheduling stateid recovery for server %s\n", __func__, clp->cl_hostname); nfs4_schedule_state_manager(clp); @@ -1337,6 +1342,35 @@ int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_ } EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery); +static struct nfs4_lock_state * +nfs_state_find_lock_state_by_stateid(struct nfs4_state *state, + const nfs4_stateid *stateid) +{ + struct nfs4_lock_state *pos; + + list_for_each_entry(pos, &state->lock_states, ls_locks) { + if (!test_bit(NFS_LOCK_INITIALIZED, &pos->ls_flags)) + continue; + if (nfs4_stateid_match_other(&pos->ls_stateid, stateid)) + return pos; + } + return NULL; +} + +static bool nfs_state_lock_state_matches_stateid(struct nfs4_state *state, + const nfs4_stateid *stateid) +{ + bool found = false; + + if (test_bit(LK_STATE_IN_USE, &state->flags)) { + spin_lock(&state->state_lock); + if (nfs_state_find_lock_state_by_stateid(state, stateid)) + found = true; + spin_unlock(&state->state_lock); + } + return found; +} + void nfs_inode_find_state_and_recover(struct inode *inode, const nfs4_stateid *stateid) { @@ -1351,14 +1385,18 @@ void nfs_inode_find_state_and_recover(struct inode *inode, state = ctx->state; if (state == NULL) continue; - if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) + if (nfs4_stateid_match_other(&state->stateid, stateid) && + nfs4_state_mark_reclaim_nograce(clp, state)) { + found = true; continue; - if (!nfs4_stateid_match(&state->stateid, stateid)) - continue; - nfs4_state_mark_reclaim_nograce(clp, state); - found = true; + } + if (nfs_state_lock_state_matches_stateid(state, stateid) && + nfs4_state_mark_reclaim_nograce(clp, state)) + found = true; } spin_unlock(&inode->i_lock); + + nfs_inode_find_delegation_state_and_recover(inode, stateid); if (found) nfs4_schedule_state_manager(clp); } @@ -1498,6 +1536,9 @@ restart: __func__, status); case -ENOENT: case -ENOMEM: + case -EACCES: + case -EROFS: + case -EIO: case -ESTALE: /* Open state on this file cannot be recovered */ nfs4_state_mark_recovery_failed(state, status); @@ -1656,15 +1697,9 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) put_rpccred(cred); } -static void nfs_delegation_clear_all(struct nfs_client *clp) -{ - nfs_delegation_mark_reclaim(clp); - nfs_delegation_reap_unclaimed(clp); -} - static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) { - nfs_delegation_clear_all(clp); + nfs_mark_test_expired_all_delegations(clp); nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); } @@ -2195,7 +2230,7 @@ static void nfs41_handle_all_state_revoked(struct nfs_client *clp) static void nfs41_handle_some_state_revoked(struct nfs_client *clp) { - nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); + nfs4_state_start_reclaim_nograce(clp); nfs4_schedule_state_manager(clp); dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname); @@ -2227,13 +2262,22 @@ static void nfs41_handle_cb_path_down(struct nfs_client *clp) nfs4_schedule_state_manager(clp); } -void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) +void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags, + bool recovery) { if (!flags) return; dprintk("%s: \"%s\" (client ID %llx) flags=0x%08x\n", __func__, clp->cl_hostname, clp->cl_clientid, flags); + /* + * If we're called from the state manager thread, then assume we're + * already handling the RECLAIM_NEEDED and/or STATE_REVOKED. + * Those flags are expected to remain set until we're done + * recovering (see RFC5661, section 18.46.3). + */ + if (recovery) + goto out_recovery; if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) nfs41_handle_server_reboot(clp); @@ -2246,6 +2290,7 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) nfs4_schedule_lease_moved_recovery(clp); if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) nfs41_handle_recallable_state_revoked(clp); +out_recovery: if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT) nfs41_handle_backchannel_fault(clp); else if (flags & (SEQ4_STATUS_CB_PATH_DOWN | @@ -2410,6 +2455,13 @@ static void nfs4_state_manager(struct nfs_client *clp) nfs4_state_end_reclaim_reboot(clp); } + /* Detect expired delegations... */ + if (test_and_clear_bit(NFS4CLNT_DELEGATION_EXPIRED, &clp->cl_state)) { + section = "detect expired delegations"; + nfs_reap_expired_delegations(clp); + continue; + } + /* Now recover expired state... */ if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { section = "reclaim nograce"; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 7bd3a5c09d31..fc89e5ed07ee 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1850,7 +1850,7 @@ static void encode_create_session(struct xdr_stream *xdr, *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */ /* authsys_parms rfc1831 */ - *p++ = cpu_to_be32(nn->boot_time.tv_nsec); /* stamp */ + *p++ = cpu_to_be32(ktime_to_ns(nn->boot_time)); /* stamp */ p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen); *p++ = cpu_to_be32(0); /* UID */ *p++ = cpu_to_be32(0); /* GID */ @@ -4725,34 +4725,37 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, } /* - * Decode potentially multiple layout types. Currently we only support - * one layout driver per file system. + * Decode potentially multiple layout types. */ -static int decode_first_pnfs_layout_type(struct xdr_stream *xdr, - uint32_t *layouttype) +static int decode_pnfs_layout_types(struct xdr_stream *xdr, + struct nfs_fsinfo *fsinfo) { __be32 *p; - int num; + uint32_t i; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - num = be32_to_cpup(p); + fsinfo->nlayouttypes = be32_to_cpup(p); /* pNFS is not supported by the underlying file system */ - if (num == 0) { - *layouttype = 0; + if (fsinfo->nlayouttypes == 0) return 0; - } - if (num > 1) - printk(KERN_INFO "NFS: %s: Warning: Multiple pNFS layout " - "drivers per filesystem not supported\n", __func__); /* Decode and set first layout type, move xdr->p past unused types */ - p = xdr_inline_decode(xdr, num * 4); + p = xdr_inline_decode(xdr, fsinfo->nlayouttypes * 4); if (unlikely(!p)) goto out_overflow; - *layouttype = be32_to_cpup(p); + + /* If we get too many, then just cap it at the max */ + if (fsinfo->nlayouttypes > NFS_MAX_LAYOUT_TYPES) { + printk(KERN_INFO "NFS: %s: Warning: Too many (%u) pNFS layout types\n", + __func__, fsinfo->nlayouttypes); + fsinfo->nlayouttypes = NFS_MAX_LAYOUT_TYPES; + } + + for(i = 0; i < fsinfo->nlayouttypes; ++i) + fsinfo->layouttype[i] = be32_to_cpup(p++); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -4764,7 +4767,7 @@ out_overflow: * Note we must ensure that layouttype is set in any non-error case. */ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, - uint32_t *layouttype) + struct nfs_fsinfo *fsinfo) { int status = 0; @@ -4772,10 +4775,9 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U))) return -EIO; if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) { - status = decode_first_pnfs_layout_type(xdr, layouttype); + status = decode_pnfs_layout_types(xdr, fsinfo); bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES; - } else - *layouttype = 0; + } return status; } @@ -4856,7 +4858,7 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo) status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta); if (status != 0) goto xdr_error; - status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype); + status = decode_attr_pnfstype(xdr, bitmap, fsinfo); if (status != 0) goto xdr_error; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 2c93a85eda51..56b2d96f9103 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -30,6 +30,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_page.h> #include <linux/module.h> +#include <linux/sort.h> #include "internal.h" #include "pnfs.h" #include "iostat.h" @@ -99,35 +100,79 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss) } /* + * When the server sends a list of layout types, we choose one in the order + * given in the list below. + * + * FIXME: should this list be configurable in some fashion? module param? + * mount option? something else? + */ +static const u32 ld_prefs[] = { + LAYOUT_SCSI, + LAYOUT_BLOCK_VOLUME, + LAYOUT_OSD2_OBJECTS, + LAYOUT_FLEX_FILES, + LAYOUT_NFSV4_1_FILES, + 0 +}; + +static int +ld_cmp(const void *e1, const void *e2) +{ + u32 ld1 = *((u32 *)e1); + u32 ld2 = *((u32 *)e2); + int i; + + for (i = 0; ld_prefs[i] != 0; i++) { + if (ld1 == ld_prefs[i]) + return -1; + + if (ld2 == ld_prefs[i]) + return 1; + } + return 0; +} + +/* * Try to set the server's pnfs module to the pnfs layout type specified by id. * Currently only one pNFS layout driver per filesystem is supported. * - * @id layout type. Zero (illegal layout type) indicates pNFS not in use. + * @ids array of layout types supported by MDS. */ void set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh, - u32 id) + struct nfs_fsinfo *fsinfo) { struct pnfs_layoutdriver_type *ld_type = NULL; + u32 id; + int i; - if (id == 0) - goto out_no_driver; if (!(server->nfs_client->cl_exchange_flags & (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) { - printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n", - __func__, id, server->nfs_client->cl_exchange_flags); + printk(KERN_ERR "NFS: %s: cl_exchange_flags 0x%x\n", + __func__, server->nfs_client->cl_exchange_flags); goto out_no_driver; } - ld_type = find_pnfs_driver(id); - if (!ld_type) { - request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id); + + sort(fsinfo->layouttype, fsinfo->nlayouttypes, + sizeof(*fsinfo->layouttype), ld_cmp, NULL); + + for (i = 0; i < fsinfo->nlayouttypes; i++) { + id = fsinfo->layouttype[i]; ld_type = find_pnfs_driver(id); if (!ld_type) { - dprintk("%s: No pNFS module found for %u.\n", - __func__, id); - goto out_no_driver; + request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, + id); + ld_type = find_pnfs_driver(id); } + if (ld_type) + break; + } + + if (!ld_type) { + dprintk("%s: No pNFS module found!\n", __func__); + goto out_no_driver; } + server->pnfs_curr_ld = ld_type; if (ld_type->set_layoutdriver && ld_type->set_layoutdriver(server, mntfh)) { @@ -2185,10 +2230,8 @@ static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr) */ void pnfs_ld_read_done(struct nfs_pgio_header *hdr) { - if (likely(!hdr->pnfs_error)) { - __nfs4_read_done_cb(hdr); + if (likely(!hdr->pnfs_error)) hdr->mds_ops->rpc_call_done(&hdr->task, hdr); - } trace_nfs4_pnfs_read(hdr, hdr->pnfs_error); if (unlikely(hdr->pnfs_error)) pnfs_ld_handle_read_error(hdr); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 31d99b2927b0..5c295512c967 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -236,7 +236,7 @@ void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_put_lseg(struct pnfs_layout_segment *lseg); void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg); -void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); +void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *); void unset_pnfs_layoutdriver(struct nfs_server *); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); @@ -657,7 +657,8 @@ pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) } static inline void set_pnfs_layoutdriver(struct nfs_server *s, - const struct nfs_fh *mntfh, u32 id) + const struct nfs_fh *mntfh, + struct nfs_fsinfo *fsinfo) { } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index f3468b57a32a..53b4705abcc7 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -690,13 +690,50 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, dprintk("%s: DS %s: trying address %s\n", __func__, ds->ds_remotestr, da->da_remotestr); - clp = nfs4_set_ds_client(mds_srv, - (struct sockaddr *)&da->da_addr, - da->da_addrlen, IPPROTO_TCP, - timeo, retrans, minor_version, - au_flavor); - if (!IS_ERR(clp)) - break; + if (!IS_ERR(clp) && clp->cl_mvops->session_trunk) { + struct xprt_create xprt_args = { + .ident = XPRT_TRANSPORT_TCP, + .net = clp->cl_net, + .dstaddr = (struct sockaddr *)&da->da_addr, + .addrlen = da->da_addrlen, + .servername = clp->cl_hostname, + }; + struct nfs4_add_xprt_data xprtdata = { + .clp = clp, + .cred = nfs4_get_clid_cred(clp), + }; + struct rpc_add_xprt_test rpcdata = { + .add_xprt_test = clp->cl_mvops->session_trunk, + .data = &xprtdata, + }; + + /** + * Test this address for session trunking and + * add as an alias + */ + rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args, + rpc_clnt_setup_test_and_add_xprt, + &rpcdata); + if (xprtdata.cred) + put_rpccred(xprtdata.cred); + } else { + clp = nfs4_set_ds_client(mds_srv, + (struct sockaddr *)&da->da_addr, + da->da_addrlen, IPPROTO_TCP, + timeo, retrans, minor_version, + au_flavor); + if (IS_ERR(clp)) + continue; + + status = nfs4_init_ds_session(clp, + mds_srv->nfs_client->cl_lease_time); + if (status) { + nfs_put_client(clp); + clp = ERR_PTR(-EIO); + continue; + } + + } } if (IS_ERR(clp)) { @@ -704,18 +741,11 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, goto out; } - status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time); - if (status) - goto out_put; - smp_wmb(); ds->ds_clp = clp; dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr); out: return status; -out_put: - nfs_put_client(clp); - goto out; } /* diff --git a/fs/nfs/super.c b/fs/nfs/super.c index d39601381adf..001796bcd6c8 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2848,19 +2848,23 @@ out_invalid_transport_udp: * NFS client for backwards compatibility */ unsigned int nfs_callback_set_tcpport; +unsigned short nfs_callback_nr_threads; /* Default cache timeout is 10 minutes */ unsigned int nfs_idmap_cache_timeout = 600; /* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */ bool nfs4_disable_idmapping = true; unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE; +unsigned short max_session_cb_slots = NFS4_DEF_CB_SLOT_TABLE_SIZE; unsigned short send_implementation_id = 1; char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = ""; bool recover_lost_locks = false; +EXPORT_SYMBOL_GPL(nfs_callback_nr_threads); EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport); EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout); EXPORT_SYMBOL_GPL(nfs4_disable_idmapping); EXPORT_SYMBOL_GPL(max_session_slots); +EXPORT_SYMBOL_GPL(max_session_cb_slots); EXPORT_SYMBOL_GPL(send_implementation_id); EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier); EXPORT_SYMBOL_GPL(recover_lost_locks); @@ -2887,6 +2891,9 @@ static const struct kernel_param_ops param_ops_portnr = { #define param_check_portnr(name, p) __param_check(name, p, unsigned int); module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); +module_param_named(callback_nr_threads, nfs_callback_nr_threads, ushort, 0644); +MODULE_PARM_DESC(callback_nr_threads, "Number of threads that will be " + "assigned to the NFSv4 callback channels."); module_param(nfs_idmap_cache_timeout, int, 0644); module_param(nfs4_disable_idmapping, bool, 0644); module_param_string(nfs4_unique_id, nfs4_client_id_uniquifier, @@ -2896,6 +2903,9 @@ MODULE_PARM_DESC(nfs4_disable_idmapping, module_param(max_session_slots, ushort, 0644); MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 " "requests the client will negotiate"); +module_param(max_session_cb_slots, ushort, 0644); +MODULE_PARM_DESC(max_session_slots, "Maximum number of parallel NFSv4.1 " + "callbacks the client will process for a given server"); module_param(send_implementation_id, ushort, 0644); MODULE_PARM_DESC(send_implementation_id, "Send implementation ID with NFSv4.1 exchange_id"); diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c index df880e9fa71f..b67287383010 100644 --- a/fs/nfsd/flexfilelayout.c +++ b/fs/nfsd/flexfilelayout.c @@ -126,6 +126,7 @@ nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp, const struct nfsd4_layout_ops ff_layout_ops = { .notify_types = NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE, + .disable_recalls = true, .proc_getdeviceinfo = nfsd4_ff_proc_getdeviceinfo, .encode_getdeviceinfo = nfsd4_ff_encode_getdeviceinfo, .proc_layoutget = nfsd4_ff_proc_layoutget, diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 5fbf3bbd00d0..b10d557f9c9e 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -84,6 +84,7 @@ struct nfsd_net { struct list_head client_lru; struct list_head close_lru; struct list_head del_recall_lru; + struct list_head blocked_locks_lru; struct delayed_work laundromat_work; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 04c68d900324..211dc2aed8e1 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -448,7 +448,7 @@ static int decode_cb_sequence4res(struct xdr_stream *xdr, { int status; - if (cb->cb_minorversion == 0) + if (cb->cb_clp->cl_minorversion == 0) return 0; status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_seq_status); @@ -485,7 +485,7 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, const struct nfs4_delegation *dp = cb_to_delegation(cb); struct nfs4_cb_compound_hdr hdr = { .ident = cb->cb_clp->cl_cb_ident, - .minorversion = cb->cb_minorversion, + .minorversion = cb->cb_clp->cl_minorversion, }; encode_cb_compound4args(xdr, &hdr); @@ -594,7 +594,7 @@ static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, container_of(cb, struct nfs4_layout_stateid, ls_recall); struct nfs4_cb_compound_hdr hdr = { .ident = 0, - .minorversion = cb->cb_minorversion, + .minorversion = cb->cb_clp->cl_minorversion, }; encode_cb_compound4args(xdr, &hdr); @@ -623,6 +623,62 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, } #endif /* CONFIG_NFSD_PNFS */ +static void encode_stateowner(struct xdr_stream *xdr, struct nfs4_stateowner *so) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 8 + 4 + so->so_owner.len); + p = xdr_encode_opaque_fixed(p, &so->so_client->cl_clientid, 8); + xdr_encode_opaque(p, so->so_owner.data, so->so_owner.len); +} + +static void nfs4_xdr_enc_cb_notify_lock(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfsd4_callback *cb) +{ + const struct nfsd4_blocked_lock *nbl = + container_of(cb, struct nfsd4_blocked_lock, nbl_cb); + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.fl_owner; + struct nfs4_cb_compound_hdr hdr = { + .ident = 0, + .minorversion = cb->cb_clp->cl_minorversion, + }; + + __be32 *p; + + BUG_ON(hdr.minorversion == 0); + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(OP_CB_NOTIFY_LOCK); + encode_nfs_fh4(xdr, &nbl->nbl_fh); + encode_stateowner(xdr, &lo->lo_owner); + hdr.nops++; + + encode_cb_nops(&hdr); +} + +static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfsd4_callback *cb) +{ + struct nfs4_cb_compound_hdr hdr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + + if (cb) { + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + } + return decode_cb_op_status(xdr, OP_CB_NOTIFY_LOCK, &cb->cb_status); +} + /* * RPC procedure tables */ @@ -643,6 +699,7 @@ static struct rpc_procinfo nfs4_cb_procedures[] = { #ifdef CONFIG_NFSD_PNFS PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout), #endif + PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock), }; static struct rpc_version nfs_cb_version4 = { @@ -862,7 +919,6 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) struct nfs4_client *clp = cb->cb_clp; u32 minorversion = clp->cl_minorversion; - cb->cb_minorversion = minorversion; /* * cb_seq_status is only set in decode_cb_sequence4res, * and so will remain 1 if an rpc level failure occurs. diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 2be9602b0221..42aace4fc4c8 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -174,7 +174,8 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) list_del_init(&ls->ls_perfile); spin_unlock(&fp->fi_lock); - vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls); + if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) + vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls); fput(ls->ls_file); if (ls->ls_recalled) @@ -189,6 +190,9 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) struct file_lock *fl; int status; + if (nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) + return 0; + fl = locks_alloc_lock(); if (!fl) return -ENOMEM; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 1fb222752b2b..abb09b580389 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1010,47 +1010,97 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } static __be32 -nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - struct nfsd4_clone *clone) +nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + stateid_t *src_stateid, struct file **src, + stateid_t *dst_stateid, struct file **dst) { - struct file *src, *dst; __be32 status; status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh, - &clone->cl_src_stateid, RD_STATE, - &src, NULL); + src_stateid, RD_STATE, src, NULL); if (status) { dprintk("NFSD: %s: couldn't process src stateid!\n", __func__); goto out; } status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, - &clone->cl_dst_stateid, WR_STATE, - &dst, NULL); + dst_stateid, WR_STATE, dst, NULL); if (status) { dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__); goto out_put_src; } /* fix up for NFS-specific error code */ - if (!S_ISREG(file_inode(src)->i_mode) || - !S_ISREG(file_inode(dst)->i_mode)) { + if (!S_ISREG(file_inode(*src)->i_mode) || + !S_ISREG(file_inode(*dst)->i_mode)) { status = nfserr_wrong_type; goto out_put_dst; } +out: + return status; +out_put_dst: + fput(*dst); +out_put_src: + fput(*src); + goto out; +} + +static __be32 +nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_clone *clone) +{ + struct file *src, *dst; + __be32 status; + + status = nfsd4_verify_copy(rqstp, cstate, &clone->cl_src_stateid, &src, + &clone->cl_dst_stateid, &dst); + if (status) + goto out; + status = nfsd4_clone_file_range(src, clone->cl_src_pos, dst, clone->cl_dst_pos, clone->cl_count); -out_put_dst: fput(dst); -out_put_src: fput(src); out: return status; } static __be32 +nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_copy *copy) +{ + struct file *src, *dst; + __be32 status; + ssize_t bytes; + + status = nfsd4_verify_copy(rqstp, cstate, ©->cp_src_stateid, &src, + ©->cp_dst_stateid, &dst); + if (status) + goto out; + + bytes = nfsd_copy_file_range(src, copy->cp_src_pos, + dst, copy->cp_dst_pos, copy->cp_count); + + if (bytes < 0) + status = nfserrno(bytes); + else { + copy->cp_res.wr_bytes_written = bytes; + copy->cp_res.wr_stable_how = NFS_UNSTABLE; + copy->cp_consecutive = 1; + copy->cp_synchronous = 1; + gen_boot_verifier(©->cp_res.wr_verifier, SVC_NET(rqstp)); + status = nfs_ok; + } + + fput(src); + fput(dst); +out: + return status; +} + +static __be32 nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_fallocate *fallocate, int flags) { @@ -1966,6 +2016,18 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd op_encode_channel_attrs_maxsz) * sizeof(__be32); } +static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* wr_callback */ + + op_encode_stateid_maxsz /* wr_callback */ + + 2 /* wr_count */ + + 1 /* wr_committed */ + + op_encode_verifier_maxsz + + 1 /* cr_consecutive */ + + 1 /* cr_synchronous */) * sizeof(__be32); +} + #ifdef CONFIG_NFSD_PNFS /* * At this stage we don't really know what layout driver will handle the request, @@ -2328,6 +2390,12 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_name = "OP_CLONE", .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, + [OP_COPY] = { + .op_func = (nfsd4op_func)nfsd4_copy, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_COPY", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_copy_rsize, + }, [OP_SEEK] = { .op_func = (nfsd4op_func)nfsd4_seek, .op_name = "OP_SEEK", diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 39bfaba9c99c..9752beb78659 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -99,6 +99,7 @@ static struct kmem_cache *odstate_slab; static void free_session(struct nfsd4_session *); static const struct nfsd4_callback_ops nfsd4_cb_recall_ops; +static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops; static bool is_session_dead(struct nfsd4_session *ses) { @@ -210,6 +211,85 @@ static void nfsd4_put_session(struct nfsd4_session *ses) spin_unlock(&nn->client_lock); } +static struct nfsd4_blocked_lock * +find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh, + struct nfsd_net *nn) +{ + struct nfsd4_blocked_lock *cur, *found = NULL; + + spin_lock(&nn->client_lock); + list_for_each_entry(cur, &lo->lo_blocked, nbl_list) { + if (fh_match(fh, &cur->nbl_fh)) { + list_del_init(&cur->nbl_list); + list_del_init(&cur->nbl_lru); + found = cur; + break; + } + } + spin_unlock(&nn->client_lock); + if (found) + posix_unblock_lock(&found->nbl_lock); + return found; +} + +static struct nfsd4_blocked_lock * +find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, + struct nfsd_net *nn) +{ + struct nfsd4_blocked_lock *nbl; + + nbl = find_blocked_lock(lo, fh, nn); + if (!nbl) { + nbl= kmalloc(sizeof(*nbl), GFP_KERNEL); + if (nbl) { + fh_copy_shallow(&nbl->nbl_fh, fh); + locks_init_lock(&nbl->nbl_lock); + nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client, + &nfsd4_cb_notify_lock_ops, + NFSPROC4_CLNT_CB_NOTIFY_LOCK); + } + } + return nbl; +} + +static void +free_blocked_lock(struct nfsd4_blocked_lock *nbl) +{ + locks_release_private(&nbl->nbl_lock); + kfree(nbl); +} + +static int +nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task) +{ + /* + * Since this is just an optimization, we don't try very hard if it + * turns out not to succeed. We'll requeue it on NFS4ERR_DELAY, and + * just quit trying on anything else. + */ + switch (task->tk_status) { + case -NFS4ERR_DELAY: + rpc_delay(task, 1 * HZ); + return 0; + default: + return 1; + } +} + +static void +nfsd4_cb_notify_lock_release(struct nfsd4_callback *cb) +{ + struct nfsd4_blocked_lock *nbl = container_of(cb, + struct nfsd4_blocked_lock, nbl_cb); + + free_blocked_lock(nbl); +} + +static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = { + .done = nfsd4_cb_notify_lock_done, + .release = nfsd4_cb_notify_lock_release, +}; + static inline struct nfs4_stateowner * nfs4_get_stateowner(struct nfs4_stateowner *sop) { @@ -3224,9 +3304,10 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, goto out; /* cases below refer to rfc 3530 section 14.2.34: */ if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) { - if (conf && !unconf) /* case 2: probable retransmit */ + if (conf && same_verf(&confirm, &conf->cl_confirm)) { + /* case 2: probable retransmit */ status = nfs_ok; - else /* case 4: client hasn't noticed we rebooted yet? */ + } else /* case 4: client hasn't noticed we rebooted yet? */ status = nfserr_stale_clientid; goto out; } @@ -4410,9 +4491,11 @@ out: * To finish the open response, we just need to set the rflags. */ open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; - if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) && - !nfsd4_has_session(&resp->cstate)) + if (nfsd4_has_session(&resp->cstate)) + open->op_rflags |= NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK; + else if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED)) open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; + if (dp) nfs4_put_stid(&dp->dl_stid); if (stp) @@ -4501,6 +4584,7 @@ nfs4_laundromat(struct nfsd_net *nn) struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct nfs4_ol_stateid *stp; + struct nfsd4_blocked_lock *nbl; struct list_head *pos, *next, reaplist; time_t cutoff = get_seconds() - nn->nfsd4_lease; time_t t, new_timeo = nn->nfsd4_lease; @@ -4569,6 +4653,41 @@ nfs4_laundromat(struct nfsd_net *nn) } spin_unlock(&nn->client_lock); + /* + * It's possible for a client to try and acquire an already held lock + * that is being held for a long time, and then lose interest in it. + * So, we clean out any un-revisited request after a lease period + * under the assumption that the client is no longer interested. + * + * RFC5661, sec. 9.6 states that the client must not rely on getting + * notifications and must continue to poll for locks, even when the + * server supports them. Thus this shouldn't lead to clients blocking + * indefinitely once the lock does become free. + */ + BUG_ON(!list_empty(&reaplist)); + spin_lock(&nn->client_lock); + while (!list_empty(&nn->blocked_locks_lru)) { + nbl = list_first_entry(&nn->blocked_locks_lru, + struct nfsd4_blocked_lock, nbl_lru); + if (time_after((unsigned long)nbl->nbl_time, + (unsigned long)cutoff)) { + t = nbl->nbl_time - cutoff; + new_timeo = min(new_timeo, t); + break; + } + list_move(&nbl->nbl_lru, &reaplist); + list_del_init(&nbl->nbl_list); + } + spin_unlock(&nn->client_lock); + + while (!list_empty(&reaplist)) { + nbl = list_first_entry(&nn->blocked_locks_lru, + struct nfsd4_blocked_lock, nbl_lru); + list_del_init(&nbl->nbl_lru); + posix_unblock_lock(&nbl->nbl_lock); + free_blocked_lock(nbl); + } + new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); return new_timeo; } @@ -5309,7 +5428,31 @@ nfsd4_fl_put_owner(fl_owner_t owner) nfs4_put_stateowner(&lo->lo_owner); } +static void +nfsd4_lm_notify(struct file_lock *fl) +{ + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)fl->fl_owner; + struct net *net = lo->lo_owner.so_client->net; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd4_blocked_lock *nbl = container_of(fl, + struct nfsd4_blocked_lock, nbl_lock); + bool queue = false; + + /* An empty list means that something else is going to be using it */ + spin_lock(&nn->client_lock); + if (!list_empty(&nbl->nbl_list)) { + list_del_init(&nbl->nbl_list); + list_del_init(&nbl->nbl_lru); + queue = true; + } + spin_unlock(&nn->client_lock); + + if (queue) + nfsd4_run_cb(&nbl->nbl_cb); +} + static const struct lock_manager_operations nfsd_posix_mng_ops = { + .lm_notify = nfsd4_lm_notify, .lm_get_owner = nfsd4_fl_get_owner, .lm_put_owner = nfsd4_fl_put_owner, }; @@ -5407,6 +5550,7 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp); if (!lo) return NULL; + INIT_LIST_HEAD(&lo->lo_blocked); INIT_LIST_HEAD(&lo->lo_owner.so_stateids); lo->lo_owner.so_is_open_owner = 0; lo->lo_owner.so_seqid = lock->lk_new_lock_seqid; @@ -5588,12 +5732,15 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *open_stp = NULL; struct nfs4_file *fp; struct file *filp = NULL; + struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; __be32 status = 0; int lkflg; int err; bool new = false; + unsigned char fl_type; + unsigned int fl_flags = FL_POSIX; struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -5658,46 +5805,55 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!locks_in_grace(net) && lock->lk_reclaim) goto out; - file_lock = locks_alloc_lock(); - if (!file_lock) { - dprintk("NFSD: %s: unable to allocate lock!\n", __func__); - status = nfserr_jukebox; - goto out; - } - fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { - case NFS4_READ_LT: case NFS4_READW_LT: + if (nfsd4_has_session(cstate)) + fl_flags |= FL_SLEEP; + /* Fallthrough */ + case NFS4_READ_LT: spin_lock(&fp->fi_lock); filp = find_readable_file_locked(fp); if (filp) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); spin_unlock(&fp->fi_lock); - file_lock->fl_type = F_RDLCK; + fl_type = F_RDLCK; break; - case NFS4_WRITE_LT: case NFS4_WRITEW_LT: + if (nfsd4_has_session(cstate)) + fl_flags |= FL_SLEEP; + /* Fallthrough */ + case NFS4_WRITE_LT: spin_lock(&fp->fi_lock); filp = find_writeable_file_locked(fp); if (filp) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); spin_unlock(&fp->fi_lock); - file_lock->fl_type = F_WRLCK; + fl_type = F_WRLCK; break; default: status = nfserr_inval; goto out; } + if (!filp) { status = nfserr_openmode; goto out; } + nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn); + if (!nbl) { + dprintk("NFSD: %s: unable to allocate block!\n", __func__); + status = nfserr_jukebox; + goto out; + } + + file_lock = &nbl->nbl_lock; + file_lock->fl_type = fl_type; file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner)); file_lock->fl_pid = current->tgid; file_lock->fl_file = filp; - file_lock->fl_flags = FL_POSIX; + file_lock->fl_flags = fl_flags; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = lock->lk_offset; file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length); @@ -5710,18 +5866,29 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } + if (fl_flags & FL_SLEEP) { + nbl->nbl_time = jiffies; + spin_lock(&nn->client_lock); + list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked); + list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru); + spin_unlock(&nn->client_lock); + } + err = vfs_lock_file(filp, F_SETLK, file_lock, conflock); - switch (-err) { + switch (err) { case 0: /* success! */ nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid); status = 0; break; - case (EAGAIN): /* conflock holds conflicting lock */ + case FILE_LOCK_DEFERRED: + nbl = NULL; + /* Fallthrough */ + case -EAGAIN: /* conflock holds conflicting lock */ status = nfserr_denied; dprintk("NFSD: nfsd4_lock: conflicting lock found!\n"); nfs4_set_lock_denied(conflock, &lock->lk_denied); break; - case (EDEADLK): + case -EDEADLK: status = nfserr_deadlock; break; default: @@ -5730,6 +5897,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, break; } out: + if (nbl) { + /* dequeue it if we queued it before */ + if (fl_flags & FL_SLEEP) { + spin_lock(&nn->client_lock); + list_del_init(&nbl->nbl_list); + list_del_init(&nbl->nbl_lru); + spin_unlock(&nn->client_lock); + } + free_blocked_lock(nbl); + } if (filp) fput(filp); if (lock_stp) { @@ -5753,8 +5930,6 @@ out: if (open_stp) nfs4_put_stid(&open_stp->st_stid); nfsd4_bump_seqid(cstate, status); - if (file_lock) - locks_free_lock(file_lock); if (conflock) locks_free_lock(conflock); return status; @@ -6768,6 +6943,7 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->client_lru); INIT_LIST_HEAD(&nn->close_lru); INIT_LIST_HEAD(&nn->del_recall_lru); + INIT_LIST_HEAD(&nn->blocked_locks_lru); spin_lock_init(&nn->client_lock); INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main); @@ -6865,6 +7041,7 @@ nfs4_state_shutdown_net(struct net *net) struct nfs4_delegation *dp = NULL; struct list_head *pos, *next, reaplist; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd4_blocked_lock *nbl; cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); @@ -6885,6 +7062,24 @@ nfs4_state_shutdown_net(struct net *net) nfs4_put_stid(&dp->dl_stid); } + BUG_ON(!list_empty(&reaplist)); + spin_lock(&nn->client_lock); + while (!list_empty(&nn->blocked_locks_lru)) { + nbl = list_first_entry(&nn->blocked_locks_lru, + struct nfsd4_blocked_lock, nbl_lru); + list_move(&nbl->nbl_lru, &reaplist); + list_del_init(&nbl->nbl_list); + } + spin_unlock(&nn->client_lock); + + while (!list_empty(&reaplist)) { + nbl = list_first_entry(&nn->blocked_locks_lru, + struct nfsd4_blocked_lock, nbl_lru); + list_del_init(&nbl->nbl_lru); + posix_unblock_lock(&nbl->nbl_lock); + free_blocked_lock(nbl); + } + nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 0aa0236a1429..c2d2895a1ec1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1694,6 +1694,30 @@ nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone) } static __be32 +nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) +{ + DECODE_HEAD; + unsigned int tmp; + + status = nfsd4_decode_stateid(argp, ©->cp_src_stateid); + if (status) + return status; + status = nfsd4_decode_stateid(argp, ©->cp_dst_stateid); + if (status) + return status; + + READ_BUF(8 + 8 + 8 + 4 + 4 + 4); + p = xdr_decode_hyper(p, ©->cp_src_pos); + p = xdr_decode_hyper(p, ©->cp_dst_pos); + p = xdr_decode_hyper(p, ©->cp_count); + copy->cp_consecutive = be32_to_cpup(p++); + copy->cp_synchronous = be32_to_cpup(p++); + tmp = be32_to_cpup(p); /* Source server list not supported */ + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) { DECODE_HEAD; @@ -1793,7 +1817,7 @@ static nfsd4_dec nfsd4_dec_ops[] = { /* new operations for NFSv4.2 */ [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, - [OP_COPY] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_COPY] = (nfsd4_dec)nfsd4_decode_copy, [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp, @@ -4062,7 +4086,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, u32 starting_len = xdr->buf->len, needed_len; __be32 *p; - dprintk("%s: err %d\n", __func__, nfserr); + dprintk("%s: err %d\n", __func__, be32_to_cpu(nfserr)); if (nfserr) goto out; @@ -4202,6 +4226,41 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, #endif /* CONFIG_NFSD_PNFS */ static __be32 +nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write) +{ + __be32 *p; + + p = xdr_reserve_space(&resp->xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + + *p++ = cpu_to_be32(0); + p = xdr_encode_hyper(p, write->wr_bytes_written); + *p++ = cpu_to_be32(write->wr_stable_how); + p = xdr_encode_opaque_fixed(p, write->wr_verifier.data, + NFS4_VERIFIER_SIZE); + return nfs_ok; +} + +static __be32 +nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_copy *copy) +{ + __be32 *p; + + if (!nfserr) { + nfserr = nfsd42_encode_write_res(resp, ©->cp_res); + if (nfserr) + return nfserr; + + p = xdr_reserve_space(&resp->xdr, 4 + 4); + *p++ = cpu_to_be32(copy->cp_consecutive); + *p++ = cpu_to_be32(copy->cp_synchronous); + } + return nfserr; +} + +static __be32 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_seek *seek) { @@ -4300,7 +4359,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { /* NFSv4.2 operations */ [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, - [OP_COPY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_COPY] = (nfsd4_enc)nfsd4_encode_copy, [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_noop, [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 65ad0165a94f..36b2af931e06 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1216,6 +1216,8 @@ static __net_init int nfsd_init_net(struct net *net) goto out_idmap_error; nn->nfsd4_lease = 90; /* default lease time */ nn->nfsd4_grace = 90; + nn->clverifier_counter = prandom_u32(); + nn->clientid_counter = prandom_u32(); return 0; out_idmap_error: diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 08188743db53..010aff5c5a79 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -789,6 +789,7 @@ nfserrno (int errno) { nfserr_toosmall, -ETOOSMALL }, { nfserr_serverfault, -ESERVERFAULT }, { nfserr_serverfault, -ENFILE }, + { nfserr_io, -EUCLEAN }, }; int i; @@ -796,7 +797,7 @@ nfserrno (int errno) if (nfs_errtbl[i].syserr == errno) return nfs_errtbl[i].nfserr; } - WARN(1, "nfsd: non-standard errno: %d\n", errno); + WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno); return nfserr_io; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 45007acaf364..a2b65fc56dd6 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -366,14 +366,21 @@ static struct notifier_block nfsd_inet6addr_notifier = { }; #endif +/* Only used under nfsd_mutex, so this atomic may be overkill: */ +static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0); + static void nfsd_last_thread(struct svc_serv *serv, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); + /* check if the notifier still has clients */ + if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { + unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) - unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); + unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif + } + /* * write_ports can create the server without actually starting * any threads--if we get shut down before any threads are @@ -488,10 +495,13 @@ int nfsd_create_serv(struct net *net) } set_max_drc(); - register_inetaddr_notifier(&nfsd_inetaddr_notifier); + /* check if the notifier is already set */ + if (atomic_inc_return(&nfsd_notifier_refcount) == 1) { + register_inetaddr_notifier(&nfsd_inetaddr_notifier); #if IS_ENABLED(CONFIG_IPV6) - register_inet6addr_notifier(&nfsd_inet6addr_notifier); + register_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif + } do_gettimeofday(&nn->nfssvc_boot); /* record boot time */ return 0; } diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index 0c2a716e8741..d27a5aa60022 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -19,6 +19,7 @@ struct nfsd4_deviceid_map { struct nfsd4_layout_ops { u32 notify_types; + bool disable_recalls; __be32 (*proc_getdeviceinfo)(struct super_block *sb, struct svc_rqst *rqstp, diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index b95adf9a1595..c9399366f9df 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -63,7 +63,6 @@ typedef struct { struct nfsd4_callback { struct nfs4_client *cb_clp; - u32 cb_minorversion; struct rpc_message cb_msg; const struct nfsd4_callback_ops *cb_ops; struct work_struct cb_work; @@ -441,11 +440,11 @@ struct nfs4_openowner { /* * Represents a generic "lockowner". Similar to an openowner. References to it * are held by the lock stateids that are created on its behalf. This object is - * a superset of the nfs4_stateowner struct (or would be if it needed any extra - * fields). + * a superset of the nfs4_stateowner struct. */ struct nfs4_lockowner { - struct nfs4_stateowner lo_owner; /* must be first element */ + struct nfs4_stateowner lo_owner; /* must be first element */ + struct list_head lo_blocked; /* blocked file_locks */ }; static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so) @@ -572,6 +571,7 @@ enum nfsd4_cb_op { NFSPROC4_CLNT_CB_RECALL, NFSPROC4_CLNT_CB_LAYOUT, NFSPROC4_CLNT_CB_SEQUENCE, + NFSPROC4_CLNT_CB_NOTIFY_LOCK, }; /* Returns true iff a is later than b: */ @@ -580,6 +580,20 @@ static inline bool nfsd4_stateid_generation_after(stateid_t *a, stateid_t *b) return (s32)(a->si_generation - b->si_generation) > 0; } +/* + * When a client tries to get a lock on a file, we set one of these objects + * on the blocking lock. When the lock becomes free, we can then issue a + * CB_NOTIFY_LOCK to the server. + */ +struct nfsd4_blocked_lock { + struct list_head nbl_list; + struct list_head nbl_lru; + unsigned long nbl_time; + struct file_lock nbl_lock; + struct knfsd_fh nbl_fh; + struct nfsd4_callback nbl_cb; +}; + struct nfsd4_compound_state; struct nfsd_net; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ff476e654b8f..8ca642fe9b21 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -513,6 +513,22 @@ __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, count)); } +ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, + u64 dst_pos, u64 count) +{ + + /* + * Limit copy to 4MB to prevent indefinitely blocking an nfsd + * thread and client rpc slot. The choice of 4MB is somewhat + * arbitrary. We might instead base this on r/wsize, or make it + * tunable, or use a time instead of a byte limit, or implement + * asynchronous copy. In theory a client could also recognize a + * limit like this and pipeline multiple COPY requests. + */ + count = min_t(u64, count, 1 << 22); + return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0); +} + __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, loff_t len, int flags) diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 3cbb1b33777b..0bf9e7bf5800 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -96,6 +96,8 @@ __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, struct svc_fh *res); __be32 nfsd_link(struct svc_rqst *, struct svc_fh *, char *, int, struct svc_fh *); +ssize_t nfsd_copy_file_range(struct file *, u64, + struct file *, u64, u64); __be32 nfsd_rename(struct svc_rqst *, struct svc_fh *, char *, int, struct svc_fh *, char *, int); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index beea0c5edc51..8fda4abdf3b1 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -503,6 +503,28 @@ struct nfsd4_clone { u64 cl_count; }; +struct nfsd42_write_res { + u64 wr_bytes_written; + u32 wr_stable_how; + nfs4_verifier wr_verifier; +}; + +struct nfsd4_copy { + /* request */ + stateid_t cp_src_stateid; + stateid_t cp_dst_stateid; + u64 cp_src_pos; + u64 cp_dst_pos; + u64 cp_count; + + /* both */ + bool cp_consecutive; + bool cp_synchronous; + + /* response */ + struct nfsd42_write_res cp_res; +}; + struct nfsd4_seek { /* request */ stateid_t seek_stateid; @@ -568,6 +590,7 @@ struct nfsd4_op { struct nfsd4_fallocate allocate; struct nfsd4_fallocate deallocate; struct nfsd4_clone clone; + struct nfsd4_copy copy; struct nfsd4_seek seek; } u; struct nfs4_replay * replay; diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index c47f6fdb111a..49b719dfef95 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -28,3 +28,12 @@ #define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \ cb_sequence_dec_sz + \ op_dec_sz) + +#define NFS4_enc_cb_notify_lock_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 2 + 1 + \ + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ + enc_nfs4_fh_sz) +#define NFS4_dec_cb_notify_lock_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) diff --git a/fs/open.c b/fs/open.c index a7719cfb7257..d3ed8171e8e0 100644 --- a/fs/open.c +++ b/fs/open.c @@ -267,6 +267,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) (mode & ~FALLOC_FL_INSERT_RANGE)) return -EINVAL; + /* Unshare range should only be used with allocate mode. */ + if ((mode & FALLOC_FL_UNSHARE_RANGE) && + (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE))) + return -EINVAL; + if (!(file->f_mode & FMODE_WRITE)) return -EBADF; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 3f803b3a1f82..aeb60f791418 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -57,6 +57,7 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new) ssize_t list_size, size, value_size = 0; char *buf, *name, *value = NULL; int uninitialized_var(error); + size_t slen; if (!(old->d_inode->i_opflags & IOP_XATTR) || !(new->d_inode->i_opflags & IOP_XATTR)) @@ -79,7 +80,16 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new) goto out; } - for (name = buf; name < (buf + list_size); name += strlen(name) + 1) { + for (name = buf; list_size; name += slen) { + slen = strnlen(name, list_size) + 1; + + /* underlying fs providing us with an broken xattr list? */ + if (WARN_ON(slen > list_size)) { + error = -EIO; + break; + } + list_size -= slen; + if (ovl_is_private_xattr(name)) continue; retry: @@ -174,40 +184,6 @@ out_fput: return error; } -static char *ovl_read_symlink(struct dentry *realdentry) -{ - int res; - char *buf; - struct inode *inode = realdentry->d_inode; - mm_segment_t old_fs; - - res = -EINVAL; - if (!inode->i_op->readlink) - goto err; - - res = -ENOMEM; - buf = (char *) __get_free_page(GFP_KERNEL); - if (!buf) - goto err; - - old_fs = get_fs(); - set_fs(get_ds()); - /* The cast to a user pointer is valid due to the set_fs() */ - res = inode->i_op->readlink(realdentry, - (char __user *)buf, PAGE_SIZE - 1); - set_fs(old_fs); - if (res < 0) { - free_page((unsigned long) buf); - goto err; - } - buf[res] = '\0'; - - return buf; - -err: - return ERR_PTR(res); -} - static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat) { struct iattr attr = { @@ -354,19 +330,20 @@ out_cleanup: int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, struct path *lowerpath, struct kstat *stat) { + DEFINE_DELAYED_CALL(done); struct dentry *workdir = ovl_workdir(dentry); int err; struct kstat pstat; struct path parentpath; + struct dentry *lowerdentry = lowerpath->dentry; struct dentry *upperdir; struct dentry *upperdentry; - const struct cred *old_cred; - char *link = NULL; + const char *link = NULL; if (WARN_ON(!workdir)) return -EROFS; - ovl_do_check_copy_up(lowerpath->dentry); + ovl_do_check_copy_up(lowerdentry); ovl_path_upper(parent, &parentpath); upperdir = parentpath.dentry; @@ -376,13 +353,11 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, return err; if (S_ISLNK(stat->mode)) { - link = ovl_read_symlink(lowerpath->dentry); + link = vfs_get_link(lowerdentry, &done); if (IS_ERR(link)) return PTR_ERR(link); } - old_cred = ovl_override_creds(dentry->d_sb); - err = -EIO; if (lock_rename(workdir, upperdir) != NULL) { pr_err("overlayfs: failed to lock workdir+upperdir\n"); @@ -403,19 +378,16 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, } out_unlock: unlock_rename(workdir, upperdir); - revert_creds(old_cred); - - if (link) - free_page((unsigned long) link); + do_delayed_call(&done); return err; } int ovl_copy_up(struct dentry *dentry) { - int err; + int err = 0; + const struct cred *old_cred = ovl_override_creds(dentry->d_sb); - err = 0; while (!err) { struct dentry *next; struct dentry *parent; @@ -447,6 +419,7 @@ int ovl_copy_up(struct dentry *dentry) dput(parent); dput(next); } + revert_creds(old_cred); return err; } diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 5f90ddf778ba..306b6c161840 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -14,6 +14,7 @@ #include <linux/cred.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> +#include <linux/atomic.h> #include "overlayfs.h" void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) @@ -37,8 +38,10 @@ struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry) { struct dentry *temp; char name[20]; + static atomic_t temp_id = ATOMIC_INIT(0); - snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry); + /* counter is allowed to wrap, since temp dentries are ephemeral */ + snprintf(name, sizeof(name), "#%x", atomic_inc_return(&temp_id)); temp = lookup_one_len(name, workdir, strlen(name)); if (!IS_ERR(temp) && temp->d_inode) { diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index c18d6a4ff456..c58f01babf30 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -19,6 +19,7 @@ static int ovl_copy_up_truncate(struct dentry *dentry) struct dentry *parent; struct kstat stat; struct path lowerpath; + const struct cred *old_cred; parent = dget_parent(dentry); err = ovl_copy_up(parent); @@ -26,12 +27,14 @@ static int ovl_copy_up_truncate(struct dentry *dentry) goto out_dput_parent; ovl_path_lower(dentry, &lowerpath); - err = vfs_getattr(&lowerpath, &stat); - if (err) - goto out_dput_parent; - stat.size = 0; - err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat); + old_cred = ovl_override_creds(dentry->d_sb); + err = vfs_getattr(&lowerpath, &stat); + if (!err) { + stat.size = 0; + err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat); + } + revert_creds(old_cred); out_dput_parent: dput(parent); @@ -153,45 +156,18 @@ static const char *ovl_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct dentry *realdentry; - struct inode *realinode; const struct cred *old_cred; const char *p; if (!dentry) return ERR_PTR(-ECHILD); - realdentry = ovl_dentry_real(dentry); - realinode = realdentry->d_inode; - - if (WARN_ON(!realinode->i_op->get_link)) - return ERR_PTR(-EPERM); - old_cred = ovl_override_creds(dentry->d_sb); - p = realinode->i_op->get_link(realdentry, realinode, done); + p = vfs_get_link(ovl_dentry_real(dentry), done); revert_creds(old_cred); return p; } -static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) -{ - struct path realpath; - struct inode *realinode; - const struct cred *old_cred; - int err; - - ovl_path_real(dentry, &realpath); - realinode = realpath.dentry->d_inode; - - if (!realinode->i_op->readlink) - return -EINVAL; - - old_cred = ovl_override_creds(dentry->d_sb); - err = realinode->i_op->readlink(realpath.dentry, buf, bufsiz); - revert_creds(old_cred); - return err; -} - bool ovl_is_private_xattr(const char *name) { return strncmp(name, OVL_XATTR_PREFIX, @@ -375,7 +351,7 @@ static const struct inode_operations ovl_file_inode_operations = { static const struct inode_operations ovl_symlink_inode_operations = { .setattr = ovl_setattr, .get_link = ovl_get_link, - .readlink = ovl_readlink, + .readlink = generic_readlink, .getattr = ovl_getattr, .listxattr = ovl_listxattr, .update_time = ovl_update_time, diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 7e3f0127fc1a..bcf3965be819 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -273,12 +273,11 @@ static bool ovl_is_opaquedir(struct dentry *dentry) { int res; char val; - struct inode *inode = dentry->d_inode; - if (!S_ISDIR(inode->i_mode) || !(inode->i_opflags & IOP_XATTR)) + if (!d_is_dir(dentry)) return false; - res = __vfs_getxattr(dentry, inode, OVL_XATTR_OPAQUE, &val, 1); + res = vfs_getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1); if (res == 1 && val == 'y') return true; @@ -419,16 +418,12 @@ static bool ovl_dentry_weird(struct dentry *dentry) DCACHE_OP_COMPARE); } -static inline struct dentry *ovl_lookup_real(struct super_block *ovl_sb, - struct dentry *dir, +static inline struct dentry *ovl_lookup_real(struct dentry *dir, const struct qstr *name) { - const struct cred *old_cred; struct dentry *dentry; - old_cred = ovl_override_creds(ovl_sb); dentry = lookup_one_len_unlocked(name->name, dir, name->len); - revert_creds(old_cred); if (IS_ERR(dentry)) { if (PTR_ERR(dentry) == -ENOENT) @@ -469,6 +464,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ovl_entry *oe; + const struct cred *old_cred; struct ovl_entry *poe = dentry->d_parent->d_fsdata; struct path *stack = NULL; struct dentry *upperdir, *upperdentry = NULL; @@ -479,9 +475,10 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int i; int err; + old_cred = ovl_override_creds(dentry->d_sb); upperdir = ovl_upperdentry_dereference(poe); if (upperdir) { - this = ovl_lookup_real(dentry->d_sb, upperdir, &dentry->d_name); + this = ovl_lookup_real(upperdir, &dentry->d_name); err = PTR_ERR(this); if (IS_ERR(this)) goto out; @@ -514,8 +511,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, bool opaque = false; struct path lowerpath = poe->lowerstack[i]; - this = ovl_lookup_real(dentry->d_sb, - lowerpath.dentry, &dentry->d_name); + this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name); err = PTR_ERR(this); if (IS_ERR(this)) { /* @@ -588,6 +584,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, ovl_copyattr(realdentry->d_inode, inode); } + revert_creds(old_cred); oe->opaque = upperopaque; oe->__upperdentry = upperdentry; memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr); @@ -606,6 +603,7 @@ out_put: out_put_upper: dput(upperdentry); out: + revert_creds(old_cred); return ERR_PTR(err); } @@ -834,6 +832,19 @@ retry: if (err) goto out_dput; + /* + * Try to remove POSIX ACL xattrs from workdir. We are good if: + * + * a) success (there was a POSIX ACL xattr and was removed) + * b) -ENODATA (there was no POSIX ACL xattr) + * c) -EOPNOTSUPP (POSIX ACL xattrs are not supported) + * + * There are various other error values that could effectively + * mean that the xattr doesn't exist (e.g. -ERANGE is returned + * if the xattr name is too long), but the set of filesystems + * allowed as upper are limited to "normal" ones, where checking + * for the above two errors is sufficient. + */ err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; @@ -1292,6 +1303,12 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!oe) goto out_put_cred; + sb->s_magic = OVERLAYFS_SUPER_MAGIC; + sb->s_op = &ovl_super_operations; + sb->s_xattr = ovl_xattr_handlers; + sb->s_fs_info = ufs; + sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK; + root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR)); if (!root_dentry) goto out_free_oe; @@ -1315,12 +1332,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ovl_inode_init(d_inode(root_dentry), realinode, !!upperpath.dentry); ovl_copyattr(realinode, d_inode(root_dentry)); - sb->s_magic = OVERLAYFS_SUPER_MAGIC; - sb->s_op = &ovl_super_operations; - sb->s_xattr = ovl_xattr_handlers; sb->s_root = root_dentry; - sb->s_fs_info = ufs; - sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK; return 0; diff --git a/fs/proc/array.c b/fs/proc/array.c index 89600fd5963d..81818adb8e9e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -412,10 +412,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, mm = get_task_mm(task); if (mm) { vsize = task_vsize(mm); - if (permitted) { - eip = KSTK_EIP(task); - esp = KSTK_ESP(task); - } + /* + * esp and eip are intentionally zeroed out. There is no + * non-racy way to read them without freezing the task. + * Programs that need reliable values can use ptrace(2). + */ } get_task_comm(tcomm, task); diff --git a/fs/proc/base.c b/fs/proc/base.c index c2964d890c9a..8e654468ab67 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -252,7 +252,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, * Inherently racy -- command line shares address space * with code and data. */ - rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0); + rv = access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_FORCE); if (rv <= 0) goto out_free_page; @@ -270,7 +270,8 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, int nr_read; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, + FOLL_FORCE); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -305,7 +306,8 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, bool final; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, + FOLL_FORCE); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -354,7 +356,8 @@ skip_argv: bool final; _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, 0); + nr_read = access_remote_vm(mm, p, page, _count, + FOLL_FORCE); if (nr_read < 0) rv = nr_read; if (nr_read <= 0) @@ -832,6 +835,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, unsigned long addr = *ppos; ssize_t copied; char *page; + unsigned int flags = FOLL_FORCE; if (!mm) return 0; @@ -844,6 +848,9 @@ static ssize_t mem_rw(struct file *file, char __user *buf, if (!atomic_inc_not_zero(&mm->mm_users)) goto free; + if (write) + flags |= FOLL_WRITE; + while (count > 0) { int this_len = min_t(int, count, PAGE_SIZE); @@ -852,7 +859,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, break; } - this_len = access_remote_vm(mm, addr, page, this_len, write); + this_len = access_remote_vm(mm, addr, page, this_len, flags); if (!this_len) { if (!copied) copied = -EIO; @@ -965,7 +972,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, this_len = min(max_len, this_len); retval = access_remote_vm(mm, (env_start + src), - page, this_len, 0); + page, this_len, FOLL_FORCE); if (retval <= 0) { ret = retval; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6909582ce5e5..35b92d81692f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -266,24 +266,15 @@ static int do_maps_open(struct inode *inode, struct file *file, * /proc/PID/maps that is the stack of the main task. */ static int is_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma, int is_pid) + struct vm_area_struct *vma) { - int stack = 0; - - if (is_pid) { - stack = vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack; - } else { - struct inode *inode = priv->inode; - struct task_struct *task; - - rcu_read_lock(); - task = pid_task(proc_pid(inode), PIDTYPE_PID); - if (task) - stack = vma_is_stack_for_task(vma, task); - rcu_read_unlock(); - } - return stack; + /* + * We make no effort to guess what a given thread considers to be + * its "stack". It's not even well-defined for programs written + * languages like Go. + */ + return vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack; } static void @@ -354,7 +345,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) goto done; } - if (is_stack(priv, vma, is_pid)) + if (is_stack(priv, vma)) name = "[stack]"; } @@ -1669,7 +1660,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) seq_file_path(m, file, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_puts(m, " heap"); - } else if (is_stack(proc_priv, vma, is_pid)) { + } else if (is_stack(proc_priv, vma)) { seq_puts(m, " stack"); } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index faacb0c0d857..37175621e890 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -124,25 +124,17 @@ unsigned long task_statm(struct mm_struct *mm, } static int is_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma, int is_pid) + struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; - int stack = 0; - - if (is_pid) { - stack = vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack; - } else { - struct inode *inode = priv->inode; - struct task_struct *task; - - rcu_read_lock(); - task = pid_task(proc_pid(inode), PIDTYPE_PID); - if (task) - stack = vma_is_stack_for_task(vma, task); - rcu_read_unlock(); - } - return stack; + + /* + * We make no effort to guess what a given thread considers to be + * its "stack". It's not even well-defined for programs written + * languages like Go. + */ + return vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack; } /* @@ -184,7 +176,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, if (file) { seq_pad(m, ' '); seq_file_path(m, file, ""); - } else if (mm && is_stack(priv, vma, is_pid)) { + } else if (mm && is_stack(priv, vma)) { seq_pad(m, ' '); seq_printf(m, "[stack]"); } diff --git a/fs/read_write.c b/fs/read_write.c index 66215a7b17cf..190e0d362581 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -730,6 +730,35 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, /* A write operation does a read from user space and vice versa */ #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) +/** + * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace + * into the kernel and check that it is valid. + * + * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE. + * @uvector: Pointer to the userspace array. + * @nr_segs: Number of elements in userspace array. + * @fast_segs: Number of elements in @fast_pointer. + * @fast_pointer: Pointer to (usually small on-stack) kernel array. + * @ret_pointer: (output parameter) Pointer to a variable that will point to + * either @fast_pointer, a newly allocated kernel array, or NULL, + * depending on which array was used. + * + * This function copies an array of &struct iovec of @nr_segs from + * userspace into the kernel and checks that each element is valid (e.g. + * it does not point to a kernel address or cause overflow by being too + * large, etc.). + * + * As an optimization, the caller may provide a pointer to a small + * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long + * (the size of this array, or 0 if unused, should be given in @fast_segs). + * + * @ret_pointer will always point to the array that was used, so the + * caller must take care not to call kfree() on it e.g. in case the + * @fast_pointer array was used and it was allocated on the stack. + * + * Return: The total number of bytes covered by the iovec array on success + * or a negative error code on error. + */ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, diff --git a/fs/super.c b/fs/super.c index c2ff475c1711..c183835566c1 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1269,25 +1269,34 @@ EXPORT_SYMBOL(__sb_start_write); static void sb_wait_write(struct super_block *sb, int level) { percpu_down_write(sb->s_writers.rw_sem + level-1); - /* - * We are going to return to userspace and forget about this lock, the - * ownership goes to the caller of thaw_super() which does unlock. - * - * FIXME: we should do this before return from freeze_super() after we - * called sync_filesystem(sb) and s_op->freeze_fs(sb), and thaw_super() - * should re-acquire these locks before s_op->unfreeze_fs(sb). However - * this leads to lockdep false-positives, so currently we do the early - * release right after acquire. - */ - percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_); } -static void sb_freeze_unlock(struct super_block *sb) +/* + * We are going to return to userspace and forget about these locks, the + * ownership goes to the caller of thaw_super() which does unlock(). + */ +static void lockdep_sb_freeze_release(struct super_block *sb) +{ + int level; + + for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) + percpu_rwsem_release(sb->s_writers.rw_sem + level, 0, _THIS_IP_); +} + +/* + * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb). + */ +static void lockdep_sb_freeze_acquire(struct super_block *sb) { int level; for (level = 0; level < SB_FREEZE_LEVELS; ++level) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); +} + +static void sb_freeze_unlock(struct super_block *sb) +{ + int level; for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) percpu_up_write(sb->s_writers.rw_sem + level); @@ -1379,10 +1388,11 @@ int freeze_super(struct super_block *sb) } } /* - * This is just for debugging purposes so that fs can warn if it - * sees write activity when frozen is set to SB_FREEZE_COMPLETE. + * For debugging purposes so that fs can warn if it sees write activity + * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ sb->s_writers.frozen = SB_FREEZE_COMPLETE; + lockdep_sb_freeze_release(sb); up_write(&sb->s_umount); return 0; } @@ -1399,7 +1409,7 @@ int thaw_super(struct super_block *sb) int error; down_write(&sb->s_umount); - if (sb->s_writers.frozen == SB_UNFROZEN) { + if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) { up_write(&sb->s_umount); return -EINVAL; } @@ -1409,11 +1419,14 @@ int thaw_super(struct super_block *sb) goto out; } + lockdep_sb_freeze_acquire(sb); + if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); if (error) { printk(KERN_ERR "VFS:Filesystem thaw failed\n"); + lockdep_sb_freeze_release(sb); up_write(&sb->s_umount); return error; } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 94374e435025..2b67bda2021b 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -21,14 +21,14 @@ DEFINE_SPINLOCK(sysfs_symlink_target_lock); void sysfs_warn_dup(struct kernfs_node *parent, const char *name) { - char *buf, *path = NULL; + char *buf; buf = kzalloc(PATH_MAX, GFP_KERNEL); if (buf) - path = kernfs_path(parent, buf, PATH_MAX); + kernfs_path(parent, buf, PATH_MAX); WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s/%s'\n", - path, name); + buf, name); kfree(buf); } diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 584e87e11cb6..26ef1958b65b 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -55,6 +55,8 @@ xfs-y += $(addprefix libxfs/, \ xfs_ag_resv.o \ xfs_rmap.o \ xfs_rmap_btree.o \ + xfs_refcount.o \ + xfs_refcount_btree.o \ xfs_sb.o \ xfs_symlink_remote.o \ xfs_trans_resv.o \ @@ -88,6 +90,7 @@ xfs-y += xfs_aops.o \ xfs_message.o \ xfs_mount.o \ xfs_mru_cache.o \ + xfs_reflink.o \ xfs_stats.o \ xfs_super.o \ xfs_symlink.o \ @@ -100,16 +103,20 @@ xfs-y += xfs_aops.o \ # low-level transaction/log code xfs-y += xfs_log.o \ xfs_log_cil.o \ + xfs_bmap_item.o \ xfs_buf_item.o \ xfs_extfree_item.o \ xfs_icreate_item.o \ xfs_inode_item.o \ + xfs_refcount_item.o \ xfs_rmap_item.o \ xfs_log_recover.o \ xfs_trans_ail.o \ + xfs_trans_bmap.o \ xfs_trans_buf.o \ xfs_trans_extfree.o \ xfs_trans_inode.o \ + xfs_trans_refcount.o \ xfs_trans_rmap.o \ # optional features diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index e3ae0f2b4294..e5ebc3770460 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -38,6 +38,7 @@ #include "xfs_trans_space.h" #include "xfs_rmap_btree.h" #include "xfs_btree.h" +#include "xfs_refcount_btree.h" /* * Per-AG Block Reservations @@ -108,7 +109,9 @@ xfs_ag_resv_critical( trace_xfs_ag_resv_critical(pag, type, avail); /* Critically low if less than 10% or max btree height remains. */ - return avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS; + return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS, + pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL, + XFS_RANDOM_AG_RESV_CRITICAL); } /* @@ -228,6 +231,11 @@ xfs_ag_resv_init( if (pag->pag_meta_resv.ar_asked == 0) { ask = used = 0; + error = xfs_refcountbt_calc_reserves(pag->pag_mount, + pag->pag_agno, &ask, &used); + if (error) + goto out; + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, ask, used); if (error) @@ -238,6 +246,11 @@ xfs_ag_resv_init( if (pag->pag_agfl_resv.ar_asked == 0) { ask = used = 0; + error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno, + &ask, &used); + if (error) + goto out; + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index ca75dc90ebe0..effb64cf714f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -52,10 +52,23 @@ STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); +unsigned int +xfs_refc_block( + struct xfs_mount *mp) +{ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + return XFS_RMAP_BLOCK(mp) + 1; + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + return XFS_FIBT_BLOCK(mp) + 1; + return XFS_IBT_BLOCK(mp) + 1; +} + xfs_extlen_t xfs_prealloc_blocks( struct xfs_mount *mp) { + if (xfs_sb_version_hasreflink(&mp->m_sb)) + return xfs_refc_block(mp) + 1; if (xfs_sb_version_hasrmapbt(&mp->m_sb)) return XFS_RMAP_BLOCK(mp) + 1; if (xfs_sb_version_hasfinobt(&mp->m_sb)) @@ -115,6 +128,8 @@ xfs_alloc_ag_max_usable( blocks++; /* finobt root block */ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) blocks++; /* rmap root block */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) + blocks++; /* refcount root block */ return mp->m_sb.sb_agblocks - blocks; } @@ -2321,6 +2336,9 @@ xfs_alloc_log_agf( offsetof(xfs_agf_t, agf_btreeblks), offsetof(xfs_agf_t, agf_uuid), offsetof(xfs_agf_t, agf_rmap_blocks), + offsetof(xfs_agf_t, agf_refcount_blocks), + offsetof(xfs_agf_t, agf_refcount_root), + offsetof(xfs_agf_t, agf_refcount_level), /* needed so that we don't log the whole rest of the structure: */ offsetof(xfs_agf_t, agf_spare64), sizeof(xfs_agf_t) @@ -2458,6 +2476,10 @@ xfs_agf_verify( be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) return false; + if (xfs_sb_version_hasreflink(&mp->m_sb) && + be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS) + return false; + return true;; } @@ -2578,6 +2600,7 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); pag->pagf_levels[XFS_BTNUM_RMAPi] = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); + pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); spin_lock_init(&pag->pagb_lock); pag->pagb_count = 0; pag->pagb_tree = RB_ROOT; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 9d7f61d36645..c27344cf38e1 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -48,6 +48,7 @@ #include "xfs_filestream.h" #include "xfs_rmap.h" #include "xfs_ag_resv.h" +#include "xfs_refcount.h" kmem_zone_t *xfs_bmap_free_item_zone; @@ -140,7 +141,8 @@ xfs_bmbt_lookup_ge( */ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) { - return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + return whichfork != XFS_COW_FORK && + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && XFS_IFORK_NEXTENTS(ip, whichfork) > XFS_IFORK_MAXEXT(ip, whichfork); } @@ -150,7 +152,8 @@ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) */ static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) { - return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + return whichfork != XFS_COW_FORK && + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && XFS_IFORK_NEXTENTS(ip, whichfork) <= XFS_IFORK_MAXEXT(ip, whichfork); } @@ -640,6 +643,7 @@ xfs_bmap_btree_to_extents( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(whichfork != XFS_COW_FORK); ASSERT(ifp->if_flags & XFS_IFEXTENTS); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); rblock = ifp->if_broot; @@ -706,6 +710,7 @@ xfs_bmap_extents_to_btree( xfs_bmbt_ptr_t *pp; /* root block address pointer */ mp = ip->i_mount; + ASSERT(whichfork != XFS_COW_FORK); ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); @@ -748,6 +753,7 @@ xfs_bmap_extents_to_btree( args.type = XFS_ALLOCTYPE_START_BNO; args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); } else if (dfops->dop_low) { +try_another_ag: args.type = XFS_ALLOCTYPE_START_BNO; args.fsbno = *firstblock; } else { @@ -762,6 +768,21 @@ xfs_bmap_extents_to_btree( xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } + + /* + * During a CoW operation, the allocation and bmbt updates occur in + * different transactions. The mapping code tries to put new bmbt + * blocks near extents being mapped, but the only way to guarantee this + * is if the alloc and the mapping happen in a single transaction that + * has a block reservation. That isn't the case here, so if we run out + * of space we'll try again with another AG. + */ + if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && + args.fsbno == NULLFSBLOCK && + args.type == XFS_ALLOCTYPE_NEAR_BNO) { + dfops->dop_low = true; + goto try_another_ag; + } /* * Allocation can't fail, the space was reserved. */ @@ -837,6 +858,7 @@ xfs_bmap_local_to_extents_empty( { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(whichfork != XFS_COW_FORK); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); ASSERT(ifp->if_bytes == 0); ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); @@ -896,6 +918,7 @@ xfs_bmap_local_to_extents( * file currently fits in an inode. */ if (*firstblock == NULLFSBLOCK) { +try_another_ag: args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); args.type = XFS_ALLOCTYPE_START_BNO; } else { @@ -908,6 +931,19 @@ xfs_bmap_local_to_extents( if (error) goto done; + /* + * During a CoW operation, the allocation and bmbt updates occur in + * different transactions. The mapping code tries to put new bmbt + * blocks near extents being mapped, but the only way to guarantee this + * is if the alloc and the mapping happen in a single transaction that + * has a block reservation. That isn't the case here, so if we run out + * of space we'll try again with another AG. + */ + if (xfs_sb_version_hasreflink(&ip->i_mount->m_sb) && + args.fsbno == NULLFSBLOCK && + args.type == XFS_ALLOCTYPE_NEAR_BNO) { + goto try_another_ag; + } /* Can't fail, the space was reserved. */ ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(args.len == 1); @@ -1670,7 +1706,8 @@ xfs_bmap_one_block( */ STATIC int /* error */ xfs_bmap_add_extent_delay_real( - struct xfs_bmalloca *bma) + struct xfs_bmalloca *bma, + int whichfork) { struct xfs_bmbt_irec *new = &bma->got; int diff; /* temp value */ @@ -1688,11 +1725,14 @@ xfs_bmap_add_extent_delay_real( xfs_filblks_t temp=0; /* value for da_new calculations */ xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ - int whichfork = XFS_DATA_FORK; struct xfs_mount *mp; + xfs_extnum_t *nextents; mp = bma->ip->i_mount; ifp = XFS_IFORK_PTR(bma->ip, whichfork); + ASSERT(whichfork != XFS_ATTR_FORK); + nextents = (whichfork == XFS_COW_FORK ? &bma->ip->i_cnextents : + &bma->ip->i_d.di_nextents); ASSERT(bma->idx >= 0); ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); @@ -1706,6 +1746,9 @@ xfs_bmap_add_extent_delay_real( #define RIGHT r[1] #define PREV r[2] + if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; + /* * Set up a bunch of variables to make the tests simpler. */ @@ -1792,7 +1835,7 @@ xfs_bmap_add_extent_delay_real( trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_iext_remove(bma->ip, bma->idx + 1, 2, state); - bma->ip->i_d.di_nextents--; + (*nextents)--; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -1894,7 +1937,7 @@ xfs_bmap_add_extent_delay_real( xfs_bmbt_set_startblock(ep, new->br_startblock); trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); - bma->ip->i_d.di_nextents++; + (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -1964,7 +2007,7 @@ xfs_bmap_add_extent_delay_real( temp = PREV.br_blockcount - new->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); xfs_iext_insert(bma->ip, bma->idx, 1, new, state); - bma->ip->i_d.di_nextents++; + (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2048,7 +2091,7 @@ xfs_bmap_add_extent_delay_real( trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state); - bma->ip->i_d.di_nextents++; + (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2117,7 +2160,7 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount = temp2; /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state); - bma->ip->i_d.di_nextents++; + (*nextents)++; if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { @@ -2215,7 +2258,8 @@ xfs_bmap_add_extent_delay_real( xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); done: - bma->logflags |= rval; + if (whichfork != XFS_COW_FORK) + bma->logflags |= rval; return error; #undef LEFT #undef RIGHT @@ -2759,6 +2803,7 @@ done: STATIC void xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ + int whichfork, xfs_extnum_t *idx, /* extent number to update/insert */ xfs_bmbt_irec_t *new) /* new data to add to file extents */ { @@ -2770,8 +2815,10 @@ xfs_bmap_add_extent_hole_delay( int state; /* state bits, accessed thru macros */ xfs_filblks_t temp=0; /* temp for indirect calculations */ - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ifp = XFS_IFORK_PTR(ip, whichfork); state = 0; + if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; ASSERT(isnullstartblock(new->br_startblock)); /* @@ -2789,7 +2836,7 @@ xfs_bmap_add_extent_hole_delay( * Check and set flags if the current (right) segment exists. * If it doesn't exist, we're converting the hole at end-of-file. */ - if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); @@ -2923,6 +2970,7 @@ xfs_bmap_add_extent_hole_real( ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + ASSERT(whichfork != XFS_COW_FORK); XFS_STATS_INC(mp, xs_add_exlist); @@ -3648,7 +3696,9 @@ xfs_bmap_btalloc( else if (mp->m_dalign) stripe_align = mp->m_dalign; - if (xfs_alloc_is_userdata(ap->datatype)) + if (ap->flags & XFS_BMAPI_COWFORK) + align = xfs_get_cowextsz_hint(ap->ip); + else if (xfs_alloc_is_userdata(ap->datatype)) align = xfs_get_extsz_hint(ap->ip); if (unlikely(align)) { error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, @@ -3856,7 +3906,8 @@ xfs_bmap_btalloc( ASSERT(nullfb || fb_agno == args.agno || (ap->dfops->dop_low && fb_agno < args.agno)); ap->length = args.len; - ap->ip->i_d.di_nblocks += args.len; + if (!(ap->flags & XFS_BMAPI_COWFORK)) + ap->ip->i_d.di_nblocks += args.len; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) ap->ip->i_delayed_blks -= args.len; @@ -3876,6 +3927,63 @@ xfs_bmap_btalloc( } /* + * For a remap operation, just "allocate" an extent at the address that the + * caller passed in, and ensure that the AGFL is the right size. The caller + * will then map the "allocated" extent into the file somewhere. + */ +STATIC int +xfs_bmap_remap_alloc( + struct xfs_bmalloca *ap) +{ + struct xfs_trans *tp = ap->tp; + struct xfs_mount *mp = tp->t_mountp; + xfs_agblock_t bno; + struct xfs_alloc_arg args; + int error; + + /* + * validate that the block number is legal - the enables us to detect + * and handle a silent filesystem corruption rather than crashing. + */ + memset(&args, 0, sizeof(struct xfs_alloc_arg)); + args.tp = ap->tp; + args.mp = ap->tp->t_mountp; + bno = *ap->firstblock; + args.agno = XFS_FSB_TO_AGNO(mp, bno); + args.agbno = XFS_FSB_TO_AGBNO(mp, bno); + if (args.agno >= mp->m_sb.sb_agcount || + args.agbno >= mp->m_sb.sb_agblocks) + return -EFSCORRUPTED; + + /* "Allocate" the extent from the range we passed in. */ + trace_xfs_bmap_remap_alloc(ap->ip, *ap->firstblock, ap->length); + ap->blkno = bno; + ap->ip->i_d.di_nblocks += ap->length; + xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + + /* Fix the freelist, like a real allocator does. */ + args.datatype = ap->datatype; + args.pag = xfs_perag_get(args.mp, args.agno); + ASSERT(args.pag); + + /* + * The freelist fixing code will decline the allocation if + * the size and shape of the free space doesn't allow for + * allocating the extent and updating all the metadata that + * happens during an allocation. We're remapping, not + * allocating, so skip that check by pretending to be freeing. + */ + error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); + if (error) + goto error0; +error0: + xfs_perag_put(args.pag); + if (error) + trace_xfs_bmap_remap_alloc_error(ap->ip, error, _RET_IP_); + return error; +} + +/* * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. * It figures out where to ask the underlying allocator to put the new extent. */ @@ -3883,6 +3991,8 @@ STATIC int xfs_bmap_alloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { + if (ap->flags & XFS_BMAPI_REMAP) + return xfs_bmap_remap_alloc(ap); if (XFS_IS_REALTIME_INODE(ap->ip) && xfs_alloc_is_userdata(ap->datatype)) return xfs_bmap_rtalloc(ap); @@ -4012,12 +4122,11 @@ xfs_bmapi_read( int error; int eof; int n = 0; - int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; + int whichfork = xfs_bmapi_whichfork(flags); ASSERT(*nmap >= 1); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| - XFS_BMAPI_IGSTATE))); + XFS_BMAPI_IGSTATE|XFS_BMAPI_COWFORK))); ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); if (unlikely(XFS_TEST_ERROR( @@ -4035,6 +4144,16 @@ xfs_bmapi_read( ifp = XFS_IFORK_PTR(ip, whichfork); + /* No CoW fork? Return a hole. */ + if (whichfork == XFS_COW_FORK && !ifp) { + mval->br_startoff = bno; + mval->br_startblock = HOLESTARTBLOCK; + mval->br_blockcount = len; + mval->br_state = XFS_EXT_NORM; + *nmap = 1; + return 0; + } + if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, whichfork); if (error) @@ -4084,6 +4203,7 @@ xfs_bmapi_read( int xfs_bmapi_reserve_delalloc( struct xfs_inode *ip, + int whichfork, xfs_fileoff_t aoff, xfs_filblks_t len, struct xfs_bmbt_irec *got, @@ -4092,7 +4212,7 @@ xfs_bmapi_reserve_delalloc( int eof) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); xfs_extlen_t alen; xfs_extlen_t indlen; char rt = XFS_IS_REALTIME_INODE(ip); @@ -4104,7 +4224,10 @@ xfs_bmapi_reserve_delalloc( alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); /* Figure out the extent size, adjust alen */ - extsz = xfs_get_extsz_hint(ip); + if (whichfork == XFS_COW_FORK) + extsz = xfs_get_cowextsz_hint(ip); + else + extsz = xfs_get_extsz_hint(ip); if (extsz) { error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, 1, 0, &aoff, &alen); @@ -4151,7 +4274,7 @@ xfs_bmapi_reserve_delalloc( got->br_startblock = nullstartblock(indlen); got->br_blockcount = alen; got->br_state = XFS_EXT_NORM; - xfs_bmap_add_extent_hole_delay(ip, lastx, got); + xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got); /* * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay @@ -4182,8 +4305,7 @@ xfs_bmapi_allocate( struct xfs_bmalloca *bma) { struct xfs_mount *mp = bma->ip->i_mount; - int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; + int whichfork = xfs_bmapi_whichfork(bma->flags); struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); int tmp_logflags = 0; int error; @@ -4278,7 +4400,7 @@ xfs_bmapi_allocate( bma->got.br_state = XFS_EXT_UNWRITTEN; if (bma->wasdel) - error = xfs_bmap_add_extent_delay_real(bma); + error = xfs_bmap_add_extent_delay_real(bma, whichfork); else error = xfs_bmap_add_extent_hole_real(bma, whichfork); @@ -4308,8 +4430,7 @@ xfs_bmapi_convert_unwritten( xfs_filblks_t len, int flags) { - int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; + int whichfork = xfs_bmapi_whichfork(flags); struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); int tmp_logflags = 0; int error; @@ -4325,6 +4446,8 @@ xfs_bmapi_convert_unwritten( (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) return 0; + ASSERT(whichfork != XFS_COW_FORK); + /* * Modify (by adding) the state flag, if writing. */ @@ -4431,8 +4554,7 @@ xfs_bmapi_write( orig_mval = mval; orig_nmap = *nmap; #endif - whichfork = (flags & XFS_BMAPI_ATTRFORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; + whichfork = xfs_bmapi_whichfork(flags); ASSERT(*nmap >= 1); ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); @@ -4441,6 +4563,11 @@ xfs_bmapi_write( ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK); + ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP)); + ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP)); + ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK); + ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork != XFS_COW_FORK); /* zeroing is for currently only for data extents, not metadata */ ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) != @@ -4502,6 +4629,14 @@ xfs_bmapi_write( wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); /* + * Make sure we only reflink into a hole. + */ + if (flags & XFS_BMAPI_REMAP) + ASSERT(inhole); + if (flags & XFS_BMAPI_COWFORK) + ASSERT(!inhole); + + /* * First, deal with the hole before the allocated space * that we found, if any. */ @@ -4531,6 +4666,17 @@ xfs_bmapi_write( goto error0; if (bma.blkno == NULLFSBLOCK) break; + + /* + * If this is a CoW allocation, record the data in + * the refcount btree for orphan recovery. + */ + if (whichfork == XFS_COW_FORK) { + error = xfs_refcount_alloc_cow_extent(mp, dfops, + bma.blkno, bma.length); + if (error) + goto error0; + } } /* Deal with the allocated space we found. */ @@ -4696,7 +4842,8 @@ xfs_bmap_del_extent( xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *del, /* data to remove from extents */ int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ + int whichfork, /* data or attr fork */ + int bflags) /* bmapi flags */ { xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ @@ -4725,6 +4872,8 @@ xfs_bmap_del_extent( if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; + else if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / @@ -4805,6 +4954,7 @@ xfs_bmap_del_extent( /* * Matches the whole extent. Delete the entry. */ + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_iext_remove(ip, *idx, 1, whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); --*idx; @@ -4988,9 +5138,16 @@ xfs_bmap_del_extent( /* * If we need to, add to list of extents to delete. */ - if (do_fx) - xfs_bmap_add_free(mp, dfops, del->br_startblock, - del->br_blockcount, NULL); + if (do_fx && !(bflags & XFS_BMAPI_REMAP)) { + if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { + error = xfs_refcount_decrease_extent(mp, dfops, del); + if (error) + goto done; + } else + xfs_bmap_add_free(mp, dfops, del->br_startblock, + del->br_blockcount, NULL); + } + /* * Adjust inode # blocks in the file. */ @@ -4999,7 +5156,7 @@ xfs_bmap_del_extent( /* * Adjust quota data. */ - if (qfield) + if (qfield && !(bflags & XFS_BMAPI_REMAP)) xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); /* @@ -5014,6 +5171,175 @@ done: return error; } +/* Remove an extent from the CoW fork. Similar to xfs_bmap_del_extent. */ +int +xfs_bunmapi_cow( + struct xfs_inode *ip, + struct xfs_bmbt_irec *del) +{ + xfs_filblks_t da_new; + xfs_filblks_t da_old; + xfs_fsblock_t del_endblock = 0; + xfs_fileoff_t del_endoff; + int delay; + struct xfs_bmbt_rec_host *ep; + int error; + struct xfs_bmbt_irec got; + xfs_fileoff_t got_endoff; + struct xfs_ifork *ifp; + struct xfs_mount *mp; + xfs_filblks_t nblks; + struct xfs_bmbt_irec new; + /* REFERENCED */ + uint qfield; + xfs_filblks_t temp; + xfs_filblks_t temp2; + int state = BMAP_COWFORK; + int eof; + xfs_extnum_t eidx; + + mp = ip->i_mount; + XFS_STATS_INC(mp, xs_del_exlist); + + ep = xfs_bmap_search_extents(ip, del->br_startoff, XFS_COW_FORK, &eof, + &eidx, &got, &new); + + ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); ifp = ifp; + ASSERT((eidx >= 0) && (eidx < ifp->if_bytes / + (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(del->br_blockcount > 0); + ASSERT(got.br_startoff <= del->br_startoff); + del_endoff = del->br_startoff + del->br_blockcount; + got_endoff = got.br_startoff + got.br_blockcount; + ASSERT(got_endoff >= del_endoff); + delay = isnullstartblock(got.br_startblock); + ASSERT(isnullstartblock(del->br_startblock) == delay); + qfield = 0; + error = 0; + /* + * If deleting a real allocation, must free up the disk space. + */ + if (!delay) { + nblks = del->br_blockcount; + qfield = XFS_TRANS_DQ_BCOUNT; + /* + * Set up del_endblock and cur for later. + */ + del_endblock = del->br_startblock + del->br_blockcount; + da_old = da_new = 0; + } else { + da_old = startblockval(got.br_startblock); + da_new = 0; + nblks = 0; + } + qfield = qfield; + nblks = nblks; + + /* + * Set flag value to use in switch statement. + * Left-contig is 2, right-contig is 1. + */ + switch (((got.br_startoff == del->br_startoff) << 1) | + (got_endoff == del_endoff)) { + case 3: + /* + * Matches the whole extent. Delete the entry. + */ + xfs_iext_remove(ip, eidx, 1, BMAP_COWFORK); + --eidx; + break; + + case 2: + /* + * Deleting the first part of the extent. + */ + trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_); + xfs_bmbt_set_startoff(ep, del_endoff); + temp = got.br_blockcount - del->br_blockcount; + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); + da_new = temp; + break; + } + xfs_bmbt_set_startblock(ep, del_endblock); + trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); + break; + + case 1: + /* + * Deleting the last part of the extent. + */ + temp = got.br_blockcount - del->br_blockcount; + trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + if (delay) { + temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + da_old); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); + da_new = temp; + break; + } + trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); + break; + + case 0: + /* + * Deleting the middle of the extent. + */ + temp = del->br_startoff - got.br_startoff; + trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); + new.br_startoff = del_endoff; + temp2 = got_endoff - del_endoff; + new.br_blockcount = temp2; + new.br_state = got.br_state; + if (!delay) { + new.br_startblock = del_endblock; + } else { + temp = xfs_bmap_worst_indlen(ip, temp); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + temp2 = xfs_bmap_worst_indlen(ip, temp2); + new.br_startblock = nullstartblock((int)temp2); + da_new = temp + temp2; + while (da_new > da_old) { + if (temp) { + temp--; + da_new--; + xfs_bmbt_set_startblock(ep, + nullstartblock((int)temp)); + } + if (da_new == da_old) + break; + if (temp2) { + temp2--; + da_new--; + new.br_startblock = + nullstartblock((int)temp2); + } + } + } + trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_); + xfs_iext_insert(ip, eidx + 1, 1, &new, state); + ++eidx; + break; + } + + /* + * Account for change in delayed indirect blocks. + * Nothing to do for disk quota accounting here. + */ + ASSERT(da_old >= da_new); + if (da_old > da_new) + xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false); + + return error; +} + /* * Unmap (remove) blocks from a file. * If nexts is nonzero then the number of extents to remove is limited to @@ -5021,17 +5347,16 @@ done: * *done is set. */ int /* error */ -xfs_bunmapi( +__xfs_bunmapi( xfs_trans_t *tp, /* transaction pointer */ struct xfs_inode *ip, /* incore inode */ xfs_fileoff_t bno, /* starting offset to unmap */ - xfs_filblks_t len, /* length to unmap in file */ + xfs_filblks_t *rlen, /* i/o: amount remaining */ int flags, /* misc flags */ xfs_extnum_t nexts, /* number of extents max */ xfs_fsblock_t *firstblock, /* first allocated block controls a.g. for allocs */ - struct xfs_defer_ops *dfops, /* i/o: list extents to free */ - int *done) /* set if not done yet */ + struct xfs_defer_ops *dfops) /* i/o: deferred updates */ { xfs_btree_cur_t *cur; /* bmap btree cursor */ xfs_bmbt_irec_t del; /* extent being deleted */ @@ -5053,11 +5378,12 @@ xfs_bunmapi( int wasdel; /* was a delayed alloc extent */ int whichfork; /* data or attribute fork */ xfs_fsblock_t sum; + xfs_filblks_t len = *rlen; /* length to unmap in file */ trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); - whichfork = (flags & XFS_BMAPI_ATTRFORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; + whichfork = xfs_bmapi_whichfork(flags); + ASSERT(whichfork != XFS_COW_FORK); ifp = XFS_IFORK_PTR(ip, whichfork); if (unlikely( XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -5079,7 +5405,7 @@ xfs_bunmapi( return error; nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); if (nextents == 0) { - *done = 1; + *rlen = 0; return 0; } XFS_STATS_INC(mp, xs_blk_unmap); @@ -5324,7 +5650,7 @@ xfs_bunmapi( cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL; error = xfs_bmap_del_extent(ip, tp, &lastx, dfops, cur, &del, - &tmp_logflags, whichfork); + &tmp_logflags, whichfork, flags); logflags |= tmp_logflags; if (error) goto error0; @@ -5350,7 +5676,10 @@ nodelete: extno++; } } - *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; + if (bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0) + *rlen = 0; + else + *rlen = bno - start + 1; /* * Convert to a btree if necessary. @@ -5406,6 +5735,27 @@ error0: return error; } +/* Unmap a range of a file. */ +int +xfs_bunmapi( + xfs_trans_t *tp, + struct xfs_inode *ip, + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + xfs_extnum_t nexts, + xfs_fsblock_t *firstblock, + struct xfs_defer_ops *dfops, + int *done) +{ + int error; + + error = __xfs_bunmapi(tp, ip, bno, &len, flags, nexts, firstblock, + dfops); + *done = (len == 0); + return error; +} + /* * Determine whether an extent shift can be accomplished by a merge with the * extent that precedes the target hole of the shift. @@ -5985,3 +6335,146 @@ out: xfs_trans_cancel(tp); return error; } + +/* Deferred mapping is only for real extents in the data fork. */ +static bool +xfs_bmap_is_update_needed( + struct xfs_bmbt_irec *bmap) +{ + return bmap->br_startblock != HOLESTARTBLOCK && + bmap->br_startblock != DELAYSTARTBLOCK; +} + +/* Record a bmap intent. */ +static int +__xfs_bmap_add( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + enum xfs_bmap_intent_type type, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *bmap) +{ + int error; + struct xfs_bmap_intent *bi; + + trace_xfs_bmap_defer(mp, + XFS_FSB_TO_AGNO(mp, bmap->br_startblock), + type, + XFS_FSB_TO_AGBNO(mp, bmap->br_startblock), + ip->i_ino, whichfork, + bmap->br_startoff, + bmap->br_blockcount, + bmap->br_state); + + bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS); + INIT_LIST_HEAD(&bi->bi_list); + bi->bi_type = type; + bi->bi_owner = ip; + bi->bi_whichfork = whichfork; + bi->bi_bmap = *bmap; + + error = xfs_defer_join(dfops, bi->bi_owner); + if (error) { + kmem_free(bi); + return error; + } + + xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list); + return 0; +} + +/* Map an extent into a file. */ +int +xfs_bmap_map_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_inode *ip, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_bmap_is_update_needed(PREV)) + return 0; + + return __xfs_bmap_add(mp, dfops, XFS_BMAP_MAP, ip, + XFS_DATA_FORK, PREV); +} + +/* Unmap an extent out of a file. */ +int +xfs_bmap_unmap_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_inode *ip, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_bmap_is_update_needed(PREV)) + return 0; + + return __xfs_bmap_add(mp, dfops, XFS_BMAP_UNMAP, ip, + XFS_DATA_FORK, PREV); +} + +/* + * Process one of the deferred bmap operations. We pass back the + * btree cursor to maintain our lock on the bmapbt between calls. + */ +int +xfs_bmap_finish_one( + struct xfs_trans *tp, + struct xfs_defer_ops *dfops, + struct xfs_inode *ip, + enum xfs_bmap_intent_type type, + int whichfork, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state) +{ + struct xfs_bmbt_irec bmap; + int nimaps = 1; + xfs_fsblock_t firstfsb; + int flags = XFS_BMAPI_REMAP; + int done; + int error = 0; + + bmap.br_startblock = startblock; + bmap.br_startoff = startoff; + bmap.br_blockcount = blockcount; + bmap.br_state = state; + + trace_xfs_bmap_deferred(tp->t_mountp, + XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, + XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), + ip->i_ino, whichfork, startoff, blockcount, state); + + if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) + return -EFSCORRUPTED; + if (whichfork == XFS_ATTR_FORK) + flags |= XFS_BMAPI_ATTRFORK; + + if (XFS_TEST_ERROR(false, tp->t_mountp, + XFS_ERRTAG_BMAP_FINISH_ONE, + XFS_RANDOM_BMAP_FINISH_ONE)) + return -EIO; + + switch (type) { + case XFS_BMAP_MAP: + firstfsb = bmap.br_startblock; + error = xfs_bmapi_write(tp, ip, bmap.br_startoff, + bmap.br_blockcount, flags, &firstfsb, + bmap.br_blockcount, &bmap, &nimaps, + dfops); + break; + case XFS_BMAP_UNMAP: + error = xfs_bunmapi(tp, ip, bmap.br_startoff, + bmap.br_blockcount, flags, 1, &firstfsb, + dfops, &done); + ASSERT(done); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + } + + return error; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 8395f6e8cf7d..f97db7132564 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -97,6 +97,19 @@ struct xfs_extent_free_item */ #define XFS_BMAPI_ZERO 0x080 +/* + * Map the inode offset to the block given in ap->firstblock. Primarily + * used for reflink. The range must be in a hole, and this flag cannot be + * turned on with PREALLOC or CONVERT, and cannot be used on the attr fork. + * + * For bunmapi, this flag unmaps the range without adjusting quota, reducing + * refcount, or freeing the blocks. + */ +#define XFS_BMAPI_REMAP 0x100 + +/* Map something in the CoW fork. */ +#define XFS_BMAPI_COWFORK 0x200 + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -105,12 +118,24 @@ struct xfs_extent_free_item { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ { XFS_BMAPI_CONVERT, "CONVERT" }, \ - { XFS_BMAPI_ZERO, "ZERO" } + { XFS_BMAPI_ZERO, "ZERO" }, \ + { XFS_BMAPI_REMAP, "REMAP" }, \ + { XFS_BMAPI_COWFORK, "COWFORK" } static inline int xfs_bmapi_aflag(int w) { - return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0); + return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : + (w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0)); +} + +static inline int xfs_bmapi_whichfork(int bmapi_flags) +{ + if (bmapi_flags & XFS_BMAPI_COWFORK) + return XFS_COW_FORK; + else if (bmapi_flags & XFS_BMAPI_ATTRFORK) + return XFS_ATTR_FORK; + return XFS_DATA_FORK; } /* @@ -131,13 +156,15 @@ static inline int xfs_bmapi_aflag(int w) #define BMAP_LEFT_VALID (1 << 6) #define BMAP_RIGHT_VALID (1 << 7) #define BMAP_ATTRFORK (1 << 8) +#define BMAP_COWFORK (1 << 9) #define XFS_BMAP_EXT_FLAGS \ { BMAP_LEFT_CONTIG, "LC" }, \ { BMAP_RIGHT_CONTIG, "RC" }, \ { BMAP_LEFT_FILLING, "LF" }, \ { BMAP_RIGHT_FILLING, "RF" }, \ - { BMAP_ATTRFORK, "ATTR" } + { BMAP_ATTRFORK, "ATTR" }, \ + { BMAP_COWFORK, "COW" } /* @@ -186,10 +213,15 @@ int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fsblock_t *firstblock, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap, struct xfs_defer_ops *dfops); +int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags, + xfs_extnum_t nexts, xfs_fsblock_t *firstblock, + struct xfs_defer_ops *dfops); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, int flags, xfs_extnum_t nexts, xfs_fsblock_t *firstblock, struct xfs_defer_ops *dfops, int *done); +int xfs_bunmapi_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *del); int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); @@ -203,8 +235,31 @@ struct xfs_bmbt_rec_host * xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno, int fork, int *eofp, xfs_extnum_t *lastxp, struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp); -int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, xfs_fileoff_t aoff, - xfs_filblks_t len, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *prev, xfs_extnum_t *lastx, int eof); +int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, + xfs_fileoff_t aoff, xfs_filblks_t len, + struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *prev, + xfs_extnum_t *lastx, int eof); + +enum xfs_bmap_intent_type { + XFS_BMAP_MAP = 1, + XFS_BMAP_UNMAP, +}; + +struct xfs_bmap_intent { + struct list_head bi_list; + enum xfs_bmap_intent_type bi_type; + struct xfs_inode *bi_owner; + int bi_whichfork; + struct xfs_bmbt_irec bi_bmap; +}; + +int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, enum xfs_bmap_intent_type type, + int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, + xfs_filblks_t blockcount, xfs_exntst_t state); +int xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, struct xfs_bmbt_irec *imap); +int xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops, + struct xfs_inode *ip, struct xfs_bmbt_irec *imap); #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index cd85274e810c..8007d2ba9aef 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -453,6 +453,7 @@ xfs_bmbt_alloc_block( if (args.fsbno == NULLFSBLOCK) { args.fsbno = be64_to_cpu(start->l); +try_another_ag: args.type = XFS_ALLOCTYPE_START_BNO; /* * Make sure there is sufficient room left in the AG to @@ -482,6 +483,22 @@ xfs_bmbt_alloc_block( if (error) goto error0; + /* + * During a CoW operation, the allocation and bmbt updates occur in + * different transactions. The mapping code tries to put new bmbt + * blocks near extents being mapped, but the only way to guarantee this + * is if the alloc and the mapping happen in a single transaction that + * has a block reservation. That isn't the case here, so if we run out + * of space we'll try again with another AG. + */ + if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && + args.fsbno == NULLFSBLOCK && + args.type == XFS_ALLOCTYPE_NEAR_BNO) { + cur->bc_private.b.dfops->dop_low = true; + args.fsbno = cur->bc_private.b.firstblock; + goto try_another_ag; + } + if (args.fsbno == NULLFSBLOCK && args.minleft) { /* * Could not find an AG with enough free space to satisfy @@ -777,6 +794,7 @@ xfs_bmbt_init_cursor( { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_btree_cur *cur; + ASSERT(whichfork != XFS_COW_FORK); cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index aa1752f918b8..5c8e6f2ce44f 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -45,9 +45,10 @@ kmem_zone_t *xfs_btree_cur_zone; */ static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, - XFS_FIBT_MAGIC }, + XFS_FIBT_MAGIC, 0 }, { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC, - XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC } + XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC, + XFS_REFC_CRC_MAGIC } }; #define xfs_btree_magic(cur) \ xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] @@ -1216,6 +1217,9 @@ xfs_btree_set_refs( case XFS_BTNUM_RMAP: xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF); break; + case XFS_BTNUM_REFC: + xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF); + break; default: ASSERT(0); } diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 3f8556a5c2ad..c2b01d1c79ee 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -49,6 +49,7 @@ union xfs_btree_key { struct xfs_inobt_key inobt; struct xfs_rmap_key rmap; struct xfs_rmap_key __rmap_bigkey[2]; + struct xfs_refcount_key refc; }; union xfs_btree_rec { @@ -57,6 +58,7 @@ union xfs_btree_rec { struct xfs_alloc_rec alloc; struct xfs_inobt_rec inobt; struct xfs_rmap_rec rmap; + struct xfs_refcount_rec refc; }; /* @@ -72,6 +74,7 @@ union xfs_btree_rec { #define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) #define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi) #define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi) +#define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi) /* * For logging record fields. @@ -105,6 +108,7 @@ do { \ case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \ case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \ + case XFS_BTNUM_REFC: __XFS_BTREE_STATS_INC(__mp, refcbt, stat); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) @@ -127,6 +131,8 @@ do { \ __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \ case XFS_BTNUM_RMAP: \ __XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \ + case XFS_BTNUM_REFC: \ + __XFS_BTREE_STATS_ADD(__mp, refcbt, stat, val); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) @@ -217,6 +223,15 @@ union xfs_btree_irec { struct xfs_bmbt_irec b; struct xfs_inobt_rec_incore i; struct xfs_rmap_irec r; + struct xfs_refcount_irec rc; +}; + +/* Per-AG btree private information. */ +union xfs_btree_cur_private { + struct { + unsigned long nr_ops; /* # record updates */ + int shape_changes; /* # of extent splits */ + } refc; }; /* @@ -243,6 +258,7 @@ typedef struct xfs_btree_cur struct xfs_buf *agbp; /* agf/agi buffer pointer */ struct xfs_defer_ops *dfops; /* deferred updates */ xfs_agnumber_t agno; /* ag number */ + union xfs_btree_cur_private priv; } a; struct { /* needed for BMAP */ struct xfs_inode *ip; /* pointer to our inode */ diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index e96533d178cf..f6e93ef0bffe 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -51,6 +51,8 @@ struct xfs_defer_pending { * find all the space it needs. */ enum xfs_defer_ops_type { + XFS_DEFER_OPS_TYPE_BMAP, + XFS_DEFER_OPS_TYPE_REFCOUNT, XFS_DEFER_OPS_TYPE_RMAP, XFS_DEFER_OPS_TYPE_FREE, XFS_DEFER_OPS_TYPE_MAX, diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 270fb5cf4fa1..f6547fc5e016 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -456,9 +456,11 @@ xfs_sb_has_compat_feature( #define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ #define XFS_SB_FEAT_RO_COMPAT_RMAPBT (1 << 1) /* reverse map btree */ +#define XFS_SB_FEAT_RO_COMPAT_REFLINK (1 << 2) /* reflinked files */ #define XFS_SB_FEAT_RO_COMPAT_ALL \ (XFS_SB_FEAT_RO_COMPAT_FINOBT | \ - XFS_SB_FEAT_RO_COMPAT_RMAPBT) + XFS_SB_FEAT_RO_COMPAT_RMAPBT | \ + XFS_SB_FEAT_RO_COMPAT_REFLINK) #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL static inline bool xfs_sb_has_ro_compat_feature( @@ -546,6 +548,12 @@ static inline bool xfs_sb_version_hasrmapbt(struct xfs_sb *sbp) (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT); } +static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK); +} + /* * end of superblock version macros */ @@ -641,14 +649,17 @@ typedef struct xfs_agf { uuid_t agf_uuid; /* uuid of filesystem */ __be32 agf_rmap_blocks; /* rmapbt blocks used */ - __be32 agf_padding; /* padding */ + __be32 agf_refcount_blocks; /* refcountbt blocks used */ + + __be32 agf_refcount_root; /* refcount tree root block */ + __be32 agf_refcount_level; /* refcount btree levels */ /* * reserve some contiguous space for future logged fields before we add * the unlogged fields. This makes the range logging via flags and * structure offsets much simpler. */ - __be64 agf_spare64[15]; + __be64 agf_spare64[14]; /* unlogged fields, written during buffer writeback. */ __be64 agf_lsn; /* last write sequence */ @@ -674,8 +685,11 @@ typedef struct xfs_agf { #define XFS_AGF_BTREEBLKS 0x00000800 #define XFS_AGF_UUID 0x00001000 #define XFS_AGF_RMAP_BLOCKS 0x00002000 -#define XFS_AGF_SPARE64 0x00004000 -#define XFS_AGF_NUM_BITS 15 +#define XFS_AGF_REFCOUNT_BLOCKS 0x00004000 +#define XFS_AGF_REFCOUNT_ROOT 0x00008000 +#define XFS_AGF_REFCOUNT_LEVEL 0x00010000 +#define XFS_AGF_SPARE64 0x00020000 +#define XFS_AGF_NUM_BITS 18 #define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) #define XFS_AGF_FLAGS \ @@ -693,6 +707,9 @@ typedef struct xfs_agf { { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \ { XFS_AGF_UUID, "UUID" }, \ { XFS_AGF_RMAP_BLOCKS, "RMAP_BLOCKS" }, \ + { XFS_AGF_REFCOUNT_BLOCKS, "REFCOUNT_BLOCKS" }, \ + { XFS_AGF_REFCOUNT_ROOT, "REFCOUNT_ROOT" }, \ + { XFS_AGF_REFCOUNT_LEVEL, "REFCOUNT_LEVEL" }, \ { XFS_AGF_SPARE64, "SPARE64" } /* disk block (xfs_daddr_t) in the AG */ @@ -885,7 +902,8 @@ typedef struct xfs_dinode { __be64 di_changecount; /* number of attribute changes */ __be64 di_lsn; /* flush sequence */ __be64 di_flags2; /* more random flags */ - __u8 di_pad2[16]; /* more padding for future expansion */ + __be32 di_cowextsize; /* basic cow extent size for file */ + __u8 di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ xfs_timestamp_t di_crtime; /* time created */ @@ -1041,9 +1059,14 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) * 16 bits of the XFS_XFLAG_s range. */ #define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */ +#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */ +#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */ #define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) +#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT) +#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT) -#define XFS_DIFLAG2_ANY (XFS_DIFLAG2_DAX) +#define XFS_DIFLAG2_ANY \ + (XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE) /* * Inode number format: @@ -1353,7 +1376,9 @@ struct xfs_owner_info { #define XFS_RMAP_OWN_AG (-5ULL) /* AG freespace btree blocks */ #define XFS_RMAP_OWN_INOBT (-6ULL) /* Inode btree blocks */ #define XFS_RMAP_OWN_INODES (-7ULL) /* Inode chunk */ -#define XFS_RMAP_OWN_MIN (-8ULL) /* guard */ +#define XFS_RMAP_OWN_REFC (-8ULL) /* refcount tree */ +#define XFS_RMAP_OWN_COW (-9ULL) /* cow allocations */ +#define XFS_RMAP_OWN_MIN (-10ULL) /* guard */ #define XFS_RMAP_NON_INODE_OWNER(owner) (!!((owner) & (1ULL << 63))) @@ -1434,6 +1459,62 @@ typedef __be32 xfs_rmap_ptr_t; XFS_IBT_BLOCK(mp) + 1) /* + * Reference Count Btree format definitions + * + */ +#define XFS_REFC_CRC_MAGIC 0x52334643 /* 'R3FC' */ + +unsigned int xfs_refc_block(struct xfs_mount *mp); + +/* + * Data record/key structure + * + * Each record associates a range of physical blocks (starting at + * rc_startblock and ending rc_blockcount blocks later) with a reference + * count (rc_refcount). Extents that are being used to stage a copy on + * write (CoW) operation are recorded in the refcount btree with a + * refcount of 1. All other records must have a refcount > 1 and must + * track an extent mapped only by file data forks. + * + * Extents with a single owner (attributes, metadata, non-shared file + * data) are not tracked here. Free space is also not tracked here. + * This is consistent with pre-reflink XFS. + */ + +/* + * Extents that are being used to stage a copy on write are stored + * in the refcount btree with a refcount of 1 and the upper bit set + * on the startblock. This speeds up mount time deletion of stale + * staging extents because they're all at the right side of the tree. + */ +#define XFS_REFC_COW_START ((xfs_agblock_t)(1U << 31)) +#define REFCNTBT_COWFLAG_BITLEN 1 +#define REFCNTBT_AGBLOCK_BITLEN 31 + +struct xfs_refcount_rec { + __be32 rc_startblock; /* starting block number */ + __be32 rc_blockcount; /* count of blocks */ + __be32 rc_refcount; /* number of inodes linked here */ +}; + +struct xfs_refcount_key { + __be32 rc_startblock; /* starting block number */ +}; + +struct xfs_refcount_irec { + xfs_agblock_t rc_startblock; /* starting block number */ + xfs_extlen_t rc_blockcount; /* count of free blocks */ + xfs_nlink_t rc_refcount; /* number of inodes linked here */ +}; + +#define MAXREFCOUNT ((xfs_nlink_t)~0U) +#define MAXREFCEXTLEN ((xfs_extlen_t)~0U) + +/* btree pointer type */ +typedef __be32 xfs_refcount_ptr_t; + + +/* * BMAP Btree format definitions * * This includes both the root block definition that sits inside an inode fork diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 79455058b752..b72dc821d78b 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -81,14 +81,16 @@ struct getbmapx { #define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ #define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */ #define BMV_IF_NO_HOLES 0x10 /* Do not return holes */ +#define BMV_IF_COWFORK 0x20 /* return CoW fork rather than data */ #define BMV_IF_VALID \ (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \ - BMV_IF_DELALLOC|BMV_IF_NO_HOLES) + BMV_IF_DELALLOC|BMV_IF_NO_HOLES|BMV_IF_COWFORK) /* bmv_oflags values - returned for each non-header segment */ #define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ #define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */ #define BMV_OF_LAST 0x4 /* segment is the last in the file */ +#define BMV_OF_SHARED 0x8 /* segment shared with another file */ /* * Structure for XFS_IOC_FSSETDM. @@ -206,7 +208,8 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ #define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ #define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */ -#define XFS_FSOP_GEOM_FLAGS_RMAPBT 0x80000 /* Reverse mapping btree */ +#define XFS_FSOP_GEOM_FLAGS_RMAPBT 0x80000 /* reverse mapping btree */ +#define XFS_FSOP_GEOM_FLAGS_REFLINK 0x100000 /* files can share blocks */ /* * Minimum and maximum sizes need for growth checks. @@ -275,7 +278,8 @@ typedef struct xfs_bstat { #define bs_projid bs_projid_lo /* (previously just bs_projid) */ __u16 bs_forkoff; /* inode fork offset in bytes */ __u16 bs_projid_hi; /* higher part of project id */ - unsigned char bs_pad[10]; /* pad space, unused */ + unsigned char bs_pad[6]; /* pad space, unused */ + __u32 bs_cowextsize; /* cow extent size */ __u32 bs_dmevmask; /* DMIG event mask */ __u16 bs_dmstate; /* DMIG state info */ __u16 bs_aextents; /* attribute number of extents */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 4b9769e23c83..8de9a3a29589 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -256,6 +256,7 @@ xfs_inode_from_disk( to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); to->di_flags2 = be64_to_cpu(from->di_flags2); + to->di_cowextsize = be32_to_cpu(from->di_cowextsize); } } @@ -305,7 +306,7 @@ xfs_inode_to_disk( to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); to->di_flags2 = cpu_to_be64(from->di_flags2); - + to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(ip->i_ino); to->di_lsn = cpu_to_be64(lsn); memset(to->di_pad2, 0, sizeof(to->di_pad2)); @@ -357,6 +358,7 @@ xfs_log_dinode_to_disk( to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); to->di_flags2 = cpu_to_be64(from->di_flags2); + to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(from->di_ino); to->di_lsn = cpu_to_be64(from->di_lsn); memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); @@ -373,6 +375,9 @@ xfs_dinode_verify( struct xfs_inode *ip, struct xfs_dinode *dip) { + uint16_t flags; + uint64_t flags2; + if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) return false; @@ -389,6 +394,23 @@ xfs_dinode_verify( return false; if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) return false; + + flags = be16_to_cpu(dip->di_flags); + flags2 = be64_to_cpu(dip->di_flags2); + + /* don't allow reflink/cowextsize if we don't have reflink */ + if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) && + !xfs_sb_version_hasreflink(&mp->m_sb)) + return false; + + /* don't let reflink and realtime mix */ + if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME)) + return false; + + /* don't let reflink and dax mix */ + if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags2 & XFS_DIFLAG2_DAX)) + return false; + return true; } diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 7c4dd321b215..62d9d4681c8c 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -47,6 +47,7 @@ struct xfs_icdinode { __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ __uint64_t di_flags2; /* more random flags */ + __uint32_t di_cowextsize; /* basic cow extent size for file */ xfs_ictimestamp_t di_crtime; /* time created */ }; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index bbcc8c7a44b3..5dd56d3dbb3a 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -121,6 +121,26 @@ xfs_iformat_fork( return -EFSCORRUPTED; } + if (unlikely(xfs_is_reflink_inode(ip) && + (VFS_I(ip)->i_mode & S_IFMT) != S_IFREG)) { + xfs_warn(ip->i_mount, + "corrupt dinode %llu, wrong file type for reflink.", + ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(reflink)", + XFS_ERRLEVEL_LOW, ip->i_mount, dip); + return -EFSCORRUPTED; + } + + if (unlikely(xfs_is_reflink_inode(ip) && + (ip->i_d.di_flags & XFS_DIFLAG_REALTIME))) { + xfs_warn(ip->i_mount, + "corrupt dinode %llu, has reflink+realtime flag set.", + ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(reflink)", + XFS_ERRLEVEL_LOW, ip->i_mount, dip); + return -EFSCORRUPTED; + } + switch (VFS_I(ip)->i_mode & S_IFMT) { case S_IFIFO: case S_IFCHR: @@ -186,9 +206,14 @@ xfs_iformat_fork( XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); return -EFSCORRUPTED; } - if (error) { + if (error) return error; + + if (xfs_is_reflink_inode(ip)) { + ASSERT(ip->i_cowfp == NULL); + xfs_ifork_init_cow(ip); } + if (!XFS_DFORK_Q(dip)) return 0; @@ -208,7 +233,8 @@ xfs_iformat_fork( XFS_CORRUPTION_ERROR("xfs_iformat(8)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); - return -EFSCORRUPTED; + error = -EFSCORRUPTED; + break; } error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); @@ -226,6 +252,9 @@ xfs_iformat_fork( if (error) { kmem_zone_free(xfs_ifork_zone, ip->i_afp); ip->i_afp = NULL; + if (ip->i_cowfp) + kmem_zone_free(xfs_ifork_zone, ip->i_cowfp); + ip->i_cowfp = NULL; xfs_idestroy_fork(ip, XFS_DATA_FORK); } return error; @@ -740,6 +769,9 @@ xfs_idestroy_fork( if (whichfork == XFS_ATTR_FORK) { kmem_zone_free(xfs_ifork_zone, ip->i_afp); ip->i_afp = NULL; + } else if (whichfork == XFS_COW_FORK) { + kmem_zone_free(xfs_ifork_zone, ip->i_cowfp); + ip->i_cowfp = NULL; } } @@ -927,6 +959,19 @@ xfs_iext_get_ext( } } +/* Convert bmap state flags to an inode fork. */ +struct xfs_ifork * +xfs_iext_state_to_fork( + struct xfs_inode *ip, + int state) +{ + if (state & BMAP_COWFORK) + return ip->i_cowfp; + else if (state & BMAP_ATTRFORK) + return ip->i_afp; + return &ip->i_df; +} + /* * Insert new item(s) into the extent records for incore inode * fork 'ifp'. 'count' new items are inserted at index 'idx'. @@ -939,7 +984,7 @@ xfs_iext_insert( xfs_bmbt_irec_t *new, /* items to insert */ int state) /* type of extent conversion */ { - xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; + xfs_ifork_t *ifp = xfs_iext_state_to_fork(ip, state); xfs_extnum_t i; /* extent record index */ trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); @@ -1189,7 +1234,7 @@ xfs_iext_remove( int ext_diff, /* number of extents to remove */ int state) /* type of extent conversion */ { - xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; + xfs_ifork_t *ifp = xfs_iext_state_to_fork(ip, state); xfs_extnum_t nextents; /* number of extents in file */ int new_size; /* size of extents after removal */ @@ -1934,3 +1979,20 @@ xfs_iext_irec_update_extoffs( ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; } } + +/* + * Initialize an inode's copy-on-write fork. + */ +void +xfs_ifork_init_cow( + struct xfs_inode *ip) +{ + if (ip->i_cowfp) + return; + + ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone, + KM_SLEEP | KM_NOFS); + ip->i_cowfp->if_flags = XFS_IFEXTENTS; + ip->i_cformat = XFS_DINODE_FMT_EXTENTS; + ip->i_cnextents = 0; +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index f95e072ae646..c9476f50e32d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -92,7 +92,9 @@ typedef struct xfs_ifork { #define XFS_IFORK_PTR(ip,w) \ ((w) == XFS_DATA_FORK ? \ &(ip)->i_df : \ - (ip)->i_afp) + ((w) == XFS_ATTR_FORK ? \ + (ip)->i_afp : \ + (ip)->i_cowfp)) #define XFS_IFORK_DSIZE(ip) \ (XFS_IFORK_Q(ip) ? \ XFS_IFORK_BOFF(ip) : \ @@ -105,26 +107,38 @@ typedef struct xfs_ifork { #define XFS_IFORK_SIZE(ip,w) \ ((w) == XFS_DATA_FORK ? \ XFS_IFORK_DSIZE(ip) : \ - XFS_IFORK_ASIZE(ip)) + ((w) == XFS_ATTR_FORK ? \ + XFS_IFORK_ASIZE(ip) : \ + 0)) #define XFS_IFORK_FORMAT(ip,w) \ ((w) == XFS_DATA_FORK ? \ (ip)->i_d.di_format : \ - (ip)->i_d.di_aformat) + ((w) == XFS_ATTR_FORK ? \ + (ip)->i_d.di_aformat : \ + (ip)->i_cformat)) #define XFS_IFORK_FMT_SET(ip,w,n) \ ((w) == XFS_DATA_FORK ? \ ((ip)->i_d.di_format = (n)) : \ - ((ip)->i_d.di_aformat = (n))) + ((w) == XFS_ATTR_FORK ? \ + ((ip)->i_d.di_aformat = (n)) : \ + ((ip)->i_cformat = (n)))) #define XFS_IFORK_NEXTENTS(ip,w) \ ((w) == XFS_DATA_FORK ? \ (ip)->i_d.di_nextents : \ - (ip)->i_d.di_anextents) + ((w) == XFS_ATTR_FORK ? \ + (ip)->i_d.di_anextents : \ + (ip)->i_cnextents)) #define XFS_IFORK_NEXT_SET(ip,w,n) \ ((w) == XFS_DATA_FORK ? \ ((ip)->i_d.di_nextents = (n)) : \ - ((ip)->i_d.di_anextents = (n))) + ((w) == XFS_ATTR_FORK ? \ + ((ip)->i_d.di_anextents = (n)) : \ + ((ip)->i_cnextents = (n)))) #define XFS_IFORK_MAXEXT(ip, w) \ (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) +struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); + int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); @@ -169,4 +183,6 @@ void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int); extern struct kmem_zone *xfs_ifork_zone; +extern void xfs_ifork_init_cow(struct xfs_inode *ip); + #endif /* __XFS_INODE_FORK_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index fc5eef85d61e..083cdd6d6c28 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -112,7 +112,11 @@ static inline uint xlog_get_cycle(char *ptr) #define XLOG_REG_TYPE_ICREATE 20 #define XLOG_REG_TYPE_RUI_FORMAT 21 #define XLOG_REG_TYPE_RUD_FORMAT 22 -#define XLOG_REG_TYPE_MAX 22 +#define XLOG_REG_TYPE_CUI_FORMAT 23 +#define XLOG_REG_TYPE_CUD_FORMAT 24 +#define XLOG_REG_TYPE_BUI_FORMAT 25 +#define XLOG_REG_TYPE_BUD_FORMAT 26 +#define XLOG_REG_TYPE_MAX 26 /* * Flags to log operation header @@ -231,6 +235,10 @@ typedef struct xfs_trans_header { #define XFS_LI_ICREATE 0x123f #define XFS_LI_RUI 0x1240 /* rmap update intent */ #define XFS_LI_RUD 0x1241 +#define XFS_LI_CUI 0x1242 /* refcount update intent */ +#define XFS_LI_CUD 0x1243 +#define XFS_LI_BUI 0x1244 /* bmbt update intent */ +#define XFS_LI_BUD 0x1245 #define XFS_LI_TYPE_DESC \ { XFS_LI_EFI, "XFS_LI_EFI" }, \ @@ -242,7 +250,11 @@ typedef struct xfs_trans_header { { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \ { XFS_LI_ICREATE, "XFS_LI_ICREATE" }, \ { XFS_LI_RUI, "XFS_LI_RUI" }, \ - { XFS_LI_RUD, "XFS_LI_RUD" } + { XFS_LI_RUD, "XFS_LI_RUD" }, \ + { XFS_LI_CUI, "XFS_LI_CUI" }, \ + { XFS_LI_CUD, "XFS_LI_CUD" }, \ + { XFS_LI_BUI, "XFS_LI_BUI" }, \ + { XFS_LI_BUD, "XFS_LI_BUD" } /* * Inode Log Item Format definitions. @@ -411,7 +423,8 @@ struct xfs_log_dinode { __uint64_t di_changecount; /* number of attribute changes */ xfs_lsn_t di_lsn; /* flush sequence */ __uint64_t di_flags2; /* more random flags */ - __uint8_t di_pad2[16]; /* more padding for future expansion */ + __uint32_t di_cowextsize; /* basic cow extent size for file */ + __uint8_t di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ xfs_ictimestamp_t di_crtime; /* time created */ @@ -622,8 +635,11 @@ struct xfs_map_extent { /* rmap me_flags: upper bits are flags, lower byte is type code */ #define XFS_RMAP_EXTENT_MAP 1 +#define XFS_RMAP_EXTENT_MAP_SHARED 2 #define XFS_RMAP_EXTENT_UNMAP 3 +#define XFS_RMAP_EXTENT_UNMAP_SHARED 4 #define XFS_RMAP_EXTENT_CONVERT 5 +#define XFS_RMAP_EXTENT_CONVERT_SHARED 6 #define XFS_RMAP_EXTENT_ALLOC 7 #define XFS_RMAP_EXTENT_FREE 8 #define XFS_RMAP_EXTENT_TYPE_MASK 0xFF @@ -671,6 +687,102 @@ struct xfs_rud_log_format { }; /* + * CUI/CUD (refcount update) log format definitions + */ +struct xfs_phys_extent { + __uint64_t pe_startblock; + __uint32_t pe_len; + __uint32_t pe_flags; +}; + +/* refcount pe_flags: upper bits are flags, lower byte is type code */ +/* Type codes are taken directly from enum xfs_refcount_intent_type. */ +#define XFS_REFCOUNT_EXTENT_TYPE_MASK 0xFF + +#define XFS_REFCOUNT_EXTENT_FLAGS (XFS_REFCOUNT_EXTENT_TYPE_MASK) + +/* + * This is the structure used to lay out a cui log item in the + * log. The cui_extents field is a variable size array whose + * size is given by cui_nextents. + */ +struct xfs_cui_log_format { + __uint16_t cui_type; /* cui log item type */ + __uint16_t cui_size; /* size of this item */ + __uint32_t cui_nextents; /* # extents to free */ + __uint64_t cui_id; /* cui identifier */ + struct xfs_phys_extent cui_extents[]; /* array of extents */ +}; + +static inline size_t +xfs_cui_log_format_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_cui_log_format) + + nr * sizeof(struct xfs_phys_extent); +} + +/* + * This is the structure used to lay out a cud log item in the + * log. The cud_extents array is a variable size array whose + * size is given by cud_nextents; + */ +struct xfs_cud_log_format { + __uint16_t cud_type; /* cud log item type */ + __uint16_t cud_size; /* size of this item */ + __uint32_t __pad; + __uint64_t cud_cui_id; /* id of corresponding cui */ +}; + +/* + * BUI/BUD (inode block mapping) log format definitions + */ + +/* bmbt me_flags: upper bits are flags, lower byte is type code */ +/* Type codes are taken directly from enum xfs_bmap_intent_type. */ +#define XFS_BMAP_EXTENT_TYPE_MASK 0xFF + +#define XFS_BMAP_EXTENT_ATTR_FORK (1U << 31) +#define XFS_BMAP_EXTENT_UNWRITTEN (1U << 30) + +#define XFS_BMAP_EXTENT_FLAGS (XFS_BMAP_EXTENT_TYPE_MASK | \ + XFS_BMAP_EXTENT_ATTR_FORK | \ + XFS_BMAP_EXTENT_UNWRITTEN) + +/* + * This is the structure used to lay out an bui log item in the + * log. The bui_extents field is a variable size array whose + * size is given by bui_nextents. + */ +struct xfs_bui_log_format { + __uint16_t bui_type; /* bui log item type */ + __uint16_t bui_size; /* size of this item */ + __uint32_t bui_nextents; /* # extents to free */ + __uint64_t bui_id; /* bui identifier */ + struct xfs_map_extent bui_extents[]; /* array of extents to bmap */ +}; + +static inline size_t +xfs_bui_log_format_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_bui_log_format) + + nr * sizeof(struct xfs_map_extent); +} + +/* + * This is the structure used to lay out an bud log item in the + * log. The bud_extents array is a variable size array whose + * size is given by bud_nextents; + */ +struct xfs_bud_log_format { + __uint16_t bud_type; /* bud log item type */ + __uint16_t bud_size; /* size of this item */ + __uint32_t __pad; + __uint64_t bud_bui_id; /* id of corresponding bui */ +}; + +/* * Dquot Log format definitions. * * The first two fields must be the type and size fitting into diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c new file mode 100644 index 000000000000..b177ef33cd4c --- /dev/null +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -0,0 +1,1698 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bmap.h" +#include "xfs_refcount_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_bit.h" +#include "xfs_refcount.h" +#include "xfs_rmap.h" + +/* Allowable refcount adjustment amounts. */ +enum xfs_refc_adjust_op { + XFS_REFCOUNT_ADJUST_INCREASE = 1, + XFS_REFCOUNT_ADJUST_DECREASE = -1, + XFS_REFCOUNT_ADJUST_COW_ALLOC = 0, + XFS_REFCOUNT_ADJUST_COW_FREE = -1, +}; + +STATIC int __xfs_refcount_cow_alloc(struct xfs_btree_cur *rcur, + xfs_agblock_t agbno, xfs_extlen_t aglen, + struct xfs_defer_ops *dfops); +STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur, + xfs_agblock_t agbno, xfs_extlen_t aglen, + struct xfs_defer_ops *dfops); + +/* + * Look up the first record less than or equal to [bno, len] in the btree + * given by cur. + */ +int +xfs_refcount_lookup_le( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + int *stat) +{ + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + XFS_LOOKUP_LE); + cur->bc_rec.rc.rc_startblock = bno; + cur->bc_rec.rc.rc_blockcount = 0; + return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); +} + +/* + * Look up the first record greater than or equal to [bno, len] in the btree + * given by cur. + */ +int +xfs_refcount_lookup_ge( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + int *stat) +{ + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + XFS_LOOKUP_GE); + cur->bc_rec.rc.rc_startblock = bno; + cur->bc_rec.rc.rc_blockcount = 0; + return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); +} + +/* Convert on-disk record to in-core format. */ +static inline void +xfs_refcount_btrec_to_irec( + union xfs_btree_rec *rec, + struct xfs_refcount_irec *irec) +{ + irec->rc_startblock = be32_to_cpu(rec->refc.rc_startblock); + irec->rc_blockcount = be32_to_cpu(rec->refc.rc_blockcount); + irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount); +} + +/* + * Get the data from the pointed-to record. + */ +int +xfs_refcount_get_rec( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec, + int *stat) +{ + union xfs_btree_rec *rec; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (!error && *stat == 1) { + xfs_refcount_btrec_to_irec(rec, irec); + trace_xfs_refcount_get(cur->bc_mp, cur->bc_private.a.agno, + irec); + } + return error; +} + +/* + * Update the record referred to by cur to the value given + * by [bno, len, refcount]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int +xfs_refcount_update( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec) +{ + union xfs_btree_rec rec; + int error; + + trace_xfs_refcount_update(cur->bc_mp, cur->bc_private.a.agno, irec); + rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock); + rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount); + rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount); + error = xfs_btree_update(cur, &rec); + if (error) + trace_xfs_refcount_update_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Insert the record referred to by cur to the value given + * by [bno, len, refcount]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int +xfs_refcount_insert( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec, + int *i) +{ + int error; + + trace_xfs_refcount_insert(cur->bc_mp, cur->bc_private.a.agno, irec); + cur->bc_rec.rc.rc_startblock = irec->rc_startblock; + cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; + cur->bc_rec.rc.rc_refcount = irec->rc_refcount; + error = xfs_btree_insert(cur, i); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error); +out_error: + if (error) + trace_xfs_refcount_insert_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Remove the record referred to by cur, then set the pointer to the spot + * where the record could be re-inserted, in case we want to increment or + * decrement the cursor. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int +xfs_refcount_delete( + struct xfs_btree_cur *cur, + int *i) +{ + struct xfs_refcount_irec irec; + int found_rec; + int error; + + error = xfs_refcount_get_rec(cur, &irec, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + trace_xfs_refcount_delete(cur->bc_mp, cur->bc_private.a.agno, &irec); + error = xfs_btree_delete(cur, i); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error); + if (error) + goto out_error; + error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec); +out_error: + if (error) + trace_xfs_refcount_delete_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Adjusting the Reference Count + * + * As stated elsewhere, the reference count btree (refcbt) stores + * >1 reference counts for extents of physical blocks. In this + * operation, we're either raising or lowering the reference count of + * some subrange stored in the tree: + * + * <------ adjustment range ------> + * ----+ +---+-----+ +--+--------+--------- + * 2 | | 3 | 4 | |17| 55 | 10 + * ----+ +---+-----+ +--+--------+--------- + * X axis is physical blocks number; + * reference counts are the numbers inside the rectangles + * + * The first thing we need to do is to ensure that there are no + * refcount extents crossing either boundary of the range to be + * adjusted. For any extent that does cross a boundary, split it into + * two extents so that we can increment the refcount of one of the + * pieces later: + * + * <------ adjustment range ------> + * ----+ +---+-----+ +--+--------+----+---- + * 2 | | 3 | 2 | |17| 55 | 10 | 10 + * ----+ +---+-----+ +--+--------+----+---- + * + * For this next step, let's assume that all the physical blocks in + * the adjustment range are mapped to a file and are therefore in use + * at least once. Therefore, we can infer that any gap in the + * refcount tree within the adjustment range represents a physical + * extent with refcount == 1: + * + * <------ adjustment range ------> + * ----+---+---+-----+-+--+--------+----+---- + * 2 |"1"| 3 | 2 |1|17| 55 | 10 | 10 + * ----+---+---+-----+-+--+--------+----+---- + * ^ + * + * For each extent that falls within the interval range, figure out + * which extent is to the left or the right of that extent. Now we + * have a left, current, and right extent. If the new reference count + * of the center extent enables us to merge left, center, and right + * into one record covering all three, do so. If the center extent is + * at the left end of the range, abuts the left extent, and its new + * reference count matches the left extent's record, then merge them. + * If the center extent is at the right end of the range, abuts the + * right extent, and the reference counts match, merge those. In the + * example, we can left merge (assuming an increment operation): + * + * <------ adjustment range ------> + * --------+---+-----+-+--+--------+----+---- + * 2 | 3 | 2 |1|17| 55 | 10 | 10 + * --------+---+-----+-+--+--------+----+---- + * ^ + * + * For all other extents within the range, adjust the reference count + * or delete it if the refcount falls below 2. If we were + * incrementing, the end result looks like this: + * + * <------ adjustment range ------> + * --------+---+-----+-+--+--------+----+---- + * 2 | 4 | 3 |2|18| 56 | 11 | 10 + * --------+---+-----+-+--+--------+----+---- + * + * The result of a decrement operation looks as such: + * + * <------ adjustment range ------> + * ----+ +---+ +--+--------+----+---- + * 2 | | 2 | |16| 54 | 9 | 10 + * ----+ +---+ +--+--------+----+---- + * DDDD 111111DD + * + * The blocks marked "D" are freed; the blocks marked "1" are only + * referenced once and therefore the record is removed from the + * refcount btree. + */ + +/* Next block after this extent. */ +static inline xfs_agblock_t +xfs_refc_next( + struct xfs_refcount_irec *rc) +{ + return rc->rc_startblock + rc->rc_blockcount; +} + +/* + * Split a refcount extent that crosses agbno. + */ +STATIC int +xfs_refcount_split_extent( + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + bool *shape_changed) +{ + struct xfs_refcount_irec rcext, tmp; + int found_rec; + int error; + + *shape_changed = false; + error = xfs_refcount_lookup_le(cur, agbno, &found_rec); + if (error) + goto out_error; + if (!found_rec) + return 0; + + error = xfs_refcount_get_rec(cur, &rcext, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno) + return 0; + + *shape_changed = true; + trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_private.a.agno, + &rcext, agbno); + + /* Establish the right extent. */ + tmp = rcext; + tmp.rc_startblock = agbno; + tmp.rc_blockcount -= (agbno - rcext.rc_startblock); + error = xfs_refcount_update(cur, &tmp); + if (error) + goto out_error; + + /* Insert the left extent. */ + tmp = rcext; + tmp.rc_blockcount = agbno - rcext.rc_startblock; + error = xfs_refcount_insert(cur, &tmp, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + return error; + +out_error: + trace_xfs_refcount_split_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Merge the left, center, and right extents. + */ +STATIC int +xfs_refcount_merge_center_extents( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *left, + struct xfs_refcount_irec *center, + struct xfs_refcount_irec *right, + unsigned long long extlen, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen) +{ + int error; + int found_rec; + + trace_xfs_refcount_merge_center_extents(cur->bc_mp, + cur->bc_private.a.agno, left, center, right); + + /* + * Make sure the center and right extents are not in the btree. + * If the center extent was synthesized, the first delete call + * removes the right extent and we skip the second deletion. + * If center and right were in the btree, then the first delete + * call removes the center and the second one removes the right + * extent. + */ + error = xfs_refcount_lookup_ge(cur, center->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + if (center->rc_refcount > 1) { + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + } + + /* Enlarge the left extent. */ + error = xfs_refcount_lookup_le(cur, left->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + left->rc_blockcount = extlen; + error = xfs_refcount_update(cur, left); + if (error) + goto out_error; + + *aglen = 0; + return error; + +out_error: + trace_xfs_refcount_merge_center_extents_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Merge with the left extent. + */ +STATIC int +xfs_refcount_merge_left_extent( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *left, + struct xfs_refcount_irec *cleft, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen) +{ + int error; + int found_rec; + + trace_xfs_refcount_merge_left_extent(cur->bc_mp, + cur->bc_private.a.agno, left, cleft); + + /* If the extent at agbno (cleft) wasn't synthesized, remove it. */ + if (cleft->rc_refcount > 1) { + error = xfs_refcount_lookup_le(cur, cleft->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + } + + /* Enlarge the left extent. */ + error = xfs_refcount_lookup_le(cur, left->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + left->rc_blockcount += cleft->rc_blockcount; + error = xfs_refcount_update(cur, left); + if (error) + goto out_error; + + *agbno += cleft->rc_blockcount; + *aglen -= cleft->rc_blockcount; + return error; + +out_error: + trace_xfs_refcount_merge_left_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Merge with the right extent. + */ +STATIC int +xfs_refcount_merge_right_extent( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *right, + struct xfs_refcount_irec *cright, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen) +{ + int error; + int found_rec; + + trace_xfs_refcount_merge_right_extent(cur->bc_mp, + cur->bc_private.a.agno, cright, right); + + /* + * If the extent ending at agbno+aglen (cright) wasn't synthesized, + * remove it. + */ + if (cright->rc_refcount > 1) { + error = xfs_refcount_lookup_le(cur, cright->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + } + + /* Enlarge the right extent. */ + error = xfs_refcount_lookup_le(cur, right->rc_startblock, + &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + right->rc_startblock -= cright->rc_blockcount; + right->rc_blockcount += cright->rc_blockcount; + error = xfs_refcount_update(cur, right); + if (error) + goto out_error; + + *aglen -= cright->rc_blockcount; + return error; + +out_error: + trace_xfs_refcount_merge_right_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +#define XFS_FIND_RCEXT_SHARED 1 +#define XFS_FIND_RCEXT_COW 2 +/* + * Find the left extent and the one after it (cleft). This function assumes + * that we've already split any extent crossing agbno. + */ +STATIC int +xfs_refcount_find_left_extents( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *left, + struct xfs_refcount_irec *cleft, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + int flags) +{ + struct xfs_refcount_irec tmp; + int error; + int found_rec; + + left->rc_startblock = cleft->rc_startblock = NULLAGBLOCK; + error = xfs_refcount_lookup_le(cur, agbno - 1, &found_rec); + if (error) + goto out_error; + if (!found_rec) + return 0; + + error = xfs_refcount_get_rec(cur, &tmp, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + if (xfs_refc_next(&tmp) != agbno) + return 0; + if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2) + return 0; + if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1) + return 0; + /* We have a left extent; retrieve (or invent) the next right one */ + *left = tmp; + + error = xfs_btree_increment(cur, 0, &found_rec); + if (error) + goto out_error; + if (found_rec) { + error = xfs_refcount_get_rec(cur, &tmp, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + + /* if tmp starts at the end of our range, just use that */ + if (tmp.rc_startblock == agbno) + *cleft = tmp; + else { + /* + * There's a gap in the refcntbt at the start of the + * range we're interested in (refcount == 1) so + * synthesize the implied extent and pass it back. + * We assume here that the agbno/aglen range was + * passed in from a data fork extent mapping and + * therefore is allocated to exactly one owner. + */ + cleft->rc_startblock = agbno; + cleft->rc_blockcount = min(aglen, + tmp.rc_startblock - agbno); + cleft->rc_refcount = 1; + } + } else { + /* + * No extents, so pretend that there's one covering the whole + * range. + */ + cleft->rc_startblock = agbno; + cleft->rc_blockcount = aglen; + cleft->rc_refcount = 1; + } + trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_private.a.agno, + left, cleft, agbno); + return error; + +out_error: + trace_xfs_refcount_find_left_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Find the right extent and the one before it (cright). This function + * assumes that we've already split any extents crossing agbno + aglen. + */ +STATIC int +xfs_refcount_find_right_extents( + struct xfs_btree_cur *cur, + struct xfs_refcount_irec *right, + struct xfs_refcount_irec *cright, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + int flags) +{ + struct xfs_refcount_irec tmp; + int error; + int found_rec; + + right->rc_startblock = cright->rc_startblock = NULLAGBLOCK; + error = xfs_refcount_lookup_ge(cur, agbno + aglen, &found_rec); + if (error) + goto out_error; + if (!found_rec) + return 0; + + error = xfs_refcount_get_rec(cur, &tmp, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error); + + if (tmp.rc_startblock != agbno + aglen) + return 0; + if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2) + return 0; + if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1) + return 0; + /* We have a right extent; retrieve (or invent) the next left one */ + *right = tmp; + + error = xfs_btree_decrement(cur, 0, &found_rec); + if (error) + goto out_error; + if (found_rec) { + error = xfs_refcount_get_rec(cur, &tmp, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, + out_error); + + /* if tmp ends at the end of our range, just use that */ + if (xfs_refc_next(&tmp) == agbno + aglen) + *cright = tmp; + else { + /* + * There's a gap in the refcntbt at the end of the + * range we're interested in (refcount == 1) so + * create the implied extent and pass it back. + * We assume here that the agbno/aglen range was + * passed in from a data fork extent mapping and + * therefore is allocated to exactly one owner. + */ + cright->rc_startblock = max(agbno, xfs_refc_next(&tmp)); + cright->rc_blockcount = right->rc_startblock - + cright->rc_startblock; + cright->rc_refcount = 1; + } + } else { + /* + * No extents, so pretend that there's one covering the whole + * range. + */ + cright->rc_startblock = agbno; + cright->rc_blockcount = aglen; + cright->rc_refcount = 1; + } + trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_private.a.agno, + cright, right, agbno + aglen); + return error; + +out_error: + trace_xfs_refcount_find_right_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* Is this extent valid? */ +static inline bool +xfs_refc_valid( + struct xfs_refcount_irec *rc) +{ + return rc->rc_startblock != NULLAGBLOCK; +} + +/* + * Try to merge with any extents on the boundaries of the adjustment range. + */ +STATIC int +xfs_refcount_merge_extents( + struct xfs_btree_cur *cur, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen, + enum xfs_refc_adjust_op adjust, + int flags, + bool *shape_changed) +{ + struct xfs_refcount_irec left = {0}, cleft = {0}; + struct xfs_refcount_irec cright = {0}, right = {0}; + int error; + unsigned long long ulen; + bool cequal; + + *shape_changed = false; + /* + * Find the extent just below agbno [left], just above agbno [cleft], + * just below (agbno + aglen) [cright], and just above (agbno + aglen) + * [right]. + */ + error = xfs_refcount_find_left_extents(cur, &left, &cleft, *agbno, + *aglen, flags); + if (error) + return error; + error = xfs_refcount_find_right_extents(cur, &right, &cright, *agbno, + *aglen, flags); + if (error) + return error; + + /* No left or right extent to merge; exit. */ + if (!xfs_refc_valid(&left) && !xfs_refc_valid(&right)) + return 0; + + cequal = (cleft.rc_startblock == cright.rc_startblock) && + (cleft.rc_blockcount == cright.rc_blockcount); + + /* Try to merge left, cleft, and right. cleft must == cright. */ + ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount + + right.rc_blockcount; + if (xfs_refc_valid(&left) && xfs_refc_valid(&right) && + xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal && + left.rc_refcount == cleft.rc_refcount + adjust && + right.rc_refcount == cleft.rc_refcount + adjust && + ulen < MAXREFCEXTLEN) { + *shape_changed = true; + return xfs_refcount_merge_center_extents(cur, &left, &cleft, + &right, ulen, agbno, aglen); + } + + /* Try to merge left and cleft. */ + ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount; + if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) && + left.rc_refcount == cleft.rc_refcount + adjust && + ulen < MAXREFCEXTLEN) { + *shape_changed = true; + error = xfs_refcount_merge_left_extent(cur, &left, &cleft, + agbno, aglen); + if (error) + return error; + + /* + * If we just merged left + cleft and cleft == cright, + * we no longer have a cright to merge with right. We're done. + */ + if (cequal) + return 0; + } + + /* Try to merge cright and right. */ + ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount; + if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) && + right.rc_refcount == cright.rc_refcount + adjust && + ulen < MAXREFCEXTLEN) { + *shape_changed = true; + return xfs_refcount_merge_right_extent(cur, &right, &cright, + agbno, aglen); + } + + return error; +} + +/* + * While we're adjusting the refcounts records of an extent, we have + * to keep an eye on the number of extents we're dirtying -- run too + * many in a single transaction and we'll exceed the transaction's + * reservation and crash the fs. Each record adds 12 bytes to the + * log (plus any key updates) so we'll conservatively assume 24 bytes + * per record. We must also leave space for btree splits on both ends + * of the range and space for the CUD and a new CUI. + * + * XXX: This is a pretty hand-wavy estimate. The penalty for guessing + * true incorrectly is a shutdown FS; the penalty for guessing false + * incorrectly is more transaction rolls than might be necessary. + * Be conservative here. + */ +static bool +xfs_refcount_still_have_space( + struct xfs_btree_cur *cur) +{ + unsigned long overhead; + + overhead = cur->bc_private.a.priv.refc.shape_changes * + xfs_allocfree_log_count(cur->bc_mp, 1); + overhead *= cur->bc_mp->m_sb.sb_blocksize; + + /* + * Only allow 2 refcount extent updates per transaction if the + * refcount continue update "error" has been injected. + */ + if (cur->bc_private.a.priv.refc.nr_ops > 2 && + XFS_TEST_ERROR(false, cur->bc_mp, + XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE, + XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE)) + return false; + + if (cur->bc_private.a.priv.refc.nr_ops == 0) + return true; + else if (overhead > cur->bc_tp->t_log_res) + return false; + return cur->bc_tp->t_log_res - overhead > + cur->bc_private.a.priv.refc.nr_ops * 32; +} + +/* + * Adjust the refcounts of middle extents. At this point we should have + * split extents that crossed the adjustment range; merged with adjacent + * extents; and updated agbno/aglen to reflect the merges. Therefore, + * all we have to do is update the extents inside [agbno, agbno + aglen]. + */ +STATIC int +xfs_refcount_adjust_extents( + struct xfs_btree_cur *cur, + xfs_agblock_t *agbno, + xfs_extlen_t *aglen, + enum xfs_refc_adjust_op adj, + struct xfs_defer_ops *dfops, + struct xfs_owner_info *oinfo) +{ + struct xfs_refcount_irec ext, tmp; + int error; + int found_rec, found_tmp; + xfs_fsblock_t fsbno; + + /* Merging did all the work already. */ + if (*aglen == 0) + return 0; + + error = xfs_refcount_lookup_ge(cur, *agbno, &found_rec); + if (error) + goto out_error; + + while (*aglen > 0 && xfs_refcount_still_have_space(cur)) { + error = xfs_refcount_get_rec(cur, &ext, &found_rec); + if (error) + goto out_error; + if (!found_rec) { + ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks; + ext.rc_blockcount = 0; + ext.rc_refcount = 0; + } + + /* + * Deal with a hole in the refcount tree; if a file maps to + * these blocks and there's no refcountbt record, pretend that + * there is one with refcount == 1. + */ + if (ext.rc_startblock != *agbno) { + tmp.rc_startblock = *agbno; + tmp.rc_blockcount = min(*aglen, + ext.rc_startblock - *agbno); + tmp.rc_refcount = 1 + adj; + trace_xfs_refcount_modify_extent(cur->bc_mp, + cur->bc_private.a.agno, &tmp); + + /* + * Either cover the hole (increment) or + * delete the range (decrement). + */ + if (tmp.rc_refcount) { + error = xfs_refcount_insert(cur, &tmp, + &found_tmp); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + found_tmp == 1, out_error); + cur->bc_private.a.priv.refc.nr_ops++; + } else { + fsbno = XFS_AGB_TO_FSB(cur->bc_mp, + cur->bc_private.a.agno, + tmp.rc_startblock); + xfs_bmap_add_free(cur->bc_mp, dfops, fsbno, + tmp.rc_blockcount, oinfo); + } + + (*agbno) += tmp.rc_blockcount; + (*aglen) -= tmp.rc_blockcount; + + error = xfs_refcount_lookup_ge(cur, *agbno, + &found_rec); + if (error) + goto out_error; + } + + /* Stop if there's nothing left to modify */ + if (*aglen == 0 || !xfs_refcount_still_have_space(cur)) + break; + + /* + * Adjust the reference count and either update the tree + * (incr) or free the blocks (decr). + */ + if (ext.rc_refcount == MAXREFCOUNT) + goto skip; + ext.rc_refcount += adj; + trace_xfs_refcount_modify_extent(cur->bc_mp, + cur->bc_private.a.agno, &ext); + if (ext.rc_refcount > 1) { + error = xfs_refcount_update(cur, &ext); + if (error) + goto out_error; + cur->bc_private.a.priv.refc.nr_ops++; + } else if (ext.rc_refcount == 1) { + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + found_rec == 1, out_error); + cur->bc_private.a.priv.refc.nr_ops++; + goto advloop; + } else { + fsbno = XFS_AGB_TO_FSB(cur->bc_mp, + cur->bc_private.a.agno, + ext.rc_startblock); + xfs_bmap_add_free(cur->bc_mp, dfops, fsbno, + ext.rc_blockcount, oinfo); + } + +skip: + error = xfs_btree_increment(cur, 0, &found_rec); + if (error) + goto out_error; + +advloop: + (*agbno) += ext.rc_blockcount; + (*aglen) -= ext.rc_blockcount; + } + + return error; +out_error: + trace_xfs_refcount_modify_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* Adjust the reference count of a range of AG blocks. */ +STATIC int +xfs_refcount_adjust( + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + xfs_agblock_t *new_agbno, + xfs_extlen_t *new_aglen, + enum xfs_refc_adjust_op adj, + struct xfs_defer_ops *dfops, + struct xfs_owner_info *oinfo) +{ + bool shape_changed; + int shape_changes = 0; + int error; + + *new_agbno = agbno; + *new_aglen = aglen; + if (adj == XFS_REFCOUNT_ADJUST_INCREASE) + trace_xfs_refcount_increase(cur->bc_mp, cur->bc_private.a.agno, + agbno, aglen); + else + trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_private.a.agno, + agbno, aglen); + + /* + * Ensure that no rcextents cross the boundary of the adjustment range. + */ + error = xfs_refcount_split_extent(cur, agbno, &shape_changed); + if (error) + goto out_error; + if (shape_changed) + shape_changes++; + + error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed); + if (error) + goto out_error; + if (shape_changed) + shape_changes++; + + /* + * Try to merge with the left or right extents of the range. + */ + error = xfs_refcount_merge_extents(cur, new_agbno, new_aglen, adj, + XFS_FIND_RCEXT_SHARED, &shape_changed); + if (error) + goto out_error; + if (shape_changed) + shape_changes++; + if (shape_changes) + cur->bc_private.a.priv.refc.shape_changes++; + + /* Now that we've taken care of the ends, adjust the middle extents */ + error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, + adj, dfops, oinfo); + if (error) + goto out_error; + + return 0; + +out_error: + trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_private.a.agno, + error, _RET_IP_); + return error; +} + +/* Clean up after calling xfs_refcount_finish_one. */ +void +xfs_refcount_finish_one_cleanup( + struct xfs_trans *tp, + struct xfs_btree_cur *rcur, + int error) +{ + struct xfs_buf *agbp; + + if (rcur == NULL) + return; + agbp = rcur->bc_private.a.agbp; + xfs_btree_del_cursor(rcur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + if (error) + xfs_trans_brelse(tp, agbp); +} + +/* + * Process one of the deferred refcount operations. We pass back the + * btree cursor to maintain our lock on the btree between calls. + * This saves time and eliminates a buffer deadlock between the + * superblock and the AGF because we'll always grab them in the same + * order. + */ +int +xfs_refcount_finish_one( + struct xfs_trans *tp, + struct xfs_defer_ops *dfops, + enum xfs_refcount_intent_type type, + xfs_fsblock_t startblock, + xfs_extlen_t blockcount, + xfs_fsblock_t *new_fsb, + xfs_extlen_t *new_len, + struct xfs_btree_cur **pcur) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_btree_cur *rcur; + struct xfs_buf *agbp = NULL; + int error = 0; + xfs_agnumber_t agno; + xfs_agblock_t bno; + xfs_agblock_t new_agbno; + unsigned long nr_ops = 0; + int shape_changes = 0; + + agno = XFS_FSB_TO_AGNO(mp, startblock); + ASSERT(agno != NULLAGNUMBER); + bno = XFS_FSB_TO_AGBNO(mp, startblock); + + trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock), + type, XFS_FSB_TO_AGBNO(mp, startblock), + blockcount); + + if (XFS_TEST_ERROR(false, mp, + XFS_ERRTAG_REFCOUNT_FINISH_ONE, + XFS_RANDOM_REFCOUNT_FINISH_ONE)) + return -EIO; + + /* + * If we haven't gotten a cursor or the cursor AG doesn't match + * the startblock, get one now. + */ + rcur = *pcur; + if (rcur != NULL && rcur->bc_private.a.agno != agno) { + nr_ops = rcur->bc_private.a.priv.refc.nr_ops; + shape_changes = rcur->bc_private.a.priv.refc.shape_changes; + xfs_refcount_finish_one_cleanup(tp, rcur, 0); + rcur = NULL; + *pcur = NULL; + } + if (rcur == NULL) { + error = xfs_alloc_read_agf(tp->t_mountp, tp, agno, + XFS_ALLOC_FLAG_FREEING, &agbp); + if (error) + return error; + if (!agbp) + return -EFSCORRUPTED; + + rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, dfops); + if (!rcur) { + error = -ENOMEM; + goto out_cur; + } + rcur->bc_private.a.priv.refc.nr_ops = nr_ops; + rcur->bc_private.a.priv.refc.shape_changes = shape_changes; + } + *pcur = rcur; + + switch (type) { + case XFS_REFCOUNT_INCREASE: + error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, + new_len, XFS_REFCOUNT_ADJUST_INCREASE, dfops, NULL); + *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno); + break; + case XFS_REFCOUNT_DECREASE: + error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, + new_len, XFS_REFCOUNT_ADJUST_DECREASE, dfops, NULL); + *new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno); + break; + case XFS_REFCOUNT_ALLOC_COW: + *new_fsb = startblock + blockcount; + *new_len = 0; + error = __xfs_refcount_cow_alloc(rcur, bno, blockcount, dfops); + break; + case XFS_REFCOUNT_FREE_COW: + *new_fsb = startblock + blockcount; + *new_len = 0; + error = __xfs_refcount_cow_free(rcur, bno, blockcount, dfops); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + } + if (!error && *new_len > 0) + trace_xfs_refcount_finish_one_leftover(mp, agno, type, + bno, blockcount, new_agbno, *new_len); + return error; + +out_cur: + xfs_trans_brelse(tp, agbp); + + return error; +} + +/* + * Record a refcount intent for later processing. + */ +static int +__xfs_refcount_add( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + enum xfs_refcount_intent_type type, + xfs_fsblock_t startblock, + xfs_extlen_t blockcount) +{ + struct xfs_refcount_intent *ri; + + trace_xfs_refcount_defer(mp, XFS_FSB_TO_AGNO(mp, startblock), + type, XFS_FSB_TO_AGBNO(mp, startblock), + blockcount); + + ri = kmem_alloc(sizeof(struct xfs_refcount_intent), + KM_SLEEP | KM_NOFS); + INIT_LIST_HEAD(&ri->ri_list); + ri->ri_type = type; + ri->ri_startblock = startblock; + ri->ri_blockcount = blockcount; + + xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); + return 0; +} + +/* + * Increase the reference count of the blocks backing a file's extent. + */ +int +xfs_refcount_increase_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_INCREASE, + PREV->br_startblock, PREV->br_blockcount); +} + +/* + * Decrease the reference count of the blocks backing a file's extent. + */ +int +xfs_refcount_decrease_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + struct xfs_bmbt_irec *PREV) +{ + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_DECREASE, + PREV->br_startblock, PREV->br_blockcount); +} + +/* + * Given an AG extent, find the lowest-numbered run of shared blocks + * within that range and return the range in fbno/flen. If + * find_end_of_shared is set, return the longest contiguous extent of + * shared blocks; if not, just return the first extent we find. If no + * shared blocks are found, fbno and flen will be set to NULLAGBLOCK + * and 0, respectively. + */ +int +xfs_refcount_find_shared( + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + xfs_agblock_t *fbno, + xfs_extlen_t *flen, + bool find_end_of_shared) +{ + struct xfs_refcount_irec tmp; + int i; + int have; + int error; + + trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_private.a.agno, + agbno, aglen); + + /* By default, skip the whole range */ + *fbno = NULLAGBLOCK; + *flen = 0; + + /* Try to find a refcount extent that crosses the start */ + error = xfs_refcount_lookup_le(cur, agbno, &have); + if (error) + goto out_error; + if (!have) { + /* No left extent, look at the next one */ + error = xfs_btree_increment(cur, 0, &have); + if (error) + goto out_error; + if (!have) + goto done; + } + error = xfs_refcount_get_rec(cur, &tmp, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error); + + /* If the extent ends before the start, look at the next one */ + if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) { + error = xfs_btree_increment(cur, 0, &have); + if (error) + goto out_error; + if (!have) + goto done; + error = xfs_refcount_get_rec(cur, &tmp, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error); + } + + /* If the extent starts after the range we want, bail out */ + if (tmp.rc_startblock >= agbno + aglen) + goto done; + + /* We found the start of a shared extent! */ + if (tmp.rc_startblock < agbno) { + tmp.rc_blockcount -= (agbno - tmp.rc_startblock); + tmp.rc_startblock = agbno; + } + + *fbno = tmp.rc_startblock; + *flen = min(tmp.rc_blockcount, agbno + aglen - *fbno); + if (!find_end_of_shared) + goto done; + + /* Otherwise, find the end of this shared extent */ + while (*fbno + *flen < agbno + aglen) { + error = xfs_btree_increment(cur, 0, &have); + if (error) + goto out_error; + if (!have) + break; + error = xfs_refcount_get_rec(cur, &tmp, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error); + if (tmp.rc_startblock >= agbno + aglen || + tmp.rc_startblock != *fbno + *flen) + break; + *flen = min(*flen + tmp.rc_blockcount, agbno + aglen - *fbno); + } + +done: + trace_xfs_refcount_find_shared_result(cur->bc_mp, + cur->bc_private.a.agno, *fbno, *flen); + +out_error: + if (error) + trace_xfs_refcount_find_shared_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Recovering CoW Blocks After a Crash + * + * Due to the way that the copy on write mechanism works, there's a window of + * opportunity in which we can lose track of allocated blocks during a crash. + * Because CoW uses delayed allocation in the in-core CoW fork, writeback + * causes blocks to be allocated and stored in the CoW fork. The blocks are + * no longer in the free space btree but are not otherwise recorded anywhere + * until the write completes and the blocks are mapped into the file. A crash + * in between allocation and remapping results in the replacement blocks being + * lost. This situation is exacerbated by the CoW extent size hint because + * allocations can hang around for long time. + * + * However, there is a place where we can record these allocations before they + * become mappings -- the reference count btree. The btree does not record + * extents with refcount == 1, so we can record allocations with a refcount of + * 1. Blocks being used for CoW writeout cannot be shared, so there should be + * no conflict with shared block records. These mappings should be created + * when we allocate blocks to the CoW fork and deleted when they're removed + * from the CoW fork. + * + * Minor nit: records for in-progress CoW allocations and records for shared + * extents must never be merged, to preserve the property that (except for CoW + * allocations) there are no refcount btree entries with refcount == 1. The + * only time this could potentially happen is when unsharing a block that's + * adjacent to CoW allocations, so we must be careful to avoid this. + * + * At mount time we recover lost CoW allocations by searching the refcount + * btree for these refcount == 1 mappings. These represent CoW allocations + * that were in progress at the time the filesystem went down, so we can free + * them to get the space back. + * + * This mechanism is superior to creating EFIs for unmapped CoW extents for + * several reasons -- first, EFIs pin the tail of the log and would have to be + * periodically relogged to avoid filling up the log. Second, CoW completions + * will have to file an EFD and create new EFIs for whatever remains in the + * CoW fork; this partially takes care of (1) but extent-size reservations + * will have to periodically relog even if there's no writeout in progress. + * This can happen if the CoW extent size hint is set, which you really want. + * Third, EFIs cannot currently be automatically relogged into newer + * transactions to advance the log tail. Fourth, stuffing the log full of + * EFIs places an upper bound on the number of CoW allocations that can be + * held filesystem-wide at any given time. Recording them in the refcount + * btree doesn't require us to maintain any state in memory and doesn't pin + * the log. + */ +/* + * Adjust the refcounts of CoW allocations. These allocations are "magic" + * in that they're not referenced anywhere else in the filesystem, so we + * stash them in the refcount btree with a refcount of 1 until either file + * remapping (or CoW cancellation) happens. + */ +STATIC int +xfs_refcount_adjust_cow_extents( + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + enum xfs_refc_adjust_op adj, + struct xfs_defer_ops *dfops, + struct xfs_owner_info *oinfo) +{ + struct xfs_refcount_irec ext, tmp; + int error; + int found_rec, found_tmp; + + if (aglen == 0) + return 0; + + /* Find any overlapping refcount records */ + error = xfs_refcount_lookup_ge(cur, agbno, &found_rec); + if (error) + goto out_error; + error = xfs_refcount_get_rec(cur, &ext, &found_rec); + if (error) + goto out_error; + if (!found_rec) { + ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks + + XFS_REFC_COW_START; + ext.rc_blockcount = 0; + ext.rc_refcount = 0; + } + + switch (adj) { + case XFS_REFCOUNT_ADJUST_COW_ALLOC: + /* Adding a CoW reservation, there should be nothing here. */ + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + ext.rc_startblock >= agbno + aglen, out_error); + + tmp.rc_startblock = agbno; + tmp.rc_blockcount = aglen; + tmp.rc_refcount = 1; + trace_xfs_refcount_modify_extent(cur->bc_mp, + cur->bc_private.a.agno, &tmp); + + error = xfs_refcount_insert(cur, &tmp, + &found_tmp); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + found_tmp == 1, out_error); + break; + case XFS_REFCOUNT_ADJUST_COW_FREE: + /* Removing a CoW reservation, there should be one extent. */ + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + ext.rc_startblock == agbno, out_error); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + ext.rc_blockcount == aglen, out_error); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + ext.rc_refcount == 1, out_error); + + ext.rc_refcount = 0; + trace_xfs_refcount_modify_extent(cur->bc_mp, + cur->bc_private.a.agno, &ext); + error = xfs_refcount_delete(cur, &found_rec); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, + found_rec == 1, out_error); + break; + default: + ASSERT(0); + } + + return error; +out_error: + trace_xfs_refcount_modify_extent_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Add or remove refcount btree entries for CoW reservations. + */ +STATIC int +xfs_refcount_adjust_cow( + struct xfs_btree_cur *cur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + enum xfs_refc_adjust_op adj, + struct xfs_defer_ops *dfops) +{ + bool shape_changed; + int error; + + agbno += XFS_REFC_COW_START; + + /* + * Ensure that no rcextents cross the boundary of the adjustment range. + */ + error = xfs_refcount_split_extent(cur, agbno, &shape_changed); + if (error) + goto out_error; + + error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed); + if (error) + goto out_error; + + /* + * Try to merge with the left or right extents of the range. + */ + error = xfs_refcount_merge_extents(cur, &agbno, &aglen, adj, + XFS_FIND_RCEXT_COW, &shape_changed); + if (error) + goto out_error; + + /* Now that we've taken care of the ends, adjust the middle extents */ + error = xfs_refcount_adjust_cow_extents(cur, agbno, aglen, adj, + dfops, NULL); + if (error) + goto out_error; + + return 0; + +out_error: + trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_private.a.agno, + error, _RET_IP_); + return error; +} + +/* + * Record a CoW allocation in the refcount btree. + */ +STATIC int +__xfs_refcount_cow_alloc( + struct xfs_btree_cur *rcur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + struct xfs_defer_ops *dfops) +{ + int error; + + trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno, + agbno, aglen); + + /* Add refcount btree reservation */ + error = xfs_refcount_adjust_cow(rcur, agbno, aglen, + XFS_REFCOUNT_ADJUST_COW_ALLOC, dfops); + if (error) + return error; + + /* Add rmap entry */ + if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) { + error = xfs_rmap_alloc_extent(rcur->bc_mp, dfops, + rcur->bc_private.a.agno, + agbno, aglen, XFS_RMAP_OWN_COW); + if (error) + return error; + } + + return error; +} + +/* + * Remove a CoW allocation from the refcount btree. + */ +STATIC int +__xfs_refcount_cow_free( + struct xfs_btree_cur *rcur, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + struct xfs_defer_ops *dfops) +{ + int error; + + trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno, + agbno, aglen); + + /* Remove refcount btree reservation */ + error = xfs_refcount_adjust_cow(rcur, agbno, aglen, + XFS_REFCOUNT_ADJUST_COW_FREE, dfops); + if (error) + return error; + + /* Remove rmap entry */ + if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) { + error = xfs_rmap_free_extent(rcur->bc_mp, dfops, + rcur->bc_private.a.agno, + agbno, aglen, XFS_RMAP_OWN_COW); + if (error) + return error; + } + + return error; +} + +/* Record a CoW staging extent in the refcount btree. */ +int +xfs_refcount_alloc_cow_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + xfs_fsblock_t fsb, + xfs_extlen_t len) +{ + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW, + fsb, len); +} + +/* Forget a CoW staging event in the refcount btree. */ +int +xfs_refcount_free_cow_extent( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + xfs_fsblock_t fsb, + xfs_extlen_t len) +{ + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_FREE_COW, + fsb, len); +} + +struct xfs_refcount_recovery { + struct list_head rr_list; + struct xfs_refcount_irec rr_rrec; +}; + +/* Stuff an extent on the recovery list. */ +STATIC int +xfs_refcount_recover_extent( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + void *priv) +{ + struct list_head *debris = priv; + struct xfs_refcount_recovery *rr; + + if (be32_to_cpu(rec->refc.rc_refcount) != 1) + return -EFSCORRUPTED; + + rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP); + xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); + list_add_tail(&rr->rr_list, debris); + + return 0; +} + +/* Find and remove leftover CoW reservations. */ +int +xfs_refcount_recover_cow_leftovers( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_trans *tp; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + struct xfs_refcount_recovery *rr, *n; + struct list_head debris; + union xfs_btree_irec low; + union xfs_btree_irec high; + struct xfs_defer_ops dfops; + xfs_fsblock_t fsb; + xfs_agblock_t agbno; + int error; + + if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START) + return -EOPNOTSUPP; + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error) + return error; + cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + + /* Find all the leftover CoW staging extents. */ + INIT_LIST_HEAD(&debris); + memset(&low, 0, sizeof(low)); + memset(&high, 0, sizeof(high)); + low.rc.rc_startblock = XFS_REFC_COW_START; + high.rc.rc_startblock = -1U; + error = xfs_btree_query_range(cur, &low, &high, + xfs_refcount_recover_extent, &debris); + if (error) + goto out_cursor; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); + + /* Now iterate the list to free the leftovers */ + list_for_each_entry(rr, &debris, rr_list) { + /* Set up transaction. */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); + if (error) + goto out_free; + + trace_xfs_refcount_recover_extent(mp, agno, &rr->rr_rrec); + + /* Free the orphan record */ + xfs_defer_init(&dfops, &fsb); + agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START; + fsb = XFS_AGB_TO_FSB(mp, agno, agbno); + error = xfs_refcount_free_cow_extent(mp, &dfops, fsb, + rr->rr_rrec.rc_blockcount); + if (error) + goto out_defer; + + /* Free the block. */ + xfs_bmap_add_free(mp, &dfops, fsb, + rr->rr_rrec.rc_blockcount, NULL); + + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) + goto out_defer; + + error = xfs_trans_commit(tp); + if (error) + goto out_free; + } + +out_free: + /* Free the leftover list */ + list_for_each_entry_safe(rr, n, &debris, rr_list) { + list_del(&rr->rr_list); + kmem_free(rr); + } + return error; + +out_cursor: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + xfs_buf_relse(agbp); + goto out_free; + +out_defer: + xfs_defer_cancel(&dfops); + xfs_trans_cancel(tp); + goto out_free; +} diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h new file mode 100644 index 000000000000..098dc668ab2c --- /dev/null +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_REFCOUNT_H__ +#define __XFS_REFCOUNT_H__ + +extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, + xfs_agblock_t bno, int *stat); +extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur, + xfs_agblock_t bno, int *stat); +extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec, int *stat); + +enum xfs_refcount_intent_type { + XFS_REFCOUNT_INCREASE = 1, + XFS_REFCOUNT_DECREASE, + XFS_REFCOUNT_ALLOC_COW, + XFS_REFCOUNT_FREE_COW, +}; + +struct xfs_refcount_intent { + struct list_head ri_list; + enum xfs_refcount_intent_type ri_type; + xfs_fsblock_t ri_startblock; + xfs_extlen_t ri_blockcount; +}; + +extern int xfs_refcount_increase_extent(struct xfs_mount *mp, + struct xfs_defer_ops *dfops, struct xfs_bmbt_irec *irec); +extern int xfs_refcount_decrease_extent(struct xfs_mount *mp, + struct xfs_defer_ops *dfops, struct xfs_bmbt_irec *irec); + +extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp, + struct xfs_btree_cur *rcur, int error); +extern int xfs_refcount_finish_one(struct xfs_trans *tp, + struct xfs_defer_ops *dfops, enum xfs_refcount_intent_type type, + xfs_fsblock_t startblock, xfs_extlen_t blockcount, + xfs_fsblock_t *new_fsb, xfs_extlen_t *new_len, + struct xfs_btree_cur **pcur); + +extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur, + xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, + xfs_extlen_t *flen, bool find_end_of_shared); + +extern int xfs_refcount_alloc_cow_extent(struct xfs_mount *mp, + struct xfs_defer_ops *dfops, xfs_fsblock_t fsb, + xfs_extlen_t len); +extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp, + struct xfs_defer_ops *dfops, xfs_fsblock_t fsb, + xfs_extlen_t len); +extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp, + xfs_agnumber_t agno); + +#endif /* __XFS_REFCOUNT_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c new file mode 100644 index 000000000000..453bb2757ec2 --- /dev/null +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -0,0 +1,451 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_bmap.h" +#include "xfs_refcount_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_bit.h" +#include "xfs_rmap.h" + +static struct xfs_btree_cur * +xfs_refcountbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_private.a.dfops); +} + +STATIC void +xfs_refcountbt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int inc) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); + + ASSERT(ptr->s != 0); + + agf->agf_refcount_root = ptr->s; + be32_add_cpu(&agf->agf_refcount_level, inc); + pag->pagf_refcount_level += inc; + xfs_perag_put(pag); + + xfs_alloc_log_agf(cur->bc_tp, agbp, + XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL); +} + +STATIC int +xfs_refcountbt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_alloc_arg args; /* block allocation args */ + int error; /* error return value */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + memset(&args, 0, sizeof(args)); + args.tp = cur->bc_tp; + args.mp = cur->bc_mp; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, + xfs_refc_block(args.mp)); + args.firstblock = args.fsbno; + xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_REFC); + args.minlen = args.maxlen = args.prod = 1; + args.resv = XFS_AG_RESV_METADATA; + + error = xfs_alloc_vextent(&args); + if (error) + goto out_error; + trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, + args.agbno, 1); + if (args.fsbno == NULLFSBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + ASSERT(args.agno == cur->bc_private.a.agno); + ASSERT(args.len == 1); + + new->s = cpu_to_be32(args.agbno); + be32_add_cpu(&agf->agf_refcount_blocks, 1); + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out_error: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +STATIC int +xfs_refcountbt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); + struct xfs_owner_info oinfo; + int error; + + trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno, + XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC); + be32_add_cpu(&agf->agf_refcount_blocks, -1); + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); + error = xfs_free_extent(cur->bc_tp, fsbno, 1, &oinfo, + XFS_AG_RESV_METADATA); + if (error) + return error; + + return error; +} + +STATIC int +xfs_refcountbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_refc_mnr[level != 0]; +} + +STATIC int +xfs_refcountbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_mp->m_refc_mxr[level != 0]; +} + +STATIC void +xfs_refcountbt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->refc.rc_startblock = rec->refc.rc_startblock; +} + +STATIC void +xfs_refcountbt_init_high_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + __u32 x; + + x = be32_to_cpu(rec->refc.rc_startblock); + x += be32_to_cpu(rec->refc.rc_blockcount) - 1; + key->refc.rc_startblock = cpu_to_be32(x); +} + +STATIC void +xfs_refcountbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + rec->refc.rc_startblock = cpu_to_be32(cur->bc_rec.rc.rc_startblock); + rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount); + rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount); +} + +STATIC void +xfs_refcountbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(agf->agf_refcount_root != 0); + + ptr->s = agf->agf_refcount_root; +} + +STATIC __int64_t +xfs_refcountbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + struct xfs_refcount_irec *rec = &cur->bc_rec.rc; + struct xfs_refcount_key *kp = &key->refc; + + return (__int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock; +} + +STATIC __int64_t +xfs_refcountbt_diff_two_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return (__int64_t)be32_to_cpu(k1->refc.rc_startblock) - + be32_to_cpu(k2->refc.rc_startblock); +} + +STATIC bool +xfs_refcountbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + unsigned int level; + + if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC)) + return false; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return false; + if (!xfs_btree_sblock_v5hdr_verify(bp)) + return false; + + level = be16_to_cpu(block->bb_level); + if (pag && pag->pagf_init) { + if (level >= pag->pagf_refcount_level) + return false; + } else if (level >= mp->m_refc_maxlevels) + return false; + + return xfs_btree_sblock_verify(bp, mp->m_refc_mxr[level != 0]); +} + +STATIC void +xfs_refcountbt_read_verify( + struct xfs_buf *bp) +{ + if (!xfs_btree_sblock_verify_crc(bp)) + xfs_buf_ioerror(bp, -EFSBADCRC); + else if (!xfs_refcountbt_verify(bp)) + xfs_buf_ioerror(bp, -EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); + } +} + +STATIC void +xfs_refcountbt_write_verify( + struct xfs_buf *bp) +{ + if (!xfs_refcountbt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + xfs_btree_sblock_calc_crc(bp); + +} + +const struct xfs_buf_ops xfs_refcountbt_buf_ops = { + .name = "xfs_refcountbt", + .verify_read = xfs_refcountbt_read_verify, + .verify_write = xfs_refcountbt_write_verify, +}; + +#if defined(DEBUG) || defined(XFS_WARN) +STATIC int +xfs_refcountbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return be32_to_cpu(k1->refc.rc_startblock) < + be32_to_cpu(k2->refc.rc_startblock); +} + +STATIC int +xfs_refcountbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + return be32_to_cpu(r1->refc.rc_startblock) + + be32_to_cpu(r1->refc.rc_blockcount) <= + be32_to_cpu(r2->refc.rc_startblock); +} +#endif + +static const struct xfs_btree_ops xfs_refcountbt_ops = { + .rec_len = sizeof(struct xfs_refcount_rec), + .key_len = sizeof(struct xfs_refcount_key), + + .dup_cursor = xfs_refcountbt_dup_cursor, + .set_root = xfs_refcountbt_set_root, + .alloc_block = xfs_refcountbt_alloc_block, + .free_block = xfs_refcountbt_free_block, + .get_minrecs = xfs_refcountbt_get_minrecs, + .get_maxrecs = xfs_refcountbt_get_maxrecs, + .init_key_from_rec = xfs_refcountbt_init_key_from_rec, + .init_high_key_from_rec = xfs_refcountbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_refcountbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_refcountbt_init_ptr_from_cur, + .key_diff = xfs_refcountbt_key_diff, + .buf_ops = &xfs_refcountbt_buf_ops, + .diff_two_keys = xfs_refcountbt_diff_two_keys, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_refcountbt_keys_inorder, + .recs_inorder = xfs_refcountbt_recs_inorder, +#endif +}; + +/* + * Allocate a new refcount btree cursor. + */ +struct xfs_btree_cur * +xfs_refcountbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agnumber_t agno, + struct xfs_defer_ops *dfops) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_btree_cur *cur; + + ASSERT(agno != NULLAGNUMBER); + ASSERT(agno < mp->m_sb.sb_agcount); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); + + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_btnum = XFS_BTNUM_REFC; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_ops = &xfs_refcountbt_ops; + + cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); + + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + cur->bc_private.a.dfops = dfops; + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + + cur->bc_private.a.priv.refc.nr_ops = 0; + cur->bc_private.a.priv.refc.shape_changes = 0; + + return cur; +} + +/* + * Calculate the number of records in a refcount btree block. + */ +int +xfs_refcountbt_maxrecs( + struct xfs_mount *mp, + int blocklen, + bool leaf) +{ + blocklen -= XFS_REFCOUNT_BLOCK_LEN; + + if (leaf) + return blocklen / sizeof(struct xfs_refcount_rec); + return blocklen / (sizeof(struct xfs_refcount_key) + + sizeof(xfs_refcount_ptr_t)); +} + +/* Compute the maximum height of a refcount btree. */ +void +xfs_refcountbt_compute_maxlevels( + struct xfs_mount *mp) +{ + mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(mp, + mp->m_refc_mnr, mp->m_sb.sb_agblocks); +} + +/* Calculate the refcount btree size for some records. */ +xfs_extlen_t +xfs_refcountbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp, mp->m_refc_mnr, len); +} + +/* + * Calculate the maximum refcount btree size. + */ +xfs_extlen_t +xfs_refcountbt_max_size( + struct xfs_mount *mp) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_refc_mxr[0] == 0) + return 0; + + return xfs_refcountbt_calc_size(mp, mp->m_sb.sb_agblocks); +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. + */ +int +xfs_refcountbt_calc_reserves( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_extlen_t *ask, + xfs_extlen_t *used) +{ + struct xfs_buf *agbp; + struct xfs_agf *agf; + xfs_extlen_t tree_len; + int error; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + *ask += xfs_refcountbt_max_size(mp); + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error) + return error; + + agf = XFS_BUF_TO_AGF(agbp); + tree_len = be32_to_cpu(agf->agf_refcount_blocks); + xfs_buf_relse(agbp); + + *used += tree_len; + + return error; +} diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h new file mode 100644 index 000000000000..3be7768bd51a --- /dev/null +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_REFCOUNT_BTREE_H__ +#define __XFS_REFCOUNT_BTREE_H__ + +/* + * Reference Count Btree on-disk structures + */ + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; + +/* + * Btree block header size + */ +#define XFS_REFCOUNT_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN + +/* + * Record, key, and pointer address macros for btree blocks. + * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define XFS_REFCOUNT_REC_ADDR(block, index) \ + ((struct xfs_refcount_rec *) \ + ((char *)(block) + \ + XFS_REFCOUNT_BLOCK_LEN + \ + (((index) - 1) * sizeof(struct xfs_refcount_rec)))) + +#define XFS_REFCOUNT_KEY_ADDR(block, index) \ + ((struct xfs_refcount_key *) \ + ((char *)(block) + \ + XFS_REFCOUNT_BLOCK_LEN + \ + ((index) - 1) * sizeof(struct xfs_refcount_key))) + +#define XFS_REFCOUNT_PTR_ADDR(block, index, maxrecs) \ + ((xfs_refcount_ptr_t *) \ + ((char *)(block) + \ + XFS_REFCOUNT_BLOCK_LEN + \ + (maxrecs) * sizeof(struct xfs_refcount_key) + \ + ((index) - 1) * sizeof(xfs_refcount_ptr_t))) + +extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp, + struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno, + struct xfs_defer_ops *dfops); +extern int xfs_refcountbt_maxrecs(struct xfs_mount *mp, int blocklen, + bool leaf); +extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); + +extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp, + unsigned long long len); +extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp); + +extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp, + xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); + +#endif /* __XFS_REFCOUNT_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 73d05407d663..3a8cc7139912 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -148,6 +148,37 @@ done: return error; } +STATIC int +xfs_rmap_delete( + struct xfs_btree_cur *rcur, + xfs_agblock_t agbno, + xfs_extlen_t len, + uint64_t owner, + uint64_t offset, + unsigned int flags) +{ + int i; + int error; + + trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_private.a.agno, agbno, + len, owner, offset, flags); + + error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done); + + error = xfs_btree_delete(rcur, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done); +done: + if (error) + trace_xfs_rmap_delete_error(rcur->bc_mp, + rcur->bc_private.a.agno, error, _RET_IP_); + return error; +} + static int xfs_rmap_btrec_to_irec( union xfs_btree_rec *rec, @@ -180,6 +211,160 @@ xfs_rmap_get_rec( return xfs_rmap_btrec_to_irec(rec, irec); } +struct xfs_find_left_neighbor_info { + struct xfs_rmap_irec high; + struct xfs_rmap_irec *irec; + int *stat; +}; + +/* For each rmap given, figure out if it matches the key we want. */ +STATIC int +xfs_rmap_find_left_neighbor_helper( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_find_left_neighbor_info *info = priv; + + trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp, + cur->bc_private.a.agno, rec->rm_startblock, + rec->rm_blockcount, rec->rm_owner, rec->rm_offset, + rec->rm_flags); + + if (rec->rm_owner != info->high.rm_owner) + return XFS_BTREE_QUERY_RANGE_CONTINUE; + if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && + !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && + rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset) + return XFS_BTREE_QUERY_RANGE_CONTINUE; + + *info->irec = *rec; + *info->stat = 1; + return XFS_BTREE_QUERY_RANGE_ABORT; +} + +/* + * Find the record to the left of the given extent, being careful only to + * return a match with the same owner and adjacent physical and logical + * block ranges. + */ +int +xfs_rmap_find_left_neighbor( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + uint64_t owner, + uint64_t offset, + unsigned int flags, + struct xfs_rmap_irec *irec, + int *stat) +{ + struct xfs_find_left_neighbor_info info; + int error; + + *stat = 0; + if (bno == 0) + return 0; + info.high.rm_startblock = bno - 1; + info.high.rm_owner = owner; + if (!XFS_RMAP_NON_INODE_OWNER(owner) && + !(flags & XFS_RMAP_BMBT_BLOCK)) { + if (offset == 0) + return 0; + info.high.rm_offset = offset - 1; + } else + info.high.rm_offset = 0; + info.high.rm_flags = flags; + info.high.rm_blockcount = 0; + info.irec = irec; + info.stat = stat; + + trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp, + cur->bc_private.a.agno, bno, 0, owner, offset, flags); + + error = xfs_rmap_query_range(cur, &info.high, &info.high, + xfs_rmap_find_left_neighbor_helper, &info); + if (error == XFS_BTREE_QUERY_RANGE_ABORT) + error = 0; + if (*stat) + trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp, + cur->bc_private.a.agno, irec->rm_startblock, + irec->rm_blockcount, irec->rm_owner, + irec->rm_offset, irec->rm_flags); + return error; +} + +/* For each rmap given, figure out if it matches the key we want. */ +STATIC int +xfs_rmap_lookup_le_range_helper( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_find_left_neighbor_info *info = priv; + + trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp, + cur->bc_private.a.agno, rec->rm_startblock, + rec->rm_blockcount, rec->rm_owner, rec->rm_offset, + rec->rm_flags); + + if (rec->rm_owner != info->high.rm_owner) + return XFS_BTREE_QUERY_RANGE_CONTINUE; + if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) && + !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && + (rec->rm_offset > info->high.rm_offset || + rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset)) + return XFS_BTREE_QUERY_RANGE_CONTINUE; + + *info->irec = *rec; + *info->stat = 1; + return XFS_BTREE_QUERY_RANGE_ABORT; +} + +/* + * Find the record to the left of the given extent, being careful only to + * return a match with the same owner and overlapping physical and logical + * block ranges. This is the overlapping-interval version of + * xfs_rmap_lookup_le. + */ +int +xfs_rmap_lookup_le_range( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + uint64_t owner, + uint64_t offset, + unsigned int flags, + struct xfs_rmap_irec *irec, + int *stat) +{ + struct xfs_find_left_neighbor_info info; + int error; + + info.high.rm_startblock = bno; + info.high.rm_owner = owner; + if (!XFS_RMAP_NON_INODE_OWNER(owner) && !(flags & XFS_RMAP_BMBT_BLOCK)) + info.high.rm_offset = offset; + else + info.high.rm_offset = 0; + info.high.rm_flags = flags; + info.high.rm_blockcount = 0; + *stat = 0; + info.irec = irec; + info.stat = stat; + + trace_xfs_rmap_lookup_le_range(cur->bc_mp, + cur->bc_private.a.agno, bno, 0, owner, offset, flags); + error = xfs_rmap_query_range(cur, &info.high, &info.high, + xfs_rmap_lookup_le_range_helper, &info); + if (error == XFS_BTREE_QUERY_RANGE_ABORT) + error = 0; + if (*stat) + trace_xfs_rmap_lookup_le_range_result(cur->bc_mp, + cur->bc_private.a.agno, irec->rm_startblock, + irec->rm_blockcount, irec->rm_owner, + irec->rm_offset, irec->rm_flags); + return error; +} + /* * Find the extent in the rmap btree and remove it. * @@ -1093,11 +1278,704 @@ done: return error; } +/* + * Convert an unwritten extent to a real extent or vice versa. If there is no + * possibility of overlapping extents, delegate to the simpler convert + * function. + */ +STATIC int +xfs_rmap_convert_shared( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec r[4]; /* neighbor extent entries */ + /* left is 0, right is 1, prev is 2 */ + /* new is 3 */ + uint64_t owner; + uint64_t offset; + uint64_t new_endoff; + unsigned int oldext; + unsigned int newext; + unsigned int flags = 0; + int i; + int state = 0; + int error; + + xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); + ASSERT(!(XFS_RMAP_NON_INODE_OWNER(owner) || + (flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))); + oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0; + new_endoff = offset + len; + trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); + + /* + * For the initial lookup, look for and exact match or the left-adjacent + * record for our insertion point. This will also give us the record for + * start block contiguity tests. + */ + error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags, + &PREV, &i); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + + ASSERT(PREV.rm_offset <= offset); + ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff); + ASSERT((PREV.rm_flags & XFS_RMAP_UNWRITTEN) == oldext); + newext = ~oldext & XFS_RMAP_UNWRITTEN; + + /* + * Set flags determining what part of the previous oldext allocation + * extent is being replaced by a newext allocation. + */ + if (PREV.rm_offset == offset) + state |= RMAP_LEFT_FILLING; + if (PREV.rm_offset + PREV.rm_blockcount == new_endoff) + state |= RMAP_RIGHT_FILLING; + + /* Is there a left record that abuts our range? */ + error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, newext, + &LEFT, &i); + if (error) + goto done; + if (i) { + state |= RMAP_LEFT_VALID; + XFS_WANT_CORRUPTED_GOTO(mp, + LEFT.rm_startblock + LEFT.rm_blockcount <= bno, + done); + if (xfs_rmap_is_mergeable(&LEFT, owner, newext)) + state |= RMAP_LEFT_CONTIG; + } + + /* Is there a right record that abuts our range? */ + error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len, + newext, &i); + if (error) + goto done; + if (i) { + state |= RMAP_RIGHT_VALID; + error = xfs_rmap_get_rec(cur, &RIGHT, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock, + done); + trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, + cur->bc_private.a.agno, RIGHT.rm_startblock, + RIGHT.rm_blockcount, RIGHT.rm_owner, + RIGHT.rm_offset, RIGHT.rm_flags); + if (xfs_rmap_is_mergeable(&RIGHT, owner, newext)) + state |= RMAP_RIGHT_CONTIG; + } + + /* check that left + prev + right is not too long */ + if ((state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) == + (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG) && + (unsigned long)LEFT.rm_blockcount + len + + RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX) + state &= ~RMAP_RIGHT_CONTIG; + + trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state, + _RET_IP_); + /* + * Switch out based on the FILLING and CONTIG state bits. + */ + switch (state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) { + case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | + RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The left and right neighbors are both contiguous with new. + */ + error = xfs_rmap_delete(cur, RIGHT.rm_startblock, + RIGHT.rm_blockcount, RIGHT.rm_owner, + RIGHT.rm_offset, RIGHT.rm_flags); + if (error) + goto done; + error = xfs_rmap_delete(cur, PREV.rm_startblock, + PREV.rm_blockcount, PREV.rm_owner, + PREV.rm_offset, PREV.rm_flags); + if (error) + goto done; + NEW = LEFT; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The left neighbor is contiguous, the right is not. + */ + error = xfs_rmap_delete(cur, PREV.rm_startblock, + PREV.rm_blockcount, PREV.rm_owner, + PREV.rm_offset, PREV.rm_flags); + if (error) + goto done; + NEW = LEFT; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount += PREV.rm_blockcount; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: + /* + * Setting all of a previous oldext extent to newext. + * The right neighbor is contiguous, the left is not. + */ + error = xfs_rmap_delete(cur, RIGHT.rm_startblock, + RIGHT.rm_blockcount, RIGHT.rm_owner, + RIGHT.rm_offset, RIGHT.rm_flags); + if (error) + goto done; + NEW = PREV; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount += RIGHT.rm_blockcount; + NEW.rm_flags = RIGHT.rm_flags; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING: + /* + * Setting all of a previous oldext extent to newext. + * Neither the left nor right neighbors are contiguous with + * the new one. + */ + NEW = PREV; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_flags = newext; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG: + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is contiguous. + */ + NEW = PREV; + error = xfs_rmap_delete(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + NEW.rm_offset += len; + NEW.rm_startblock += len; + NEW.rm_blockcount -= len; + error = xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + NEW = LEFT; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount += len; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING: + /* + * Setting the first part of a previous oldext extent to newext. + * The left neighbor is not contiguous. + */ + NEW = PREV; + error = xfs_rmap_delete(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + NEW.rm_offset += len; + NEW.rm_startblock += len; + NEW.rm_blockcount -= len; + error = xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + error = xfs_rmap_insert(cur, bno, len, owner, offset, newext); + if (error) + goto done; + break; + + case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG: + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is contiguous with the new allocation. + */ + NEW = PREV; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount = offset - NEW.rm_offset; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + NEW = RIGHT; + error = xfs_rmap_delete(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + NEW.rm_offset = offset; + NEW.rm_startblock = bno; + NEW.rm_blockcount += len; + error = xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags); + if (error) + goto done; + break; + + case RMAP_RIGHT_FILLING: + /* + * Setting the last part of a previous oldext extent to newext. + * The right neighbor is not contiguous. + */ + NEW = PREV; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount -= len; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + error = xfs_rmap_insert(cur, bno, len, owner, offset, newext); + if (error) + goto done; + break; + + case 0: + /* + * Setting the middle part of a previous oldext extent to + * newext. Contiguity is impossible here. + * One extent becomes three extents. + */ + /* new right extent - oldext */ + NEW.rm_startblock = bno + len; + NEW.rm_owner = owner; + NEW.rm_offset = new_endoff; + NEW.rm_blockcount = PREV.rm_offset + PREV.rm_blockcount - + new_endoff; + NEW.rm_flags = PREV.rm_flags; + error = xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset, + NEW.rm_flags); + if (error) + goto done; + /* new left extent - oldext */ + NEW = PREV; + error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, + NEW.rm_offset, NEW.rm_flags, &i); + if (error) + goto done; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); + NEW.rm_blockcount = offset - NEW.rm_offset; + error = xfs_rmap_update(cur, &NEW); + if (error) + goto done; + /* new middle extent - newext */ + NEW.rm_startblock = bno; + NEW.rm_blockcount = len; + NEW.rm_owner = owner; + NEW.rm_offset = offset; + NEW.rm_flags = newext; + error = xfs_rmap_insert(cur, NEW.rm_startblock, + NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset, + NEW.rm_flags); + if (error) + goto done; + break; + + case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: + case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: + case RMAP_LEFT_FILLING | RMAP_RIGHT_CONTIG: + case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG: + case RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG: + case RMAP_LEFT_CONTIG: + case RMAP_RIGHT_CONTIG: + /* + * These cases are all impossible. + */ + ASSERT(0); + } + + trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); +done: + if (error) + trace_xfs_rmap_convert_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + #undef NEW #undef LEFT #undef RIGHT #undef PREV +/* + * Find an extent in the rmap btree and unmap it. For rmap extent types that + * can overlap (data fork rmaps on reflink filesystems) we must be careful + * that the prev/next records in the btree might belong to another owner. + * Therefore we must use delete+insert to alter any of the key fields. + * + * For every other situation there can only be one owner for a given extent, + * so we can call the regular _free function. + */ +STATIC int +xfs_rmap_unmap_shared( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec ltrec; + uint64_t ltoff; + int error = 0; + int i; + uint64_t owner; + uint64_t offset; + unsigned int flags; + + xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); + if (unwritten) + flags |= XFS_RMAP_UNWRITTEN; + trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); + + /* + * We should always have a left record because there's a static record + * for the AG headers at rm_startblock == 0 created by mkfs/growfs that + * will not ever be removed from the tree. + */ + error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags, + <rec, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + ltoff = ltrec.rm_offset; + + /* Make sure the extent we found covers the entire freeing range. */ + XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno && + ltrec.rm_startblock + ltrec.rm_blockcount >= + bno + len, out_error); + + /* Make sure the owner matches what we expect to find in the tree. */ + XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner, out_error); + + /* Make sure the unwritten flag matches. */ + XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) == + (ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error); + + /* Check the offset. */ + XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_offset <= offset, out_error); + XFS_WANT_CORRUPTED_GOTO(mp, offset <= ltoff + ltrec.rm_blockcount, + out_error); + + if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) { + /* Exact match, simply remove the record from rmap tree. */ + error = xfs_rmap_delete(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags); + if (error) + goto out_error; + } else if (ltrec.rm_startblock == bno) { + /* + * Overlap left hand side of extent: move the start, trim the + * length and update the current record. + * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrrrrrrr| + * bno len + */ + + /* Delete prev rmap. */ + error = xfs_rmap_delete(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags); + if (error) + goto out_error; + + /* Add an rmap at the new offset. */ + ltrec.rm_startblock += len; + ltrec.rm_blockcount -= len; + ltrec.rm_offset += len; + error = xfs_rmap_insert(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags); + if (error) + goto out_error; + } else if (ltrec.rm_startblock + ltrec.rm_blockcount == bno + len) { + /* + * Overlap right hand side of extent: trim the length and + * update the current record. + * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrrrrrrr| + * bno len + */ + error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + ltrec.rm_blockcount -= len; + error = xfs_rmap_update(cur, <rec); + if (error) + goto out_error; + } else { + /* + * Overlap middle of extent: trim the length of the existing + * record to the length of the new left-extent size, increment + * the insertion position so we can insert a new record + * containing the remaining right-extent space. + * + * ltbno ltlen + * Orig: |oooooooooooooooooooo| + * Freeing: |fffffffff| + * Result: |rrrrr| |rrrr| + * bno len + */ + xfs_extlen_t orig_len = ltrec.rm_blockcount; + + /* Shrink the left side of the rmap */ + error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + ltrec.rm_blockcount = bno - ltrec.rm_startblock; + error = xfs_rmap_update(cur, <rec); + if (error) + goto out_error; + + /* Add an rmap at the new offset */ + error = xfs_rmap_insert(cur, bno + len, + orig_len - len - ltrec.rm_blockcount, + ltrec.rm_owner, offset + len, + ltrec.rm_flags); + if (error) + goto out_error; + } + + trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); +out_error: + if (error) + trace_xfs_rmap_unmap_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + +/* + * Find an extent in the rmap btree and map it. For rmap extent types that + * can overlap (data fork rmaps on reflink filesystems) we must be careful + * that the prev/next records in the btree might belong to another owner. + * Therefore we must use delete+insert to alter any of the key fields. + * + * For every other situation there can only be one owner for a given extent, + * so we can call the regular _alloc function. + */ +STATIC int +xfs_rmap_map_shared( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + bool unwritten, + struct xfs_owner_info *oinfo) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_rmap_irec ltrec; + struct xfs_rmap_irec gtrec; + int have_gt; + int have_lt; + int error = 0; + int i; + uint64_t owner; + uint64_t offset; + unsigned int flags = 0; + + xfs_owner_info_unpack(oinfo, &owner, &offset, &flags); + if (unwritten) + flags |= XFS_RMAP_UNWRITTEN; + trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); + + /* Is there a left record that abuts our range? */ + error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, flags, + <rec, &have_lt); + if (error) + goto out_error; + if (have_lt && + !xfs_rmap_is_mergeable(<rec, owner, flags)) + have_lt = 0; + + /* Is there a right record that abuts our range? */ + error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len, + flags, &have_gt); + if (error) + goto out_error; + if (have_gt) { + error = xfs_rmap_get_rec(cur, >rec, &have_gt); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error); + trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp, + cur->bc_private.a.agno, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); + + if (!xfs_rmap_is_mergeable(>rec, owner, flags)) + have_gt = 0; + } + + if (have_lt && + ltrec.rm_startblock + ltrec.rm_blockcount == bno && + ltrec.rm_offset + ltrec.rm_blockcount == offset) { + /* + * Left edge contiguous, merge into left record. + * + * ltbno ltlen + * orig: |ooooooooo| + * adding: |aaaaaaaaa| + * result: |rrrrrrrrrrrrrrrrrrr| + * bno len + */ + ltrec.rm_blockcount += len; + if (have_gt && + bno + len == gtrec.rm_startblock && + offset + len == gtrec.rm_offset) { + /* + * Right edge also contiguous, delete right record + * and merge into left record. + * + * ltbno ltlen gtbno gtlen + * orig: |ooooooooo| |ooooooooo| + * adding: |aaaaaaaaa| + * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr| + */ + ltrec.rm_blockcount += gtrec.rm_blockcount; + error = xfs_rmap_delete(cur, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); + if (error) + goto out_error; + } + + /* Point the cursor back to the left record and update. */ + error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock, + ltrec.rm_blockcount, ltrec.rm_owner, + ltrec.rm_offset, ltrec.rm_flags, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error); + + error = xfs_rmap_update(cur, <rec); + if (error) + goto out_error; + } else if (have_gt && + bno + len == gtrec.rm_startblock && + offset + len == gtrec.rm_offset) { + /* + * Right edge contiguous, merge into right record. + * + * gtbno gtlen + * Orig: |ooooooooo| + * adding: |aaaaaaaaa| + * Result: |rrrrrrrrrrrrrrrrrrr| + * bno len + */ + /* Delete the old record. */ + error = xfs_rmap_delete(cur, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); + if (error) + goto out_error; + + /* Move the start and re-add it. */ + gtrec.rm_startblock = bno; + gtrec.rm_blockcount += len; + gtrec.rm_offset = offset; + error = xfs_rmap_insert(cur, gtrec.rm_startblock, + gtrec.rm_blockcount, gtrec.rm_owner, + gtrec.rm_offset, gtrec.rm_flags); + if (error) + goto out_error; + } else { + /* + * No contiguous edge with identical owner, insert + * new record at current cursor position. + */ + error = xfs_rmap_insert(cur, bno, len, owner, offset, flags); + if (error) + goto out_error; + } + + trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len, + unwritten, oinfo); +out_error: + if (error) + trace_xfs_rmap_map_error(cur->bc_mp, + cur->bc_private.a.agno, error, _RET_IP_); + return error; +} + struct xfs_rmap_query_range_info { xfs_rmap_query_range_fn fn; void *priv; @@ -1237,15 +2115,27 @@ xfs_rmap_finish_one( case XFS_RMAP_MAP: error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo); break; + case XFS_RMAP_MAP_SHARED: + error = xfs_rmap_map_shared(rcur, bno, blockcount, unwritten, + &oinfo); + break; case XFS_RMAP_FREE: case XFS_RMAP_UNMAP: error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten, &oinfo); break; + case XFS_RMAP_UNMAP_SHARED: + error = xfs_rmap_unmap_shared(rcur, bno, blockcount, unwritten, + &oinfo); + break; case XFS_RMAP_CONVERT: error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten, &oinfo); break; + case XFS_RMAP_CONVERT_SHARED: + error = xfs_rmap_convert_shared(rcur, bno, blockcount, + !unwritten, &oinfo); + break; default: ASSERT(0); error = -EFSCORRUPTED; @@ -1263,9 +2153,10 @@ out_cur: */ static bool xfs_rmap_update_is_needed( - struct xfs_mount *mp) + struct xfs_mount *mp, + int whichfork) { - return xfs_sb_version_hasrmapbt(&mp->m_sb); + return xfs_sb_version_hasrmapbt(&mp->m_sb) && whichfork != XFS_COW_FORK; } /* @@ -1311,10 +2202,11 @@ xfs_rmap_map_extent( int whichfork, struct xfs_bmbt_irec *PREV) { - if (!xfs_rmap_update_is_needed(mp)) + if (!xfs_rmap_update_is_needed(mp, whichfork)) return 0; - return __xfs_rmap_add(mp, dfops, XFS_RMAP_MAP, ip->i_ino, + return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ? + XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino, whichfork, PREV); } @@ -1327,10 +2219,11 @@ xfs_rmap_unmap_extent( int whichfork, struct xfs_bmbt_irec *PREV) { - if (!xfs_rmap_update_is_needed(mp)) + if (!xfs_rmap_update_is_needed(mp, whichfork)) return 0; - return __xfs_rmap_add(mp, dfops, XFS_RMAP_UNMAP, ip->i_ino, + return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ? + XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino, whichfork, PREV); } @@ -1343,10 +2236,11 @@ xfs_rmap_convert_extent( int whichfork, struct xfs_bmbt_irec *PREV) { - if (!xfs_rmap_update_is_needed(mp)) + if (!xfs_rmap_update_is_needed(mp, whichfork)) return 0; - return __xfs_rmap_add(mp, dfops, XFS_RMAP_CONVERT, ip->i_ino, + return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ? + XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino, whichfork, PREV); } @@ -1362,7 +2256,7 @@ xfs_rmap_alloc_extent( { struct xfs_bmbt_irec bmap; - if (!xfs_rmap_update_is_needed(mp)) + if (!xfs_rmap_update_is_needed(mp, XFS_DATA_FORK)) return 0; bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno); @@ -1386,7 +2280,7 @@ xfs_rmap_free_extent( { struct xfs_bmbt_irec bmap; - if (!xfs_rmap_update_is_needed(mp)) + if (!xfs_rmap_update_is_needed(mp, XFS_DATA_FORK)) return 0; bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno); diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 71cf99a4acba..789930599339 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -206,4 +206,11 @@ int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type, xfs_fsblock_t startblock, xfs_filblks_t blockcount, xfs_exntst_t state, struct xfs_btree_cur **pcur); +int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno, + uint64_t owner, uint64_t offset, unsigned int flags, + struct xfs_rmap_irec *irec, int *stat); +int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, + uint64_t owner, uint64_t offset, unsigned int flags, + struct xfs_rmap_irec *irec, int *stat); + #endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 17b8eeb34ac8..83e672ff7577 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -35,6 +35,7 @@ #include "xfs_cksum.h" #include "xfs_error.h" #include "xfs_extent_busy.h" +#include "xfs_ag_resv.h" /* * Reverse map btree. @@ -512,6 +513,83 @@ void xfs_rmapbt_compute_maxlevels( struct xfs_mount *mp) { - mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp, - mp->m_rmap_mnr, mp->m_sb.sb_agblocks); + /* + * On a non-reflink filesystem, the maximum number of rmap + * records is the number of blocks in the AG, hence the max + * rmapbt height is log_$maxrecs($agblocks). However, with + * reflink each AG block can have up to 2^32 (per the refcount + * record format) owners, which means that theoretically we + * could face up to 2^64 rmap records. + * + * That effectively means that the max rmapbt height must be + * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG + * blocks to feed the rmapbt long before the rmapbt reaches + * maximum height. The reflink code uses ag_resv_critical to + * disallow reflinking when less than 10% of the per-AG metadata + * block reservation since the fallback is a regular file copy. + */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) + mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS; + else + mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp, + mp->m_rmap_mnr, mp->m_sb.sb_agblocks); +} + +/* Calculate the refcount btree size for some records. */ +xfs_extlen_t +xfs_rmapbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp, mp->m_rmap_mnr, len); +} + +/* + * Calculate the maximum refcount btree size. + */ +xfs_extlen_t +xfs_rmapbt_max_size( + struct xfs_mount *mp) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_rmap_mxr[0] == 0) + return 0; + + return xfs_rmapbt_calc_size(mp, mp->m_sb.sb_agblocks); +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. + */ +int +xfs_rmapbt_calc_reserves( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_extlen_t *ask, + xfs_extlen_t *used) +{ + struct xfs_buf *agbp; + struct xfs_agf *agf; + xfs_extlen_t pool_len; + xfs_extlen_t tree_len; + int error; + + if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + return 0; + + /* Reserve 1% of the AG or enough for 1 block per record. */ + pool_len = max(mp->m_sb.sb_agblocks / 100, xfs_rmapbt_max_size(mp)); + *ask += pool_len; + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error) + return error; + + agf = XFS_BUF_TO_AGF(agbp); + tree_len = be32_to_cpu(agf->agf_rmap_blocks); + xfs_buf_relse(agbp); + + *used += tree_len; + + return error; } diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index e73a55357dab..2a9ac472fb15 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -58,4 +58,11 @@ struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf); extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); +extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp, + unsigned long long len); +extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp); + +extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, + xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); + #endif /* __XFS_RMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 4aecc5fefe96..a70aec910626 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -38,6 +38,8 @@ #include "xfs_ialloc_btree.h" #include "xfs_log.h" #include "xfs_rmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_refcount_btree.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. @@ -737,6 +739,13 @@ xfs_sb_mount_common( mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2; mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2; + mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, + true); + mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize, + false); + mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2; + mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; + mp->m_bsize = XFS_FSB_TO_BB(mp, 1); mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, sbp->sb_inopblock); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 0c5b30bd884c..c6f4eb46fe26 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -39,6 +39,7 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops; extern const struct xfs_buf_ops xfs_agfl_buf_ops; extern const struct xfs_buf_ops xfs_allocbt_buf_ops; extern const struct xfs_buf_ops xfs_rmapbt_buf_ops; +extern const struct xfs_buf_ops xfs_refcountbt_buf_ops; extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; extern const struct xfs_buf_ops xfs_bmbt_buf_ops; @@ -122,6 +123,7 @@ int xfs_log_calc_minimum_size(struct xfs_mount *); #define XFS_INO_REF 2 #define XFS_ATTR_BTREE_REF 1 #define XFS_DQUOT_REF 1 +#define XFS_REFC_BTREE_REF 1 /* * Flags for xfs_trans_ichgtime(). diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 301ef2f4dbd6..b456cca1bfb2 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -67,13 +67,14 @@ xfs_calc_buf_res( * Per-extent log reservation for the btree changes involved in freeing or * allocating an extent. In classic XFS there were two trees that will be * modified (bnobt + cntbt). With rmap enabled, there are three trees - * (rmapbt). The number of blocks reserved is based on the formula: + * (rmapbt). With reflink, there are four trees (refcountbt). The number of + * blocks reserved is based on the formula: * * num trees * ((2 blocks/level * max depth) - 1) * * Keep in mind that max depth is calculated separately for each type of tree. */ -static uint +uint xfs_allocfree_log_count( struct xfs_mount *mp, uint num_ops) @@ -83,6 +84,8 @@ xfs_allocfree_log_count( blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1); if (xfs_sb_version_hasrmapbt(&mp->m_sb)) blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1); + if (xfs_sb_version_hasreflink(&mp->m_sb)) + blocks += num_ops * (2 * mp->m_refc_maxlevels - 1); return blocks; } @@ -809,11 +812,18 @@ xfs_trans_resv_calc( * require a permanent reservation on space. */ resp->tr_write.tr_logres = xfs_calc_write_reservation(mp); - resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + if (xfs_sb_version_hasreflink(&mp->m_sb)) + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; + else + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp); - resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + if (xfs_sb_version_hasreflink(&mp->m_sb)) + resp->tr_itruncate.tr_logcount = + XFS_ITRUNCATE_LOG_COUNT_REFLINK; + else + resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp); @@ -870,7 +880,10 @@ xfs_trans_resv_calc( resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp); - resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + if (xfs_sb_version_hasreflink(&mp->m_sb)) + resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; + else + resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; /* diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 0eb46ed6d404..b7e5357d060a 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -87,6 +87,7 @@ struct xfs_trans_resv { #define XFS_DEFAULT_LOG_COUNT 1 #define XFS_DEFAULT_PERM_LOG_COUNT 2 #define XFS_ITRUNCATE_LOG_COUNT 2 +#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8 #define XFS_INACTIVE_LOG_COUNT 2 #define XFS_CREATE_LOG_COUNT 2 #define XFS_CREATE_TMPFILE_LOG_COUNT 2 @@ -96,11 +97,13 @@ struct xfs_trans_resv { #define XFS_LINK_LOG_COUNT 2 #define XFS_RENAME_LOG_COUNT 2 #define XFS_WRITE_LOG_COUNT 2 +#define XFS_WRITE_LOG_COUNT_REFLINK 8 #define XFS_ADDAFORK_LOG_COUNT 2 #define XFS_ATTRINVAL_LOG_COUNT 1 #define XFS_ATTRSET_LOG_COUNT 3 #define XFS_ATTRRM_LOG_COUNT 3 void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); +uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops); #endif /* __XFS_TRANS_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 41e0428d8175..7917f6e44286 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -21,6 +21,8 @@ /* * Components of space reservations. */ +#define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \ + (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0])) #define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \ (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0])) #define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1) @@ -28,6 +30,13 @@ (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ XFS_EXTENTADD_SPACE_RES(mp,w)) +#define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\ + (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ + XFS_EXTENTADD_SPACE_RES(mp,w) + \ + ((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \ + (mp)->m_rmap_maxlevels) #define XFS_DAENTER_1B(mp,w) \ ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1) #define XFS_DAENTER_DBS(mp,w) \ diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 3d503647f26b..8d74870468c2 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -90,6 +90,7 @@ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ */ #define XFS_DATA_FORK 0 #define XFS_ATTR_FORK 1 +#define XFS_COW_FORK 2 /* * Min numbers of data/attr fork btree root pointers. @@ -109,7 +110,7 @@ typedef enum { typedef enum { XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi, - XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_MAX + XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX } xfs_btnum_t; struct xfs_name { diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4a28fa91e3b1..3e57a56cf829 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -31,6 +31,7 @@ #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" +#include "xfs_reflink.h" #include <linux/gfp.h> #include <linux/mpage.h> #include <linux/pagevec.h> @@ -39,6 +40,7 @@ /* flags for direct write completions */ #define XFS_DIO_FLAG_UNWRITTEN (1 << 0) #define XFS_DIO_FLAG_APPEND (1 << 1) +#define XFS_DIO_FLAG_COW (1 << 2) /* * structure owned by writepages passed to individual writepage calls @@ -287,6 +289,25 @@ xfs_end_io( error = -EIO; /* + * For a CoW extent, we need to move the mapping from the CoW fork + * to the data fork. If instead an error happened, just dump the + * new blocks. + */ + if (ioend->io_type == XFS_IO_COW) { + if (error) + goto done; + if (ioend->io_bio->bi_error) { + error = xfs_reflink_cancel_cow_range(ip, + ioend->io_offset, ioend->io_size); + goto done; + } + error = xfs_reflink_end_cow(ip, ioend->io_offset, + ioend->io_size); + if (error) + goto done; + } + + /* * For unwritten extents we need to issue transactions to convert a * range to normal written extens after the data I/O has finished. * Detecting and handling completion IO errors is done individually @@ -301,7 +322,8 @@ xfs_end_io( } else if (ioend->io_append_trans) { error = xfs_setfilesize_ioend(ioend, error); } else { - ASSERT(!xfs_ioend_is_append(ioend)); + ASSERT(!xfs_ioend_is_append(ioend) || + ioend->io_type == XFS_IO_COW); } done: @@ -315,7 +337,7 @@ xfs_end_bio( struct xfs_ioend *ioend = bio->bi_private; struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; - if (ioend->io_type == XFS_IO_UNWRITTEN) + if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW) queue_work(mp->m_unwritten_workqueue, &ioend->io_work); else if (ioend->io_append_trans) queue_work(mp->m_data_workqueue, &ioend->io_work); @@ -341,6 +363,7 @@ xfs_map_blocks( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; + ASSERT(type != XFS_IO_COW); if (type == XFS_IO_UNWRITTEN) bmapi_flags |= XFS_BMAPI_IGSTATE; @@ -355,6 +378,13 @@ xfs_map_blocks( offset_fsb = XFS_B_TO_FSBT(mp, offset); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, imap, &nimaps, bmapi_flags); + /* + * Truncate an overwrite extent if there's a pending CoW + * reservation before the end of this extent. This forces us + * to come back to writepage to take care of the CoW. + */ + if (nimaps && type == XFS_IO_OVERWRITE) + xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, imap); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (error) @@ -362,7 +392,8 @@ xfs_map_blocks( if (type == XFS_IO_DELALLOC && (!nimaps || isnullstartblock(imap->br_startblock))) { - error = xfs_iomap_write_allocate(ip, offset, imap); + error = xfs_iomap_write_allocate(ip, XFS_DATA_FORK, offset, + imap); if (!error) trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); return error; @@ -737,6 +768,56 @@ out_invalidate: return; } +static int +xfs_map_cow( + struct xfs_writepage_ctx *wpc, + struct inode *inode, + loff_t offset, + unsigned int *new_type) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_bmbt_irec imap; + bool is_cow = false, need_alloc = false; + int error; + + /* + * If we already have a valid COW mapping keep using it. + */ + if (wpc->io_type == XFS_IO_COW) { + wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); + if (wpc->imap_valid) { + *new_type = XFS_IO_COW; + return 0; + } + } + + /* + * Else we need to check if there is a COW mapping at this offset. + */ + xfs_ilock(ip, XFS_ILOCK_SHARED); + is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, &need_alloc); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!is_cow) + return 0; + + /* + * And if the COW mapping has a delayed extent here we need to + * allocate real space for it now. + */ + if (need_alloc) { + error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset, + &imap); + if (error) + return error; + } + + wpc->io_type = *new_type = XFS_IO_COW; + wpc->imap_valid = true; + wpc->imap = imap; + return 0; +} + /* * We implement an immediate ioend submission policy here to avoid needing to * chain multiple ioends and hence nest mempool allocations which can violate @@ -769,6 +850,7 @@ xfs_writepage_map( int error = 0; int count = 0; int uptodate = 1; + unsigned int new_type; bh = head = page_buffers(page); offset = page_offset(page); @@ -789,22 +871,13 @@ xfs_writepage_map( continue; } - if (buffer_unwritten(bh)) { - if (wpc->io_type != XFS_IO_UNWRITTEN) { - wpc->io_type = XFS_IO_UNWRITTEN; - wpc->imap_valid = false; - } - } else if (buffer_delay(bh)) { - if (wpc->io_type != XFS_IO_DELALLOC) { - wpc->io_type = XFS_IO_DELALLOC; - wpc->imap_valid = false; - } - } else if (buffer_uptodate(bh)) { - if (wpc->io_type != XFS_IO_OVERWRITE) { - wpc->io_type = XFS_IO_OVERWRITE; - wpc->imap_valid = false; - } - } else { + if (buffer_unwritten(bh)) + new_type = XFS_IO_UNWRITTEN; + else if (buffer_delay(bh)) + new_type = XFS_IO_DELALLOC; + else if (buffer_uptodate(bh)) + new_type = XFS_IO_OVERWRITE; + else { if (PageUptodate(page)) ASSERT(buffer_mapped(bh)); /* @@ -817,6 +890,17 @@ xfs_writepage_map( continue; } + if (xfs_is_reflink_inode(XFS_I(inode))) { + error = xfs_map_cow(wpc, inode, offset, &new_type); + if (error) + goto out; + } + + if (wpc->io_type != new_type) { + wpc->io_type = new_type; + wpc->imap_valid = false; + } + if (wpc->imap_valid) wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); @@ -1107,18 +1191,24 @@ xfs_map_direct( struct inode *inode, struct buffer_head *bh_result, struct xfs_bmbt_irec *imap, - xfs_off_t offset) + xfs_off_t offset, + bool is_cow) { uintptr_t *flags = (uintptr_t *)&bh_result->b_private; xfs_off_t size = bh_result->b_size; trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size, - ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap); + ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW : + XFS_IO_OVERWRITE, imap); if (ISUNWRITTEN(imap)) { *flags |= XFS_DIO_FLAG_UNWRITTEN; set_buffer_defer_completion(bh_result); - } else if (offset + size > i_size_read(inode) || offset + size < 0) { + } else if (is_cow) { + *flags |= XFS_DIO_FLAG_COW; + set_buffer_defer_completion(bh_result); + } + if (offset + size > i_size_read(inode) || offset + size < 0) { *flags |= XFS_DIO_FLAG_APPEND; set_buffer_defer_completion(bh_result); } @@ -1164,6 +1254,44 @@ xfs_map_trim_size( bh_result->b_size = mapping_size; } +/* Bounce unaligned directio writes to the page cache. */ +static int +xfs_bounce_unaligned_dio_write( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + struct xfs_bmbt_irec *imap) +{ + struct xfs_bmbt_irec irec; + xfs_fileoff_t delta; + bool shared; + bool x; + int error; + + irec = *imap; + if (offset_fsb > irec.br_startoff) { + delta = offset_fsb - irec.br_startoff; + irec.br_blockcount -= delta; + irec.br_startblock += delta; + irec.br_startoff = offset_fsb; + } + error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x); + if (error) + return error; + + /* + * We're here because we're trying to do a directio write to a + * region that isn't aligned to a filesystem block. If any part + * of the extent is shared, fall back to buffered mode to handle + * the RMW. This is done by returning -EREMCHG ("remote addr + * changed"), which is caught further up the call stack. + */ + if (shared) { + trace_xfs_reflink_bounce_dio_write(ip, imap); + return -EREMCHG; + } + return 0; +} + STATIC int __xfs_get_blocks( struct inode *inode, @@ -1183,6 +1311,8 @@ __xfs_get_blocks( xfs_off_t offset; ssize_t size; int new = 0; + bool is_cow = false; + bool need_alloc = false; BUG_ON(create && !direct); @@ -1208,8 +1338,26 @@ __xfs_get_blocks( end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, - &imap, &nimaps, XFS_BMAPI_ENTIRE); + if (create && direct && xfs_is_reflink_inode(ip)) + is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, + &need_alloc); + if (!is_cow) { + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, XFS_BMAPI_ENTIRE); + /* + * Truncate an overwrite extent if there's a pending CoW + * reservation before the end of this extent. This + * forces us to come back to get_blocks to take care of + * the CoW. + */ + if (create && direct && nimaps && + imap.br_startblock != HOLESTARTBLOCK && + imap.br_startblock != DELAYSTARTBLOCK && + !ISUNWRITTEN(&imap)) + xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, + &imap); + } + ASSERT(!need_alloc); if (error) goto out_unlock; @@ -1261,6 +1409,13 @@ __xfs_get_blocks( if (imap.br_startblock != HOLESTARTBLOCK && imap.br_startblock != DELAYSTARTBLOCK && (create || !ISUNWRITTEN(&imap))) { + if (create && direct && !is_cow) { + error = xfs_bounce_unaligned_dio_write(ip, offset_fsb, + &imap); + if (error) + return error; + } + xfs_map_buffer(inode, bh_result, &imap, offset); if (ISUNWRITTEN(&imap)) set_buffer_unwritten(bh_result); @@ -1269,7 +1424,8 @@ __xfs_get_blocks( if (dax_fault) ASSERT(!ISUNWRITTEN(&imap)); else - xfs_map_direct(inode, bh_result, &imap, offset); + xfs_map_direct(inode, bh_result, &imap, offset, + is_cow); } } @@ -1391,11 +1547,14 @@ xfs_end_io_direct_write( i_size_write(inode, offset + size); spin_unlock(&ip->i_flags_lock); + if (flags & XFS_DIO_FLAG_COW) + error = xfs_reflink_end_cow(ip, offset, size); if (flags & XFS_DIO_FLAG_UNWRITTEN) { trace_xfs_end_io_direct_write_unwritten(ip, offset, size); error = xfs_iomap_write_unwritten(ip, offset, size); - } else if (flags & XFS_DIO_FLAG_APPEND) { + } + if (flags & XFS_DIO_FLAG_APPEND) { trace_xfs_end_io_direct_write_append(ip, offset, size); error = xfs_setfilesize(ip, offset, size); @@ -1425,6 +1584,17 @@ xfs_vm_bmap( trace_xfs_vm_bmap(XFS_I(inode)); xfs_ilock(ip, XFS_IOLOCK_SHARED); + + /* + * The swap code (ab-)uses ->bmap to get a block mapping and then + * bypasseÑ• the file system for actual I/O. We really can't allow + * that on reflinks inodes, so we have to skip out here. And yes, + * 0 is the magic code for a bmap error.. + */ + if (xfs_is_reflink_inode(ip)) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return 0; + } filemap_write_and_wait(mapping); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return generic_block_bmap(mapping, block, xfs_get_blocks); diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 1950e3bca2ac..b3c6634f9518 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -28,13 +28,15 @@ enum { XFS_IO_DELALLOC, /* covers delalloc region */ XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ XFS_IO_OVERWRITE, /* covers already allocated extent */ + XFS_IO_COW, /* covers copy-on-write extent */ }; #define XFS_IO_TYPES \ { XFS_IO_INVALID, "invalid" }, \ { XFS_IO_DELALLOC, "delalloc" }, \ { XFS_IO_UNWRITTEN, "unwritten" }, \ - { XFS_IO_OVERWRITE, "overwrite" } + { XFS_IO_OVERWRITE, "overwrite" }, \ + { XFS_IO_COW, "CoW" } /* * Structure for buffered I/O completions. diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c new file mode 100644 index 000000000000..9bf57c76623b --- /dev/null +++ b/fs/xfs/xfs_bmap_item.c @@ -0,0 +1,508 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_buf_item.h" +#include "xfs_bmap_item.h" +#include "xfs_log.h" +#include "xfs_bmap.h" +#include "xfs_icache.h" +#include "xfs_trace.h" + + +kmem_zone_t *xfs_bui_zone; +kmem_zone_t *xfs_bud_zone; + +static inline struct xfs_bui_log_item *BUI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_bui_log_item, bui_item); +} + +void +xfs_bui_item_free( + struct xfs_bui_log_item *buip) +{ + kmem_zone_free(xfs_bui_zone, buip); +} + +STATIC void +xfs_bui_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + struct xfs_bui_log_item *buip = BUI_ITEM(lip); + + *nvecs += 1; + *nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given bui log item. We use only 1 iovec, and we point that + * at the bui_log_format structure embedded in the bui item. + * It is at this point that we assert that all of the extent + * slots in the bui item have been filled. + */ +STATIC void +xfs_bui_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_bui_log_item *buip = BUI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + ASSERT(atomic_read(&buip->bui_next_extent) == + buip->bui_format.bui_nextents); + + buip->bui_format.bui_type = XFS_LI_BUI; + buip->bui_format.bui_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_BUI_FORMAT, &buip->bui_format, + xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents)); +} + +/* + * Pinning has no meaning for an bui item, so just return. + */ +STATIC void +xfs_bui_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * The unpin operation is the last place an BUI is manipulated in the log. It is + * either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the BUI transaction has been successfully committed to make it + * this far. Therefore, we expect whoever committed the BUI to either construct + * and commit the BUD or drop the BUD's reference in the event of error. Simply + * drop the log's BUI reference now that the log is done with it. + */ +STATIC void +xfs_bui_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_bui_log_item *buip = BUI_ITEM(lip); + + xfs_bui_release(buip); +} + +/* + * BUI items have no locking or pushing. However, since BUIs are pulled from + * the AIL when their corresponding BUDs are committed to disk, their situation + * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller + * will eventually flush the log. This should help in getting the BUI out of + * the AIL. + */ +STATIC uint +xfs_bui_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +/* + * The BUI has been either committed or aborted if the transaction has been + * cancelled. If the transaction was cancelled, an BUD isn't going to be + * constructed and thus we free the BUI here directly. + */ +STATIC void +xfs_bui_item_unlock( + struct xfs_log_item *lip) +{ + if (lip->li_flags & XFS_LI_ABORTED) + xfs_bui_item_free(BUI_ITEM(lip)); +} + +/* + * The BUI is logged only once and cannot be moved in the log, so simply return + * the lsn at which it's been logged. + */ +STATIC xfs_lsn_t +xfs_bui_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + return lsn; +} + +/* + * The BUI dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_bui_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all bui log items. + */ +static const struct xfs_item_ops xfs_bui_item_ops = { + .iop_size = xfs_bui_item_size, + .iop_format = xfs_bui_item_format, + .iop_pin = xfs_bui_item_pin, + .iop_unpin = xfs_bui_item_unpin, + .iop_unlock = xfs_bui_item_unlock, + .iop_committed = xfs_bui_item_committed, + .iop_push = xfs_bui_item_push, + .iop_committing = xfs_bui_item_committing, +}; + +/* + * Allocate and initialize an bui item with the given number of extents. + */ +struct xfs_bui_log_item * +xfs_bui_init( + struct xfs_mount *mp) + +{ + struct xfs_bui_log_item *buip; + + buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP); + + xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); + buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; + buip->bui_format.bui_id = (uintptr_t)(void *)buip; + atomic_set(&buip->bui_next_extent, 0); + atomic_set(&buip->bui_refcount, 2); + + return buip; +} + +/* + * Freeing the BUI requires that we remove it from the AIL if it has already + * been placed there. However, the BUI may not yet have been placed in the AIL + * when called by xfs_bui_release() from BUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the BUI. + */ +void +xfs_bui_release( + struct xfs_bui_log_item *buip) +{ + if (atomic_dec_and_test(&buip->bui_refcount)) { + xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_bui_item_free(buip); + } +} + +static inline struct xfs_bud_log_item *BUD_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_bud_log_item, bud_item); +} + +STATIC void +xfs_bud_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_bud_log_format); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given bud log item. We use only 1 iovec, and we point that + * at the bud_log_format structure embedded in the bud item. + * It is at this point that we assert that all of the extent + * slots in the bud item have been filled. + */ +STATIC void +xfs_bud_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_bud_log_item *budp = BUD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + budp->bud_format.bud_type = XFS_LI_BUD; + budp->bud_format.bud_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_BUD_FORMAT, &budp->bud_format, + sizeof(struct xfs_bud_log_format)); +} + +/* + * Pinning has no meaning for an bud item, so just return. + */ +STATIC void +xfs_bud_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * Since pinning has no meaning for an bud item, unpinning does + * not either. + */ +STATIC void +xfs_bud_item_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +/* + * There isn't much you can do to push on an bud item. It is simply stuck + * waiting for the log to be flushed to disk. + */ +STATIC uint +xfs_bud_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +/* + * The BUD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the BUI and free the + * BUD. + */ +STATIC void +xfs_bud_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_bud_log_item *budp = BUD_ITEM(lip); + + if (lip->li_flags & XFS_LI_ABORTED) { + xfs_bui_release(budp->bud_buip); + kmem_zone_free(xfs_bud_zone, budp); + } +} + +/* + * When the bud item is committed to disk, all we need to do is delete our + * reference to our partner bui item and then free ourselves. Since we're + * freeing ourselves we must return -1 to keep the transaction code from + * further referencing this item. + */ +STATIC xfs_lsn_t +xfs_bud_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_bud_log_item *budp = BUD_ITEM(lip); + + /* + * Drop the BUI reference regardless of whether the BUD has been + * aborted. Once the BUD transaction is constructed, it is the sole + * responsibility of the BUD to release the BUI (even if the BUI is + * aborted due to log I/O error). + */ + xfs_bui_release(budp->bud_buip); + kmem_zone_free(xfs_bud_zone, budp); + + return (xfs_lsn_t)-1; +} + +/* + * The BUD dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_bud_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all bud log items. + */ +static const struct xfs_item_ops xfs_bud_item_ops = { + .iop_size = xfs_bud_item_size, + .iop_format = xfs_bud_item_format, + .iop_pin = xfs_bud_item_pin, + .iop_unpin = xfs_bud_item_unpin, + .iop_unlock = xfs_bud_item_unlock, + .iop_committed = xfs_bud_item_committed, + .iop_push = xfs_bud_item_push, + .iop_committing = xfs_bud_item_committing, +}; + +/* + * Allocate and initialize an bud item with the given number of extents. + */ +struct xfs_bud_log_item * +xfs_bud_init( + struct xfs_mount *mp, + struct xfs_bui_log_item *buip) + +{ + struct xfs_bud_log_item *budp; + + budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP); + xfs_log_item_init(mp, &budp->bud_item, XFS_LI_BUD, &xfs_bud_item_ops); + budp->bud_buip = buip; + budp->bud_format.bud_bui_id = buip->bui_format.bui_id; + + return budp; +} + +/* + * Process a bmap update intent item that was recovered from the log. + * We need to update some inode's bmbt. + */ +int +xfs_bui_recover( + struct xfs_mount *mp, + struct xfs_bui_log_item *buip) +{ + int error = 0; + unsigned int bui_type; + struct xfs_map_extent *bmap; + xfs_fsblock_t startblock_fsb; + xfs_fsblock_t inode_fsb; + bool op_ok; + struct xfs_bud_log_item *budp; + enum xfs_bmap_intent_type type; + int whichfork; + xfs_exntst_t state; + struct xfs_trans *tp; + struct xfs_inode *ip = NULL; + struct xfs_defer_ops dfops; + xfs_fsblock_t firstfsb; + + ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); + + /* Only one mapping operation per BUI... */ + if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { + set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); + xfs_bui_release(buip); + return -EIO; + } + + /* + * First check the validity of the extent described by the + * BUI. If anything is bad, then toss the BUI. + */ + bmap = &buip->bui_format.bui_extents[0]; + startblock_fsb = XFS_BB_TO_FSB(mp, + XFS_FSB_TO_DADDR(mp, bmap->me_startblock)); + inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp, + XFS_INO_TO_FSB(mp, bmap->me_owner))); + switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { + case XFS_BMAP_MAP: + case XFS_BMAP_UNMAP: + op_ok = true; + break; + default: + op_ok = false; + break; + } + if (!op_ok || startblock_fsb == 0 || + bmap->me_len == 0 || + inode_fsb == 0 || + startblock_fsb >= mp->m_sb.sb_dblocks || + bmap->me_len >= mp->m_sb.sb_agblocks || + inode_fsb >= mp->m_sb.sb_dblocks || + (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)) { + /* + * This will pull the BUI from the AIL and + * free the memory associated with it. + */ + set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); + xfs_bui_release(buip); + return -EIO; + } + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + if (error) + return error; + budp = xfs_trans_get_bud(tp, buip); + + /* Grab the inode. */ + error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip); + if (error) + goto err_inode; + + if (VFS_I(ip)->i_nlink == 0) + xfs_iflags_set(ip, XFS_IRECOVERY); + xfs_defer_init(&dfops, &firstfsb); + + /* Process deferred bmap item. */ + state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? + XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + switch (bui_type) { + case XFS_BMAP_MAP: + case XFS_BMAP_UNMAP: + type = bui_type; + break; + default: + error = -EFSCORRUPTED; + goto err_dfops; + } + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type, + ip, whichfork, bmap->me_startoff, + bmap->me_startblock, bmap->me_len, + state); + if (error) + goto err_dfops; + + /* Finish transaction, free inodes. */ + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) + goto err_dfops; + + set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); + + return error; + +err_dfops: + xfs_defer_cancel(&dfops); +err_inode: + xfs_trans_cancel(tp); + if (ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); + } + return error; +} diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h new file mode 100644 index 000000000000..c867daae4a3c --- /dev/null +++ b/fs/xfs/xfs_bmap_item.h @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_BMAP_ITEM_H__ +#define __XFS_BMAP_ITEM_H__ + +/* + * There are (currently) two pairs of bmap btree redo item types: map & unmap. + * The common abbreviations for these are BUI (bmap update intent) and BUD + * (bmap update done). The redo item type is encoded in the flags field of + * each xfs_map_extent. + * + * *I items should be recorded in the *first* of a series of rolled + * transactions, and the *D items should be recorded in the same transaction + * that records the associated bmbt updates. + * + * Should the system crash after the commit of the first transaction but + * before the commit of the final transaction in a series, log recovery will + * use the redo information recorded by the intent items to replay the + * bmbt metadata updates in the non-first transaction. + */ + +/* kernel only BUI/BUD definitions */ + +struct xfs_mount; +struct kmem_zone; + +/* + * Max number of extents in fast allocation path. + */ +#define XFS_BUI_MAX_FAST_EXTENTS 1 + +/* + * Define BUI flag bits. Manipulated by set/clear/test_bit operators. + */ +#define XFS_BUI_RECOVERED 1 + +/* + * This is the "bmap update intent" log item. It is used to log the fact that + * some reverse mappings need to change. It is used in conjunction with the + * "bmap update done" log item described below. + * + * These log items follow the same rules as struct xfs_efi_log_item; see the + * comments about that structure (in xfs_extfree_item.h) for more details. + */ +struct xfs_bui_log_item { + struct xfs_log_item bui_item; + atomic_t bui_refcount; + atomic_t bui_next_extent; + unsigned long bui_flags; /* misc flags */ + struct xfs_bui_log_format bui_format; +}; + +static inline size_t +xfs_bui_log_item_sizeof( + unsigned int nr) +{ + return offsetof(struct xfs_bui_log_item, bui_format) + + xfs_bui_log_format_sizeof(nr); +} + +/* + * This is the "bmap update done" log item. It is used to log the fact that + * some bmbt updates mentioned in an earlier bui item have been performed. + */ +struct xfs_bud_log_item { + struct xfs_log_item bud_item; + struct xfs_bui_log_item *bud_buip; + struct xfs_bud_log_format bud_format; +}; + +extern struct kmem_zone *xfs_bui_zone; +extern struct kmem_zone *xfs_bud_zone; + +struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *); +struct xfs_bud_log_item *xfs_bud_init(struct xfs_mount *, + struct xfs_bui_log_item *); +void xfs_bui_item_free(struct xfs_bui_log_item *); +void xfs_bui_release(struct xfs_bui_log_item *); +int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip); + +#endif /* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index e827d657c314..552465e011ec 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -42,6 +42,9 @@ #include "xfs_icache.h" #include "xfs_log.h" #include "xfs_rmap_btree.h" +#include "xfs_iomap.h" +#include "xfs_reflink.h" +#include "xfs_refcount.h" /* Kernel only BMAP related definitions and functions */ @@ -389,11 +392,13 @@ xfs_bmap_count_blocks( STATIC int xfs_getbmapx_fix_eof_hole( xfs_inode_t *ip, /* xfs incore inode pointer */ + int whichfork, struct getbmapx *out, /* output structure */ int prealloced, /* this is a file with * preallocated data space */ __int64_t end, /* last block requested */ - xfs_fsblock_t startblock) + xfs_fsblock_t startblock, + bool moretocome) { __int64_t fixlen; xfs_mount_t *mp; /* file system mount point */ @@ -418,8 +423,9 @@ xfs_getbmapx_fix_eof_hole( else out->bmv_block = xfs_fsb_to_db(ip, startblock); fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset); - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!moretocome && + xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1)) out->bmv_oflags |= BMV_OF_LAST; } @@ -427,6 +433,81 @@ xfs_getbmapx_fix_eof_hole( return 1; } +/* Adjust the reported bmap around shared/unshared extent transitions. */ +STATIC int +xfs_getbmap_adjust_shared( + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *map, + struct getbmapx *out, + struct xfs_bmbt_irec *next_map) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_agblock_t ebno; + xfs_extlen_t elen; + xfs_extlen_t nlen; + int error; + + next_map->br_startblock = NULLFSBLOCK; + next_map->br_startoff = NULLFILEOFF; + next_map->br_blockcount = 0; + + /* Only written data blocks can be shared. */ + if (!xfs_is_reflink_inode(ip) || whichfork != XFS_DATA_FORK || + map->br_startblock == DELAYSTARTBLOCK || + map->br_startblock == HOLESTARTBLOCK || + ISUNWRITTEN(map)) + return 0; + + agno = XFS_FSB_TO_AGNO(mp, map->br_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock); + error = xfs_reflink_find_shared(mp, agno, agbno, map->br_blockcount, + &ebno, &elen, true); + if (error) + return error; + + if (ebno == NULLAGBLOCK) { + /* No shared blocks at all. */ + return 0; + } else if (agbno == ebno) { + /* + * Shared extent at (agbno, elen). Shrink the reported + * extent length and prepare to move the start of map[i] + * to agbno+elen, with the aim of (re)formatting the new + * map[i] the next time through the inner loop. + */ + out->bmv_length = XFS_FSB_TO_BB(mp, elen); + out->bmv_oflags |= BMV_OF_SHARED; + if (elen != map->br_blockcount) { + *next_map = *map; + next_map->br_startblock += elen; + next_map->br_startoff += elen; + next_map->br_blockcount -= elen; + } + map->br_blockcount -= elen; + } else { + /* + * There's an unshared extent (agbno, ebno - agbno) + * followed by shared extent at (ebno, elen). Shrink + * the reported extent length to cover only the unshared + * extent and prepare to move up the start of map[i] to + * ebno, with the aim of (re)formatting the new map[i] + * the next time through the inner loop. + */ + *next_map = *map; + nlen = ebno - agbno; + out->bmv_length = XFS_FSB_TO_BB(mp, nlen); + next_map->br_startblock += nlen; + next_map->br_startoff += nlen; + next_map->br_blockcount -= nlen; + map->br_blockcount -= nlen; + } + + return 0; +} + /* * Get inode's extents as described in bmv, and format for output. * Calls formatter to fill the user's buffer until all extents @@ -459,12 +540,28 @@ xfs_getbmap( int iflags; /* interface flags */ int bmapi_flags; /* flags for xfs_bmapi */ int cur_ext = 0; + struct xfs_bmbt_irec inject_map; mp = ip->i_mount; iflags = bmv->bmv_iflags; - whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; - if (whichfork == XFS_ATTR_FORK) { +#ifndef DEBUG + /* Only allow CoW fork queries if we're debugging. */ + if (iflags & BMV_IF_COWFORK) + return -EINVAL; +#endif + if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK)) + return -EINVAL; + + if (iflags & BMV_IF_ATTRFORK) + whichfork = XFS_ATTR_FORK; + else if (iflags & BMV_IF_COWFORK) + whichfork = XFS_COW_FORK; + else + whichfork = XFS_DATA_FORK; + + switch (whichfork) { + case XFS_ATTR_FORK: if (XFS_IFORK_Q(ip)) { if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && @@ -480,7 +577,20 @@ xfs_getbmap( prealloced = 0; fixlen = 1LL << 32; - } else { + break; + case XFS_COW_FORK: + if (ip->i_cformat != XFS_DINODE_FMT_EXTENTS) + return -EINVAL; + + if (xfs_get_cowextsz_hint(ip)) { + prealloced = 1; + fixlen = mp->m_super->s_maxbytes; + } else { + prealloced = 0; + fixlen = XFS_ISIZE(ip); + } + break; + default: if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && ip->i_d.di_format != XFS_DINODE_FMT_BTREE && ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) @@ -494,6 +604,7 @@ xfs_getbmap( prealloced = 0; fixlen = XFS_ISIZE(ip); } + break; } if (bmv->bmv_length == -1) { @@ -520,7 +631,8 @@ xfs_getbmap( return -ENOMEM; xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (whichfork == XFS_DATA_FORK) { + switch (whichfork) { + case XFS_DATA_FORK: if (!(iflags & BMV_IF_DELALLOC) && (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) { error = filemap_write_and_wait(VFS_I(ip)->i_mapping); @@ -538,8 +650,14 @@ xfs_getbmap( } lock = xfs_ilock_data_map_shared(ip); - } else { + break; + case XFS_COW_FORK: + lock = XFS_ILOCK_SHARED; + xfs_ilock(ip, lock); + break; + case XFS_ATTR_FORK: lock = xfs_ilock_attr_map_shared(ip); + break; } /* @@ -581,7 +699,8 @@ xfs_getbmap( goto out_free_map; ASSERT(nmap <= subnex); - for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { + for (i = 0; i < nmap && nexleft && bmv->bmv_length && + cur_ext < bmv->bmv_count; i++) { out[cur_ext].bmv_oflags = 0; if (map[i].br_state == XFS_EXT_UNWRITTEN) out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC; @@ -614,9 +733,16 @@ xfs_getbmap( goto out_free_map; } - if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext], - prealloced, bmvend, - map[i].br_startblock)) + /* Is this a shared block? */ + error = xfs_getbmap_adjust_shared(ip, whichfork, + &map[i], &out[cur_ext], &inject_map); + if (error) + goto out_free_map; + + if (!xfs_getbmapx_fix_eof_hole(ip, whichfork, + &out[cur_ext], prealloced, bmvend, + map[i].br_startblock, + inject_map.br_startblock != NULLFSBLOCK)) goto out_free_map; bmv->bmv_offset = @@ -636,11 +762,16 @@ xfs_getbmap( continue; } - nexleft--; + if (inject_map.br_startblock != NULLFSBLOCK) { + map[i] = inject_map; + i--; + } else + nexleft--; bmv->bmv_entries++; cur_ext++; } - } while (nmap && nexleft && bmv->bmv_length); + } while (nmap && nexleft && bmv->bmv_length && + cur_ext < bmv->bmv_count); out_free_map: kmem_free(map); @@ -1433,8 +1564,8 @@ xfs_insert_file_space( */ static int xfs_swap_extents_check_format( - xfs_inode_t *ip, /* target inode */ - xfs_inode_t *tip) /* tmp inode */ + struct xfs_inode *ip, /* target inode */ + struct xfs_inode *tip) /* tmp inode */ { /* Should never get a local format */ @@ -1450,6 +1581,13 @@ xfs_swap_extents_check_format( return -EINVAL; /* + * If we have to use the (expensive) rmap swap method, we can + * handle any number of extents and any format. + */ + if (xfs_sb_version_hasrmapbt(&ip->i_mount->m_sb)) + return 0; + + /* * if the target inode is in extent form and the temp inode is in btree * form then we will end up with the target inode in the wrong format * as we already know there are less extents in the temp inode. @@ -1518,125 +1656,161 @@ xfs_swap_extent_flush( return 0; } -int -xfs_swap_extents( - xfs_inode_t *ip, /* target inode */ - xfs_inode_t *tip, /* tmp inode */ - xfs_swapext_t *sxp) +/* + * Move extents from one file to another, when rmap is enabled. + */ +STATIC int +xfs_swap_extent_rmap( + struct xfs_trans **tpp, + struct xfs_inode *ip, + struct xfs_inode *tip) { - xfs_mount_t *mp = ip->i_mount; - xfs_trans_t *tp; - xfs_bstat_t *sbp = &sxp->sx_stat; - xfs_ifork_t *tempifp, *ifp, *tifp; - int src_log_flags, target_log_flags; - int error = 0; - int aforkblks = 0; - int taforkblks = 0; - __uint64_t tmp; - int lock_flags; - - /* XXX: we can't do this with rmap, will fix later */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - return -EOPNOTSUPP; - - tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); - if (!tempifp) { - error = -ENOMEM; - goto out; - } + struct xfs_bmbt_irec irec; + struct xfs_bmbt_irec uirec; + struct xfs_bmbt_irec tirec; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t end_fsb; + xfs_filblks_t count_fsb; + xfs_fsblock_t firstfsb; + struct xfs_defer_ops dfops; + int error; + xfs_filblks_t ilen; + xfs_filblks_t rlen; + int nimaps; + __uint64_t tip_flags2; /* - * Lock the inodes against other IO, page faults and truncate to - * begin with. Then we can ensure the inodes are flushed and have no - * page cache safely. Once we have done this we can take the ilocks and - * do the rest of the checks. + * If the source file has shared blocks, we must flag the donor + * file as having shared blocks so that we get the shared-block + * rmap functions when we go to fix up the rmaps. The flags + * will be switch for reals later. */ - lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); - xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); - - /* Verify that both files have the same format */ - if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) { - error = -EINVAL; - goto out_unlock; - } + tip_flags2 = tip->i_d.di_flags2; + if (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK) + tip->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; + + offset_fsb = 0; + end_fsb = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip))); + count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); + + while (count_fsb) { + /* Read extent from the donor file */ + nimaps = 1; + error = xfs_bmapi_read(tip, offset_fsb, count_fsb, &tirec, + &nimaps, 0); + if (error) + goto out; + ASSERT(nimaps == 1); + ASSERT(tirec.br_startblock != DELAYSTARTBLOCK); + + trace_xfs_swap_extent_rmap_remap(tip, &tirec); + ilen = tirec.br_blockcount; + + /* Unmap the old blocks in the source file. */ + while (tirec.br_blockcount) { + xfs_defer_init(&dfops, &firstfsb); + trace_xfs_swap_extent_rmap_remap_piece(tip, &tirec); + + /* Read extent from the source file */ + nimaps = 1; + error = xfs_bmapi_read(ip, tirec.br_startoff, + tirec.br_blockcount, &irec, + &nimaps, 0); + if (error) + goto out_defer; + ASSERT(nimaps == 1); + ASSERT(tirec.br_startoff == irec.br_startoff); + trace_xfs_swap_extent_rmap_remap_piece(ip, &irec); + + /* Trim the extent. */ + uirec = tirec; + uirec.br_blockcount = rlen = min_t(xfs_filblks_t, + tirec.br_blockcount, + irec.br_blockcount); + trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec); + + /* Remove the mapping from the donor file. */ + error = xfs_bmap_unmap_extent((*tpp)->t_mountp, &dfops, + tip, &uirec); + if (error) + goto out_defer; - /* Verify both files are either real-time or non-realtime */ - if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { - error = -EINVAL; - goto out_unlock; - } + /* Remove the mapping from the source file. */ + error = xfs_bmap_unmap_extent((*tpp)->t_mountp, &dfops, + ip, &irec); + if (error) + goto out_defer; - error = xfs_swap_extent_flush(ip); - if (error) - goto out_unlock; - error = xfs_swap_extent_flush(tip); - if (error) - goto out_unlock; + /* Map the donor file's blocks into the source file. */ + error = xfs_bmap_map_extent((*tpp)->t_mountp, &dfops, + ip, &uirec); + if (error) + goto out_defer; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); - if (error) - goto out_unlock; + /* Map the source file's blocks into the donor file. */ + error = xfs_bmap_map_extent((*tpp)->t_mountp, &dfops, + tip, &irec); + if (error) + goto out_defer; - /* - * Lock and join the inodes to the tansaction so that transaction commit - * or cancel will unlock the inodes from this point onwards. - */ - xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); - lock_flags |= XFS_ILOCK_EXCL; - xfs_trans_ijoin(tp, ip, lock_flags); - xfs_trans_ijoin(tp, tip, lock_flags); + error = xfs_defer_finish(tpp, &dfops, ip); + if (error) + goto out_defer; + tirec.br_startoff += rlen; + if (tirec.br_startblock != HOLESTARTBLOCK && + tirec.br_startblock != DELAYSTARTBLOCK) + tirec.br_startblock += rlen; + tirec.br_blockcount -= rlen; + } - /* Verify all data are being swapped */ - if (sxp->sx_offset != 0 || - sxp->sx_length != ip->i_d.di_size || - sxp->sx_length != tip->i_d.di_size) { - error = -EFAULT; - goto out_trans_cancel; + /* Roll on... */ + count_fsb -= ilen; + offset_fsb += ilen; } - trace_xfs_swap_extent_before(ip, 0); - trace_xfs_swap_extent_before(tip, 1); + tip->i_d.di_flags2 = tip_flags2; + return 0; - /* check inode formats now that data is flushed */ - error = xfs_swap_extents_check_format(ip, tip); - if (error) { - xfs_notice(mp, - "%s: inode 0x%llx format is incompatible for exchanging.", - __func__, ip->i_ino); - goto out_trans_cancel; - } +out_defer: + xfs_defer_cancel(&dfops); +out: + trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_); + tip->i_d.di_flags2 = tip_flags2; + return error; +} + +/* Swap the extents of two files by swapping data forks. */ +STATIC int +xfs_swap_extent_forks( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_inode *tip, + int *src_log_flags, + int *target_log_flags) +{ + struct xfs_ifork tempifp, *ifp, *tifp; + int aforkblks = 0; + int taforkblks = 0; + __uint64_t tmp; + int error; - /* - * Compare the current change & modify times with that - * passed in. If they differ, we abort this swap. - * This is the mechanism used to ensure the calling - * process that the file was not changed out from - * under it. - */ - if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || - (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || - (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || - (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { - error = -EBUSY; - goto out_trans_cancel; - } /* * Count the number of extended attribute blocks */ if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { - error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); + error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, + &aforkblks); if (error) - goto out_trans_cancel; + return error; } if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, - &taforkblks); + &taforkblks); if (error) - goto out_trans_cancel; + return error; } /* @@ -1645,31 +1819,23 @@ xfs_swap_extents( * buffers, and so the validation done on read will expect the owner * field to be correctly set. Once we change the owners, we can swap the * inode forks. - * - * Note the trickiness in setting the log flags - we set the owner log - * flag on the opposite inode (i.e. the inode we are setting the new - * owner to be) because once we swap the forks and log that, log - * recovery is going to see the fork as owned by the swapped inode, - * not the pre-swapped inodes. */ - src_log_flags = XFS_ILOG_CORE; - target_log_flags = XFS_ILOG_CORE; if (ip->i_d.di_version == 3 && ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - target_log_flags |= XFS_ILOG_DOWNER; + (*target_log_flags) |= XFS_ILOG_DOWNER; error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, tip->i_ino, NULL); if (error) - goto out_trans_cancel; + return error; } if (tip->i_d.di_version == 3 && tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - src_log_flags |= XFS_ILOG_DOWNER; + (*src_log_flags) |= XFS_ILOG_DOWNER; error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, ip->i_ino, NULL); if (error) - goto out_trans_cancel; + return error; } /* @@ -1677,9 +1843,9 @@ xfs_swap_extents( */ ifp = &ip->i_df; tifp = &tip->i_df; - *tempifp = *ifp; /* struct copy */ + tempifp = *ifp; /* struct copy */ *ifp = *tifp; /* struct copy */ - *tifp = *tempifp; /* struct copy */ + *tifp = tempifp; /* struct copy */ /* * Fix the on-disk inode values @@ -1719,12 +1885,12 @@ xfs_swap_extents( ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; } - src_log_flags |= XFS_ILOG_DEXT; + (*src_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: ASSERT(ip->i_d.di_version < 3 || - (src_log_flags & XFS_ILOG_DOWNER)); - src_log_flags |= XFS_ILOG_DBROOT; + (*src_log_flags & XFS_ILOG_DOWNER)); + (*src_log_flags) |= XFS_ILOG_DBROOT; break; } @@ -1738,15 +1904,166 @@ xfs_swap_extents( tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext; } - target_log_flags |= XFS_ILOG_DEXT; + (*target_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: - target_log_flags |= XFS_ILOG_DBROOT; + (*target_log_flags) |= XFS_ILOG_DBROOT; ASSERT(tip->i_d.di_version < 3 || - (target_log_flags & XFS_ILOG_DOWNER)); + (*target_log_flags & XFS_ILOG_DOWNER)); break; } + return 0; +} + +int +xfs_swap_extents( + struct xfs_inode *ip, /* target inode */ + struct xfs_inode *tip, /* tmp inode */ + struct xfs_swapext *sxp) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_bstat *sbp = &sxp->sx_stat; + int src_log_flags, target_log_flags; + int error = 0; + int lock_flags; + struct xfs_ifork *cowfp; + __uint64_t f; + int resblks; + + /* + * Lock the inodes against other IO, page faults and truncate to + * begin with. Then we can ensure the inodes are flushed and have no + * page cache safely. Once we have done this we can take the ilocks and + * do the rest of the checks. + */ + lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); + + /* Verify that both files have the same format */ + if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) { + error = -EINVAL; + goto out_unlock; + } + + /* Verify both files are either real-time or non-realtime */ + if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { + error = -EINVAL; + goto out_unlock; + } + + error = xfs_swap_extent_flush(ip); + if (error) + goto out_unlock; + error = xfs_swap_extent_flush(tip); + if (error) + goto out_unlock; + + /* + * Extent "swapping" with rmap requires a permanent reservation and + * a block reservation because it's really just a remap operation + * performed with log redo items! + */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + /* + * Conceptually this shouldn't affect the shape of either + * bmbt, but since we atomically move extents one by one, + * we reserve enough space to rebuild both trees. + */ + resblks = XFS_SWAP_RMAP_SPACE_RES(mp, + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK), + XFS_DATA_FORK) + + XFS_SWAP_RMAP_SPACE_RES(mp, + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK), + XFS_DATA_FORK); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, + 0, 0, &tp); + } else + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, + 0, 0, &tp); + if (error) + goto out_unlock; + + /* + * Lock and join the inodes to the tansaction so that transaction commit + * or cancel will unlock the inodes from this point onwards. + */ + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); + lock_flags |= XFS_ILOCK_EXCL; + xfs_trans_ijoin(tp, ip, 0); + xfs_trans_ijoin(tp, tip, 0); + + + /* Verify all data are being swapped */ + if (sxp->sx_offset != 0 || + sxp->sx_length != ip->i_d.di_size || + sxp->sx_length != tip->i_d.di_size) { + error = -EFAULT; + goto out_trans_cancel; + } + + trace_xfs_swap_extent_before(ip, 0); + trace_xfs_swap_extent_before(tip, 1); + + /* check inode formats now that data is flushed */ + error = xfs_swap_extents_check_format(ip, tip); + if (error) { + xfs_notice(mp, + "%s: inode 0x%llx format is incompatible for exchanging.", + __func__, ip->i_ino); + goto out_trans_cancel; + } + + /* + * Compare the current change & modify times with that + * passed in. If they differ, we abort this swap. + * This is the mechanism used to ensure the calling + * process that the file was not changed out from + * under it. + */ + if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || + (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || + (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || + (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { + error = -EBUSY; + goto out_trans_cancel; + } + + /* + * Note the trickiness in setting the log flags - we set the owner log + * flag on the opposite inode (i.e. the inode we are setting the new + * owner to be) because once we swap the forks and log that, log + * recovery is going to see the fork as owned by the swapped inode, + * not the pre-swapped inodes. + */ + src_log_flags = XFS_ILOG_CORE; + target_log_flags = XFS_ILOG_CORE; + + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + error = xfs_swap_extent_rmap(&tp, ip, tip); + else + error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags, + &target_log_flags); + if (error) + goto out_trans_cancel; + + /* Do we have to swap reflink flags? */ + if ((ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK) ^ + (tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK)) { + f = ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; + tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK; + cowfp = ip->i_cowfp; + ip->i_cowfp = tip->i_cowfp; + tip->i_cowfp = cowfp; + xfs_inode_set_cowblocks_tag(ip); + xfs_inode_set_cowblocks_tag(tip); + } + xfs_trans_log_inode(tp, ip, src_log_flags); xfs_trans_log_inode(tp, tip, target_log_flags); @@ -1761,16 +2078,16 @@ xfs_swap_extents( trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); -out: - kmem_free(tempifp); - return error; -out_unlock: xfs_iunlock(ip, lock_flags); xfs_iunlock(tip, lock_flags); - goto out; + return error; out_trans_cancel: xfs_trans_cancel(tp); - goto out; + +out_unlock: + xfs_iunlock(ip, lock_flags); + xfs_iunlock(tip, lock_flags); + return error; } diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index f44f79996978..29816981b50a 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -84,7 +84,8 @@ xfs_dir2_sf_getdents( sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + if (dp->i_d.di_size < xfs_dir2_sf_hdr_size(sfp->i8count)) + return -EFSCORRUPTED; /* * If the block number in the offset is out of range, we're done. diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 3d224702fbc0..05f8666733a0 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -92,7 +92,11 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_ERRTAG_BMAPIFORMAT 21 #define XFS_ERRTAG_FREE_EXTENT 22 #define XFS_ERRTAG_RMAP_FINISH_ONE 23 -#define XFS_ERRTAG_MAX 24 +#define XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE 24 +#define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25 +#define XFS_ERRTAG_BMAP_FINISH_ONE 26 +#define XFS_ERRTAG_AG_RESV_CRITICAL 27 +#define XFS_ERRTAG_MAX 28 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -121,6 +125,10 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT #define XFS_RANDOM_FREE_EXTENT 1 #define XFS_RANDOM_RMAP_FINISH_ONE 1 +#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1 +#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 +#define XFS_RANDOM_BMAP_FINISH_ONE 1 +#define XFS_RANDOM_AG_RESV_CRITICAL 4 #ifdef DEBUG extern int xfs_error_test_active; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 2bc58b3fd37d..a314fc7b56fa 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -38,6 +38,7 @@ #include "xfs_icache.h" #include "xfs_pnfs.h" #include "xfs_iomap.h" +#include "xfs_reflink.h" #include <linux/dcache.h> #include <linux/falloc.h> @@ -634,6 +635,13 @@ xfs_file_dio_aio_write( trace_xfs_file_direct_write(ip, count, iocb->ki_pos); + /* If this is a block-aligned directio CoW, remap immediately. */ + if (xfs_is_reflink_inode(ip) && !unaligned_io) { + ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count); + if (ret) + goto out; + } + data = *from; ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, xfs_get_blocks_direct, xfs_end_io_direct_write, @@ -735,6 +743,9 @@ write_retry: enospc = xfs_inode_free_quota_eofblocks(ip); if (enospc) goto write_retry; + enospc = xfs_inode_free_quota_cowblocks(ip); + if (enospc) + goto write_retry; } else if (ret == -ENOSPC && !enospc) { struct xfs_eofblocks eofb = {0}; @@ -774,10 +785,20 @@ xfs_file_write_iter( if (IS_DAX(inode)) ret = xfs_file_dax_write(iocb, from); - else if (iocb->ki_flags & IOCB_DIRECT) + else if (iocb->ki_flags & IOCB_DIRECT) { + /* + * Allow a directio write to fall back to a buffered + * write *only* in the case that we're doing a reflink + * CoW. In all other directio scenarios we do not + * allow an operation to fall back to buffered mode. + */ ret = xfs_file_dio_aio_write(iocb, from); - else + if (ret == -EREMCHG) + goto buffered; + } else { +buffered: ret = xfs_file_buffered_aio_write(iocb, from); + } if (ret > 0) { XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); @@ -791,7 +812,7 @@ xfs_file_write_iter( #define XFS_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ - FALLOC_FL_INSERT_RANGE) + FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE) STATIC long xfs_file_fallocate( @@ -881,9 +902,15 @@ xfs_file_fallocate( if (mode & FALLOC_FL_ZERO_RANGE) error = xfs_zero_file_space(ip, offset, len); - else + else { + if (mode & FALLOC_FL_UNSHARE_RANGE) { + error = xfs_reflink_unshare(ip, offset, len); + if (error) + goto out_unlock; + } error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC); + } if (error) goto out_unlock; } @@ -920,6 +947,189 @@ out_unlock: return error; } +/* + * Flush all file writes out to disk. + */ +static int +xfs_file_wait_for_io( + struct inode *inode, + loff_t offset, + size_t len) +{ + loff_t rounding; + loff_t ioffset; + loff_t iendoffset; + loff_t bs; + int ret; + + bs = inode->i_sb->s_blocksize; + inode_dio_wait(inode); + + rounding = max_t(xfs_off_t, bs, PAGE_SIZE); + ioffset = round_down(offset, rounding); + iendoffset = round_up(offset + len, rounding) - 1; + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + iendoffset); + return ret; +} + +/* Hook up to the VFS reflink function */ +STATIC int +xfs_file_share_range( + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len, + bool is_dedupe) +{ + struct inode *inode_in; + struct inode *inode_out; + ssize_t ret; + loff_t bs; + loff_t isize; + int same_inode; + loff_t blen; + unsigned int flags = 0; + + inode_in = file_inode(file_in); + inode_out = file_inode(file_out); + bs = inode_out->i_sb->s_blocksize; + + /* Don't touch certain kinds of inodes */ + if (IS_IMMUTABLE(inode_out)) + return -EPERM; + if (IS_SWAPFILE(inode_in) || + IS_SWAPFILE(inode_out)) + return -ETXTBSY; + + /* Reflink only works within this filesystem. */ + if (inode_in->i_sb != inode_out->i_sb) + return -EXDEV; + same_inode = (inode_in->i_ino == inode_out->i_ino); + + /* Don't reflink dirs, pipes, sockets... */ + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode)) + return -EINVAL; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EINVAL; + + /* Don't share DAX file data for now. */ + if (IS_DAX(inode_in) || IS_DAX(inode_out)) + return -EINVAL; + + /* Are we going all the way to the end? */ + isize = i_size_read(inode_in); + if (isize == 0) + return 0; + if (len == 0) + len = isize - pos_in; + + /* Ensure offsets don't wrap and the input is inside i_size */ + if (pos_in + len < pos_in || pos_out + len < pos_out || + pos_in + len > isize) + return -EINVAL; + + /* Don't allow dedupe past EOF in the dest file */ + if (is_dedupe) { + loff_t disize; + + disize = i_size_read(inode_out); + if (pos_out >= disize || pos_out + len > disize) + return -EINVAL; + } + + /* If we're linking to EOF, continue to the block boundary. */ + if (pos_in + len == isize) + blen = ALIGN(isize, bs) - pos_in; + else + blen = len; + + /* Only reflink if we're aligned to block boundaries */ + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || + !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) + return -EINVAL; + + /* Don't allow overlapped reflink within the same file */ + if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen) + return -EINVAL; + + /* Wait for the completion of any pending IOs on srcfile */ + ret = xfs_file_wait_for_io(inode_in, pos_in, len); + if (ret) + goto out; + ret = xfs_file_wait_for_io(inode_out, pos_out, len); + if (ret) + goto out; + + if (is_dedupe) + flags |= XFS_REFLINK_DEDUPE; + ret = xfs_reflink_remap_range(XFS_I(inode_in), pos_in, XFS_I(inode_out), + pos_out, len, flags); + if (ret < 0) + goto out; + +out: + return ret; +} + +STATIC ssize_t +xfs_file_copy_range( + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + size_t len, + unsigned int flags) +{ + int error; + + error = xfs_file_share_range(file_in, pos_in, file_out, pos_out, + len, false); + if (error) + return error; + return len; +} + +STATIC int +xfs_file_clone_range( + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len) +{ + return xfs_file_share_range(file_in, pos_in, file_out, pos_out, + len, false); +} + +#define XFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) +STATIC ssize_t +xfs_file_dedupe_range( + struct file *src_file, + u64 loff, + u64 len, + struct file *dst_file, + u64 dst_loff) +{ + int error; + + /* + * Limit the total length we will dedupe for each operation. + * This is intended to bound the total time spent in this + * ioctl to something sane. + */ + if (len > XFS_MAX_DEDUPE_LEN) + len = XFS_MAX_DEDUPE_LEN; + + error = xfs_file_share_range(src_file, loff, dst_file, dst_loff, + len, true); + if (error) + return error; + return len; +} STATIC int xfs_file_open( @@ -1581,6 +1791,9 @@ const struct file_operations xfs_file_operations = { .fsync = xfs_file_fsync, .get_unmapped_area = thp_get_unmapped_area, .fallocate = xfs_file_fallocate, + .copy_file_range = xfs_file_copy_range, + .clone_file_range = xfs_file_clone_range, + .dedupe_file_range = xfs_file_dedupe_range, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 94ac06f3d908..93d12fa2670d 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -43,6 +43,7 @@ #include "xfs_log.h" #include "xfs_filestream.h" #include "xfs_rmap.h" +#include "xfs_ag_resv.h" /* * File system operations @@ -108,7 +109,9 @@ xfs_fs_geometry( (xfs_sb_version_hassparseinodes(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_SPINODES : 0) | (xfs_sb_version_hasrmapbt(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_RMAPBT : 0); + XFS_FSOP_GEOM_FLAGS_RMAPBT : 0) | + (xfs_sb_version_hasreflink(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_REFLINK : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; @@ -259,6 +262,12 @@ xfs_growfs_data_private( agf->agf_longest = cpu_to_be32(tmpsize); if (xfs_sb_version_hascrc(&mp->m_sb)) uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + agf->agf_refcount_root = cpu_to_be32( + xfs_refc_block(mp)); + agf->agf_refcount_level = cpu_to_be32(1); + agf->agf_refcount_blocks = cpu_to_be32(1); + } error = xfs_bwrite(bp); xfs_buf_relse(bp); @@ -450,6 +459,17 @@ xfs_growfs_data_private( rrec->rm_offset = 0; be16_add_cpu(&block->bb_numrecs, 1); + /* account for refc btree root */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + rrec = XFS_RMAP_REC_ADDR(block, 5); + rrec->rm_startblock = cpu_to_be32( + xfs_refc_block(mp)); + rrec->rm_blockcount = cpu_to_be32(1); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC); + rrec->rm_offset = 0; + be16_add_cpu(&block->bb_numrecs, 1); + } + error = xfs_bwrite(bp); xfs_buf_relse(bp); if (error) @@ -507,6 +527,28 @@ xfs_growfs_data_private( goto error0; } + /* + * refcount btree root block + */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, xfs_refc_block(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_refcountbt_buf_ops); + if (!bp) { + error = -ENOMEM; + goto error0; + } + + xfs_btree_init_block(mp, bp, XFS_REFC_CRC_MAGIC, + 0, 0, agno, + XFS_BTREE_CRC_BLOCKS); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + } } xfs_trans_agblocks_delta(tp, nfree); /* @@ -589,6 +631,11 @@ xfs_growfs_data_private( xfs_set_low_space_thresholds(mp); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); + /* Reserve AG metadata blocks. */ + error = xfs_fs_reserve_ag_blocks(mp); + if (error && error != -ENOSPC) + goto out; + /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { error = 0; @@ -639,6 +686,8 @@ xfs_growfs_data_private( continue; } } + + out: return saved_error ? saved_error : error; error0: @@ -948,3 +997,59 @@ xfs_do_force_shutdown( "Please umount the filesystem and rectify the problem(s)"); } } + +/* + * Reserve free space for per-AG metadata. + */ +int +xfs_fs_reserve_ag_blocks( + struct xfs_mount *mp) +{ + xfs_agnumber_t agno; + struct xfs_perag *pag; + int error = 0; + int err2; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + pag = xfs_perag_get(mp, agno); + err2 = xfs_ag_resv_init(pag); + xfs_perag_put(pag); + if (err2 && !error) + error = err2; + } + + if (error && error != -ENOSPC) { + xfs_warn(mp, + "Error %d reserving per-AG metadata reserve pool.", error); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + } + + return error; +} + +/* + * Free space reserved for per-AG metadata. + */ +int +xfs_fs_unreserve_ag_blocks( + struct xfs_mount *mp) +{ + xfs_agnumber_t agno; + struct xfs_perag *pag; + int error = 0; + int err2; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + pag = xfs_perag_get(mp, agno); + err2 = xfs_ag_resv_free(pag); + xfs_perag_put(pag); + if (err2 && !error) + error = err2; + } + + if (error) + xfs_warn(mp, + "Error %d freeing per-AG metadata reserve pool.", error); + + return error; +} diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index f32713f14f9a..f34915898fea 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -26,4 +26,7 @@ extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); +extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); +extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp); + #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 4d41b241298f..687a4b01fc53 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -21,8 +21,8 @@ /* * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, * other XFS code uses these values. Times are measured in centisecs (i.e. - * 100ths of a second) with the exception of eofb_timer, which is measured in - * seconds. + * 100ths of a second) with the exception of eofb_timer and cowb_timer, which + * are measured in seconds. */ xfs_param_t xfs_params = { /* MIN DFLT MAX */ @@ -42,6 +42,7 @@ xfs_param_t xfs_params = { .inherit_nodfrg = { 0, 1, 1 }, .fstrm_timer = { 1, 30*100, 3600*100}, .eofb_timer = { 1, 300, 3600*24}, + .cowb_timer = { 1, 1800, 3600*24}, }; struct xfs_globals xfs_globals = { diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 65b2e3f85f52..14796b744e0a 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -33,6 +33,7 @@ #include "xfs_bmap_util.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" +#include "xfs_reflink.h" #include <linux/kthread.h> #include <linux/freezer.h> @@ -76,6 +77,9 @@ xfs_inode_alloc( ip->i_mount = mp; memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); ip->i_afp = NULL; + ip->i_cowfp = NULL; + ip->i_cnextents = 0; + ip->i_cformat = XFS_DINODE_FMT_EXTENTS; memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); ip->i_flags = 0; ip->i_delayed_blks = 0; @@ -101,6 +105,8 @@ xfs_inode_free_callback( if (ip->i_afp) xfs_idestroy_fork(ip, XFS_ATTR_FORK); + if (ip->i_cowfp) + xfs_idestroy_fork(ip, XFS_COW_FORK); if (ip->i_itemp) { ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); @@ -787,6 +793,33 @@ xfs_eofblocks_worker( xfs_queue_eofblocks(mp); } +/* + * Background scanning to trim preallocated CoW space. This is queued + * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). + * (We'll just piggyback on the post-EOF prealloc space workqueue.) + */ +STATIC void +xfs_queue_cowblocks( + struct xfs_mount *mp) +{ + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG)) + queue_delayed_work(mp->m_eofblocks_workqueue, + &mp->m_cowblocks_work, + msecs_to_jiffies(xfs_cowb_secs * 1000)); + rcu_read_unlock(); +} + +void +xfs_cowblocks_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_cowblocks_work); + xfs_icache_free_cowblocks(mp, NULL); + xfs_queue_cowblocks(mp); +} + int xfs_inode_ag_iterator( struct xfs_mount *mp, @@ -1343,18 +1376,30 @@ xfs_inode_free_eofblocks( return ret; } -int -xfs_icache_free_eofblocks( +static int +__xfs_icache_free_eofblocks( struct xfs_mount *mp, - struct xfs_eofblocks *eofb) + struct xfs_eofblocks *eofb, + int (*execute)(struct xfs_inode *ip, int flags, + void *args), + int tag) { int flags = SYNC_TRYLOCK; if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC)) flags = SYNC_WAIT; - return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags, - eofb, XFS_ICI_EOFBLOCKS_TAG); + return xfs_inode_ag_iterator_tag(mp, execute, flags, + eofb, tag); +} + +int +xfs_icache_free_eofblocks( + struct xfs_mount *mp, + struct xfs_eofblocks *eofb) +{ + return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks, + XFS_ICI_EOFBLOCKS_TAG); } /* @@ -1363,9 +1408,11 @@ xfs_icache_free_eofblocks( * failure. We make a best effort by including each quota under low free space * conditions (less than 1% free space) in the scan. */ -int -xfs_inode_free_quota_eofblocks( - struct xfs_inode *ip) +static int +__xfs_inode_free_quota_eofblocks( + struct xfs_inode *ip, + int (*execute)(struct xfs_mount *mp, + struct xfs_eofblocks *eofb)) { int scan = 0; struct xfs_eofblocks eofb = {0}; @@ -1401,14 +1448,25 @@ xfs_inode_free_quota_eofblocks( } if (scan) - xfs_icache_free_eofblocks(ip->i_mount, &eofb); + execute(ip->i_mount, &eofb); return scan; } -void -xfs_inode_set_eofblocks_tag( - xfs_inode_t *ip) +int +xfs_inode_free_quota_eofblocks( + struct xfs_inode *ip) +{ + return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks); +} + +static void +__xfs_inode_set_eofblocks_tag( + xfs_inode_t *ip, + void (*execute)(struct xfs_mount *mp), + void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, + int error, unsigned long caller_ip), + int tag) { struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; @@ -1426,26 +1484,22 @@ xfs_inode_set_eofblocks_tag( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); - trace_xfs_inode_set_eofblocks_tag(ip); - tagged = radix_tree_tagged(&pag->pag_ici_root, - XFS_ICI_EOFBLOCKS_TAG); + tagged = radix_tree_tagged(&pag->pag_ici_root, tag); radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), - XFS_ICI_EOFBLOCKS_TAG); + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); if (!tagged) { /* propagate the eofblocks tag up into the perag radix tree */ spin_lock(&ip->i_mount->m_perag_lock); radix_tree_tag_set(&ip->i_mount->m_perag_tree, XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_EOFBLOCKS_TAG); + tag); spin_unlock(&ip->i_mount->m_perag_lock); /* kick off background trimming */ - xfs_queue_eofblocks(ip->i_mount); + execute(ip->i_mount); - trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno, - -1, _RET_IP_); + set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_); } spin_unlock(&pag->pag_ici_lock); @@ -1453,9 +1507,22 @@ xfs_inode_set_eofblocks_tag( } void -xfs_inode_clear_eofblocks_tag( +xfs_inode_set_eofblocks_tag( xfs_inode_t *ip) { + trace_xfs_inode_set_eofblocks_tag(ip); + return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_eofblocks, + trace_xfs_perag_set_eofblocks, + XFS_ICI_EOFBLOCKS_TAG); +} + +static void +__xfs_inode_clear_eofblocks_tag( + xfs_inode_t *ip, + void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, + int error, unsigned long caller_ip), + int tag) +{ struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; @@ -1465,23 +1532,141 @@ xfs_inode_clear_eofblocks_tag( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); - trace_xfs_inode_clear_eofblocks_tag(ip); radix_tree_tag_clear(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), - XFS_ICI_EOFBLOCKS_TAG); - if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) { + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); + if (!radix_tree_tagged(&pag->pag_ici_root, tag)) { /* clear the eofblocks tag from the perag radix tree */ spin_lock(&ip->i_mount->m_perag_lock); radix_tree_tag_clear(&ip->i_mount->m_perag_tree, XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), - XFS_ICI_EOFBLOCKS_TAG); + tag); spin_unlock(&ip->i_mount->m_perag_lock); - trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno, - -1, _RET_IP_); + clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_); } spin_unlock(&pag->pag_ici_lock); xfs_perag_put(pag); } +void +xfs_inode_clear_eofblocks_tag( + xfs_inode_t *ip) +{ + trace_xfs_inode_clear_eofblocks_tag(ip); + return __xfs_inode_clear_eofblocks_tag(ip, + trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG); +} + +/* + * Automatic CoW Reservation Freeing + * + * These functions automatically garbage collect leftover CoW reservations + * that were made on behalf of a cowextsize hint when we start to run out + * of quota or when the reservations sit around for too long. If the file + * has dirty pages or is undergoing writeback, its CoW reservations will + * be retained. + * + * The actual garbage collection piggybacks off the same code that runs + * the speculative EOF preallocation garbage collector. + */ +STATIC int +xfs_inode_free_cowblocks( + struct xfs_inode *ip, + int flags, + void *args) +{ + int ret; + struct xfs_eofblocks *eofb = args; + bool need_iolock = true; + int match; + + ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); + + if (!xfs_reflink_has_real_cow_blocks(ip)) { + trace_xfs_inode_free_cowblocks_invalid(ip); + xfs_inode_clear_cowblocks_tag(ip); + return 0; + } + + /* + * If the mapping is dirty or under writeback we cannot touch the + * CoW fork. Leave it alone if we're in the midst of a directio. + */ + if (mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || + mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || + atomic_read(&VFS_I(ip)->i_dio_count)) + return 0; + + if (eofb) { + if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) + match = xfs_inode_match_id_union(ip, eofb); + else + match = xfs_inode_match_id(ip, eofb); + if (!match) + return 0; + + /* skip the inode if the file size is too small */ + if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && + XFS_ISIZE(ip) < eofb->eof_min_file_size) + return 0; + + /* + * A scan owner implies we already hold the iolock. Skip it in + * xfs_free_eofblocks() to avoid deadlock. This also eliminates + * the possibility of EAGAIN being returned. + */ + if (eofb->eof_scan_owner == ip->i_ino) + need_iolock = false; + } + + /* Free the CoW blocks */ + if (need_iolock) { + xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + } + + ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); + + if (need_iolock) { + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } + + return ret; +} + +int +xfs_icache_free_cowblocks( + struct xfs_mount *mp, + struct xfs_eofblocks *eofb) +{ + return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks, + XFS_ICI_COWBLOCKS_TAG); +} + +int +xfs_inode_free_quota_cowblocks( + struct xfs_inode *ip) +{ + return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks); +} + +void +xfs_inode_set_cowblocks_tag( + xfs_inode_t *ip) +{ + trace_xfs_inode_set_eofblocks_tag(ip); + return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks, + trace_xfs_perag_set_eofblocks, + XFS_ICI_COWBLOCKS_TAG); +} + +void +xfs_inode_clear_cowblocks_tag( + xfs_inode_t *ip) +{ + trace_xfs_inode_clear_eofblocks_tag(ip); + return __xfs_inode_clear_eofblocks_tag(ip, + trace_xfs_perag_clear_eofblocks, XFS_ICI_COWBLOCKS_TAG); +} diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 05bac99bef75..a1e02f4708ab 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -40,6 +40,7 @@ struct xfs_eofblocks { in xfs_inode_ag_iterator */ #define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ #define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ +#define XFS_ICI_COWBLOCKS_TAG 2 /* inode can have cow blocks to gc */ /* * Flags for xfs_iget() @@ -70,6 +71,12 @@ int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip); void xfs_eofblocks_worker(struct work_struct *); void xfs_queue_eofblocks(struct xfs_mount *); +void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip); +void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); +int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *); +int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip); +void xfs_cowblocks_worker(struct work_struct *); + int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 624e1dfa716b..4e560e6a12c1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -49,6 +49,7 @@ #include "xfs_trans_priv.h" #include "xfs_log.h" #include "xfs_bmap_btree.h" +#include "xfs_reflink.h" kmem_zone_t *xfs_inode_zone; @@ -77,6 +78,29 @@ xfs_get_extsz_hint( } /* + * Helper function to extract CoW extent size hint from inode. + * Between the extent size hint and the CoW extent size hint, we + * return the greater of the two. If the value is zero (automatic), + * use the default size. + */ +xfs_extlen_t +xfs_get_cowextsz_hint( + struct xfs_inode *ip) +{ + xfs_extlen_t a, b; + + a = 0; + if (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) + a = ip->i_d.di_cowextsize; + b = xfs_get_extsz_hint(ip); + + a = max(a, b); + if (a == 0) + return XFS_DEFAULT_COWEXTSZ_HINT; + return a; +} + +/* * These two are wrapper routines around the xfs_ilock() routine used to * centralize some grungy code. They are used in places that wish to lock the * inode solely for reading the extents. The reason these places can't just @@ -651,6 +675,8 @@ _xfs_dic2xflags( if (di_flags2 & XFS_DIFLAG2_ANY) { if (di_flags2 & XFS_DIFLAG2_DAX) flags |= FS_XFLAG_DAX; + if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE) + flags |= FS_XFLAG_COWEXTSIZE; } if (has_attr) @@ -834,6 +860,7 @@ xfs_ialloc( if (ip->i_d.di_version == 3) { inode->i_version = 1; ip->i_d.di_flags2 = 0; + ip->i_d.di_cowextsize = 0; ip->i_d.di_crtime.t_sec = (__int32_t)tv.tv_sec; ip->i_d.di_crtime.t_nsec = (__int32_t)tv.tv_nsec; } @@ -896,6 +923,15 @@ xfs_ialloc( ip->i_d.di_flags |= di_flags; ip->i_d.di_flags2 |= di_flags2; } + if (pip && + (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && + pip->i_d.di_version == 3 && + ip->i_d.di_version == 3) { + if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { + ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; + } + } /* FALLTHROUGH */ case S_IFLNK: ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; @@ -1586,6 +1622,20 @@ xfs_itruncate_extents( goto out; } + /* Remove all pending CoW reservations. */ + error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block, + last_block); + if (error) + goto out; + + /* + * Clear the reflink flag if we truncated everything. + */ + if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) { + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + xfs_inode_clear_cowblocks_tag(ip); + } + /* * Always re-log the inode so that our permanent transaction can keep * on rolling it forward in the log. @@ -1850,6 +1900,7 @@ xfs_inactive( } mp = ip->i_mount; + ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY)); /* If this is a read-only mount, don't do this (would generate I/O) */ if (mp->m_flags & XFS_MOUNT_RDONLY) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 8f30d2533b48..f14c1de2549d 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -47,6 +47,7 @@ typedef struct xfs_inode { /* Extent information. */ xfs_ifork_t *i_afp; /* attribute fork pointer */ + xfs_ifork_t *i_cowfp; /* copy on write extents */ xfs_ifork_t i_df; /* data fork */ /* operations vectors */ @@ -65,6 +66,9 @@ typedef struct xfs_inode { struct xfs_icdinode i_d; /* most of ondisk inode */ + xfs_extnum_t i_cnextents; /* # of extents in cow fork */ + unsigned int i_cformat; /* format of cow fork */ + /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ } xfs_inode_t; @@ -202,6 +206,11 @@ xfs_get_initial_prid(struct xfs_inode *dp) return XFS_PROJID_DEFAULT; } +static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) +{ + return ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; +} + /* * In-core inode flags. */ @@ -217,6 +226,12 @@ xfs_get_initial_prid(struct xfs_inode *dp) #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) #define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */ #define XFS_IEOFBLOCKS (1 << 10)/* has the preallocblocks tag set */ +/* + * If this unlinked inode is in the middle of recovery, don't let drop_inode + * truncate and free the inode. This can happen if we iget the inode during + * log recovery to replay a bmap operation on the inode. + */ +#define XFS_IRECOVERY (1 << 11) /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during @@ -411,6 +426,7 @@ int xfs_iflush(struct xfs_inode *, struct xfs_buf **); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); +xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, xfs_nlink_t, xfs_dev_t, prid_t, int, @@ -474,4 +490,7 @@ do { \ extern struct kmem_zone *xfs_inode_zone; +/* The default CoW extent size hint. */ +#define XFS_DEFAULT_COWEXTSZ_HINT 32 + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 892c2aced207..9610e9c00952 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -368,7 +368,7 @@ xfs_inode_to_log_dinode( to->di_crtime.t_sec = from->di_crtime.t_sec; to->di_crtime.t_nsec = from->di_crtime.t_nsec; to->di_flags2 = from->di_flags2; - + to->di_cowextsize = from->di_cowextsize; to->di_ino = ip->i_ino; to->di_lsn = lsn; memset(to->di_pad2, 0, sizeof(to->di_pad2)); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0d9021f0551e..c245bed3249b 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -903,6 +903,8 @@ xfs_ioc_fsgetxattr( xfs_ilock(ip, XFS_ILOCK_SHARED); fa.fsx_xflags = xfs_ip2xflags(ip); fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; + fa.fsx_cowextsize = ip->i_d.di_cowextsize << + ip->i_mount->m_sb.sb_blocklog; fa.fsx_projid = xfs_get_projid(ip); if (attr) { @@ -973,12 +975,13 @@ xfs_set_diflags( if (ip->i_d.di_version < 3) return; - di_flags2 = 0; + di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); if (xflags & FS_XFLAG_DAX) di_flags2 |= XFS_DIFLAG2_DAX; + if (xflags & FS_XFLAG_COWEXTSIZE) + di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; ip->i_d.di_flags2 = di_flags2; - } STATIC void @@ -1031,6 +1034,14 @@ xfs_ioctl_setattr_xflags( return -EINVAL; } + /* Clear reflink if we are actually able to set the rt flag. */ + if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip)) + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + + /* Don't allow us to set DAX mode for a reflinked file for now. */ + if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip)) + return -EINVAL; + /* * Can't modify an immutable/append-only file unless * we have appropriate permission. @@ -1219,6 +1230,56 @@ xfs_ioctl_setattr_check_extsize( return 0; } +/* + * CoW extent size hint validation rules are: + * + * 1. CoW extent size hint can only be set if reflink is enabled on the fs. + * The inode does not have to have any shared blocks, but it must be a v3. + * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files; + * for a directory, the hint is propagated to new files. + * 3. Can be changed on files & directories at any time. + * 4. CoW extsize hint of 0 turns off hints, clears inode flags. + * 5. Extent size must be a multiple of the appropriate block size. + * 6. The extent size hint must be limited to half the AG size to avoid + * alignment extending the extent beyond the limits of the AG. + */ +static int +xfs_ioctl_setattr_check_cowextsize( + struct xfs_inode *ip, + struct fsxattr *fa) +{ + struct xfs_mount *mp = ip->i_mount; + + if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE)) + return 0; + + if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb) || + ip->i_d.di_version != 3) + return -EINVAL; + + if (!S_ISREG(VFS_I(ip)->i_mode) && !S_ISDIR(VFS_I(ip)->i_mode)) + return -EINVAL; + + if (fa->fsx_cowextsize != 0) { + xfs_extlen_t size; + xfs_fsblock_t cowextsize_fsb; + + cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize); + if (cowextsize_fsb > MAXEXTLEN) + return -EINVAL; + + size = mp->m_sb.sb_blocksize; + if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2) + return -EINVAL; + + if (fa->fsx_cowextsize % size) + return -EINVAL; + } else + fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE; + + return 0; +} + static int xfs_ioctl_setattr_check_projid( struct xfs_inode *ip, @@ -1311,6 +1372,10 @@ xfs_ioctl_setattr( if (code) goto error_trans_cancel; + code = xfs_ioctl_setattr_check_cowextsize(ip, fa); + if (code) + goto error_trans_cancel; + code = xfs_ioctl_setattr_xflags(tp, ip, fa); if (code) goto error_trans_cancel; @@ -1346,6 +1411,12 @@ xfs_ioctl_setattr( ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; else ip->i_d.di_extsize = 0; + if (ip->i_d.di_version == 3 && + (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) + ip->i_d.di_cowextsize = fa->fsx_cowextsize >> + mp->m_sb.sb_blocklog; + else + ip->i_d.di_cowextsize = 0; code = xfs_trans_commit(tp); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index c08253e11545..d907eb9f8ef3 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -39,6 +39,7 @@ #include "xfs_quota.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" +#include "xfs_reflink.h" #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ @@ -70,7 +71,7 @@ xfs_bmbt_to_iomap( iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); } -static xfs_extlen_t +xfs_extlen_t xfs_eof_alignment( struct xfs_inode *ip, xfs_extlen_t extsize) @@ -609,7 +610,7 @@ xfs_file_iomap_begin_delay( } retry: - error = xfs_bmapi_reserve_delalloc(ip, offset_fsb, + error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, end_fsb - offset_fsb, &got, &prev, &idx, eof); switch (error) { @@ -666,6 +667,7 @@ out_unlock: int xfs_iomap_write_allocate( xfs_inode_t *ip, + int whichfork, xfs_off_t offset, xfs_bmbt_irec_t *imap) { @@ -678,8 +680,12 @@ xfs_iomap_write_allocate( xfs_trans_t *tp; int nimaps; int error = 0; + int flags = 0; int nres; + if (whichfork == XFS_COW_FORK) + flags |= XFS_BMAPI_COWFORK; + /* * Make sure that the dquots are there. */ @@ -773,7 +779,7 @@ xfs_iomap_write_allocate( * pointer that the caller gave to us. */ error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, 0, &first_block, + count_fsb, flags, &first_block, nres, imap, &nimaps, &dfops); if (error) @@ -955,14 +961,22 @@ xfs_file_iomap_begin( struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec imap; xfs_fileoff_t offset_fsb, end_fsb; + bool shared, trimmed; int nimaps = 1, error = 0; unsigned lockmode; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if ((flags & IOMAP_WRITE) && - !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { + if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { + error = xfs_reflink_reserve_cow_range(ip, offset, length); + if (error < 0) + return error; + } + + if ((flags & IOMAP_WRITE) && !IS_DAX(inode) && + !xfs_get_extsz_hint(ip)) { + /* Reserve delalloc blocks for regular writeback. */ return xfs_file_iomap_begin_delay(inode, offset, length, flags, iomap); } @@ -976,7 +990,14 @@ xfs_file_iomap_begin( end_fsb = XFS_B_TO_FSB(mp, offset + length); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, - &nimaps, XFS_BMAPI_ENTIRE); + &nimaps, 0); + if (error) { + xfs_iunlock(ip, lockmode); + return error; + } + + /* Trim the mapping to the nearest shared extent boundary. */ + error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); if (error) { xfs_iunlock(ip, lockmode); return error; @@ -1015,6 +1036,8 @@ xfs_file_iomap_begin( } xfs_bmbt_to_iomap(ip, iomap, &imap); + if (shared) + iomap->flags |= IOMAP_F_SHARED; return 0; } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 6498be485932..6d45cf01fcff 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -25,12 +25,13 @@ struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); -int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, +int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t, struct xfs_bmbt_irec *); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, struct xfs_bmbt_irec *); +xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize); extern struct iomap_ops xfs_iomap_ops; extern struct iomap_ops xfs_xattr_iomap_ops; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index c5da95eb79b8..405a65cd9d6b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1159,6 +1159,7 @@ xfs_diflags_to_iflags( inode->i_flags |= S_NOATIME; if (S_ISREG(inode->i_mode) && ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE && + !xfs_is_reflink_inode(ip) && (ip->i_mount->m_flags & XFS_MOUNT_DAX || ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) inode->i_flags |= S_DAX; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index ce73eb34620d..66e881790c17 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -66,7 +66,7 @@ xfs_bulkstat_one_int( if (!buffer || xfs_internal_inum(mp, ino)) return -EINVAL; - buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL); + buf = kmem_zalloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL); if (!buf) return -ENOMEM; @@ -111,6 +111,12 @@ xfs_bulkstat_one_int( buf->bs_aextents = dic->di_anextents; buf->bs_forkoff = XFS_IFORK_BOFF(ip); + if (dic->di_version == 3) { + if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE) + buf->bs_cowextsize = dic->di_cowextsize << + mp->m_sb.sb_blocklog; + } + switch (dic->di_format) { case XFS_DINODE_FMT_DEV: buf->bs_rdev = ip->i_df.if_u2.if_rdev; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index b8d64d520e12..68640fb63a54 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -116,6 +116,7 @@ typedef __u32 xfs_nlink_t; #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val #define xfs_fstrm_centisecs xfs_params.fstrm_timer.val #define xfs_eofb_secs xfs_params.eofb_timer.val +#define xfs_cowb_secs xfs_params.cowb_timer.val #define current_cpu() (raw_smp_processor_id()) #define current_pid() (current->pid) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 846483d56949..9b3d7c76915d 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -45,6 +45,8 @@ #include "xfs_dir2.h" #include "xfs_rmap_item.h" #include "xfs_buf_item.h" +#include "xfs_refcount_item.h" +#include "xfs_bmap_item.h" #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) @@ -1924,6 +1926,10 @@ xlog_recover_reorder_trans( case XFS_LI_EFI: case XFS_LI_RUI: case XFS_LI_RUD: + case XFS_LI_CUI: + case XFS_LI_CUD: + case XFS_LI_BUI: + case XFS_LI_BUD: trace_xfs_log_recover_item_reorder_tail(log, trans, item, pass); list_move_tail(&item->ri_list, &inode_list); @@ -2242,6 +2248,7 @@ xlog_recover_get_buf_lsn( case XFS_ABTB_MAGIC: case XFS_ABTC_MAGIC: case XFS_RMAP_CRC_MAGIC: + case XFS_REFC_CRC_MAGIC: case XFS_IBT_CRC_MAGIC: case XFS_IBT_MAGIC: { struct xfs_btree_block *btb = blk; @@ -2415,6 +2422,9 @@ xlog_recover_validate_buf_type( case XFS_RMAP_CRC_MAGIC: bp->b_ops = &xfs_rmapbt_buf_ops; break; + case XFS_REFC_CRC_MAGIC: + bp->b_ops = &xfs_refcountbt_buf_ops; + break; default: warnmsg = "Bad btree block magic!"; break; @@ -3547,6 +3557,242 @@ xlog_recover_rud_pass2( } /* + * Copy an CUI format buffer from the given buf, and into the destination + * CUI format structure. The CUI/CUD items were designed not to need any + * special alignment handling. + */ +static int +xfs_cui_copy_format( + struct xfs_log_iovec *buf, + struct xfs_cui_log_format *dst_cui_fmt) +{ + struct xfs_cui_log_format *src_cui_fmt; + uint len; + + src_cui_fmt = buf->i_addr; + len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents); + + if (buf->i_len == len) { + memcpy(dst_cui_fmt, src_cui_fmt, len); + return 0; + } + return -EFSCORRUPTED; +} + +/* + * This routine is called to create an in-core extent refcount update + * item from the cui format structure which was logged on disk. + * It allocates an in-core cui, copies the extents from the format + * structure into it, and adds the cui to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_cui_pass2( + struct xlog *log, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_cui_log_item *cuip; + struct xfs_cui_log_format *cui_formatp; + + cui_formatp = item->ri_buf[0].i_addr; + + cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); + error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format); + if (error) { + xfs_cui_item_free(cuip); + return error; + } + atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); + + spin_lock(&log->l_ailp->xa_lock); + /* + * The CUI has two references. One for the CUD and one for CUI to ensure + * it makes it into the AIL. Insert the CUI into the AIL directly and + * drop the CUI reference. Note that xfs_trans_ail_update() drops the + * AIL lock. + */ + xfs_trans_ail_update(log->l_ailp, &cuip->cui_item, lsn); + xfs_cui_release(cuip); + return 0; +} + + +/* + * This routine is called when an CUD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding CUI if it + * was still in the log. To do this it searches the AIL for the CUI with an id + * equal to that in the CUD format structure. If we find it we drop the CUD + * reference, which removes the CUI from the AIL and frees it. + */ +STATIC int +xlog_recover_cud_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_cud_log_format *cud_formatp; + struct xfs_cui_log_item *cuip = NULL; + struct xfs_log_item *lip; + __uint64_t cui_id; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp = log->l_ailp; + + cud_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) + return -EFSCORRUPTED; + cui_id = cud_formatp->cud_cui_id; + + /* + * Search for the CUI with the id in the CUD format structure in the + * AIL. + */ + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + while (lip != NULL) { + if (lip->li_type == XFS_LI_CUI) { + cuip = (struct xfs_cui_log_item *)lip; + if (cuip->cui_format.cui_id == cui_id) { + /* + * Drop the CUD reference to the CUI. This + * removes the CUI from the AIL and frees it. + */ + spin_unlock(&ailp->xa_lock); + xfs_cui_release(cuip); + spin_lock(&ailp->xa_lock); + break; + } + } + lip = xfs_trans_ail_cursor_next(ailp, &cur); + } + + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + + return 0; +} + +/* + * Copy an BUI format buffer from the given buf, and into the destination + * BUI format structure. The BUI/BUD items were designed not to need any + * special alignment handling. + */ +static int +xfs_bui_copy_format( + struct xfs_log_iovec *buf, + struct xfs_bui_log_format *dst_bui_fmt) +{ + struct xfs_bui_log_format *src_bui_fmt; + uint len; + + src_bui_fmt = buf->i_addr; + len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents); + + if (buf->i_len == len) { + memcpy(dst_bui_fmt, src_bui_fmt, len); + return 0; + } + return -EFSCORRUPTED; +} + +/* + * This routine is called to create an in-core extent bmap update + * item from the bui format structure which was logged on disk. + * It allocates an in-core bui, copies the extents from the format + * structure into it, and adds the bui to the AIL with the given + * LSN. + */ +STATIC int +xlog_recover_bui_pass2( + struct xlog *log, + struct xlog_recover_item *item, + xfs_lsn_t lsn) +{ + int error; + struct xfs_mount *mp = log->l_mp; + struct xfs_bui_log_item *buip; + struct xfs_bui_log_format *bui_formatp; + + bui_formatp = item->ri_buf[0].i_addr; + + if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) + return -EFSCORRUPTED; + buip = xfs_bui_init(mp); + error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format); + if (error) { + xfs_bui_item_free(buip); + return error; + } + atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); + + spin_lock(&log->l_ailp->xa_lock); + /* + * The RUI has two references. One for the RUD and one for RUI to ensure + * it makes it into the AIL. Insert the RUI into the AIL directly and + * drop the RUI reference. Note that xfs_trans_ail_update() drops the + * AIL lock. + */ + xfs_trans_ail_update(log->l_ailp, &buip->bui_item, lsn); + xfs_bui_release(buip); + return 0; +} + + +/* + * This routine is called when an BUD format structure is found in a committed + * transaction in the log. Its purpose is to cancel the corresponding BUI if it + * was still in the log. To do this it searches the AIL for the BUI with an id + * equal to that in the BUD format structure. If we find it we drop the BUD + * reference, which removes the BUI from the AIL and frees it. + */ +STATIC int +xlog_recover_bud_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_bud_log_format *bud_formatp; + struct xfs_bui_log_item *buip = NULL; + struct xfs_log_item *lip; + __uint64_t bui_id; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp = log->l_ailp; + + bud_formatp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) + return -EFSCORRUPTED; + bui_id = bud_formatp->bud_bui_id; + + /* + * Search for the BUI with the id in the BUD format structure in the + * AIL. + */ + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + while (lip != NULL) { + if (lip->li_type == XFS_LI_BUI) { + buip = (struct xfs_bui_log_item *)lip; + if (buip->bui_format.bui_id == bui_id) { + /* + * Drop the BUD reference to the BUI. This + * removes the BUI from the AIL and frees it. + */ + spin_unlock(&ailp->xa_lock); + xfs_bui_release(buip); + spin_lock(&ailp->xa_lock); + break; + } + } + lip = xfs_trans_ail_cursor_next(ailp, &cur); + } + + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + + return 0; +} + +/* * This routine is called when an inode create format structure is found in a * committed transaction in the log. It's purpose is to initialise the inodes * being allocated on disk. This requires us to get inode cluster buffers that @@ -3773,6 +4019,10 @@ xlog_recover_ra_pass2( case XFS_LI_QUOTAOFF: case XFS_LI_RUI: case XFS_LI_RUD: + case XFS_LI_CUI: + case XFS_LI_CUD: + case XFS_LI_BUI: + case XFS_LI_BUD: default: break; } @@ -3798,6 +4048,10 @@ xlog_recover_commit_pass1( case XFS_LI_ICREATE: case XFS_LI_RUI: case XFS_LI_RUD: + case XFS_LI_CUI: + case XFS_LI_CUD: + case XFS_LI_BUI: + case XFS_LI_BUD: /* nothing to do in pass 1 */ return 0; default: @@ -3832,6 +4086,14 @@ xlog_recover_commit_pass2( return xlog_recover_rui_pass2(log, item, trans->r_lsn); case XFS_LI_RUD: return xlog_recover_rud_pass2(log, item); + case XFS_LI_CUI: + return xlog_recover_cui_pass2(log, item, trans->r_lsn); + case XFS_LI_CUD: + return xlog_recover_cud_pass2(log, item); + case XFS_LI_BUI: + return xlog_recover_bui_pass2(log, item, trans->r_lsn); + case XFS_LI_BUD: + return xlog_recover_bud_pass2(log, item); case XFS_LI_DQUOT: return xlog_recover_dquot_pass2(log, buffer_list, item, trans->r_lsn); @@ -4419,12 +4681,94 @@ xlog_recover_cancel_rui( spin_lock(&ailp->xa_lock); } +/* Recover the CUI if necessary. */ +STATIC int +xlog_recover_process_cui( + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_cui_log_item *cuip; + int error; + + /* + * Skip CUIs that we've already processed. + */ + cuip = container_of(lip, struct xfs_cui_log_item, cui_item); + if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)) + return 0; + + spin_unlock(&ailp->xa_lock); + error = xfs_cui_recover(mp, cuip); + spin_lock(&ailp->xa_lock); + + return error; +} + +/* Release the CUI since we're cancelling everything. */ +STATIC void +xlog_recover_cancel_cui( + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_cui_log_item *cuip; + + cuip = container_of(lip, struct xfs_cui_log_item, cui_item); + + spin_unlock(&ailp->xa_lock); + xfs_cui_release(cuip); + spin_lock(&ailp->xa_lock); +} + +/* Recover the BUI if necessary. */ +STATIC int +xlog_recover_process_bui( + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_bui_log_item *buip; + int error; + + /* + * Skip BUIs that we've already processed. + */ + buip = container_of(lip, struct xfs_bui_log_item, bui_item); + if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)) + return 0; + + spin_unlock(&ailp->xa_lock); + error = xfs_bui_recover(mp, buip); + spin_lock(&ailp->xa_lock); + + return error; +} + +/* Release the BUI since we're cancelling everything. */ +STATIC void +xlog_recover_cancel_bui( + struct xfs_mount *mp, + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_bui_log_item *buip; + + buip = container_of(lip, struct xfs_bui_log_item, bui_item); + + spin_unlock(&ailp->xa_lock); + xfs_bui_release(buip); + spin_lock(&ailp->xa_lock); +} + /* Is this log item a deferred action intent? */ static inline bool xlog_item_is_intent(struct xfs_log_item *lip) { switch (lip->li_type) { case XFS_LI_EFI: case XFS_LI_RUI: + case XFS_LI_CUI: + case XFS_LI_BUI: return true; default: return false; @@ -4488,6 +4832,12 @@ xlog_recover_process_intents( case XFS_LI_RUI: error = xlog_recover_process_rui(log->l_mp, ailp, lip); break; + case XFS_LI_CUI: + error = xlog_recover_process_cui(log->l_mp, ailp, lip); + break; + case XFS_LI_BUI: + error = xlog_recover_process_bui(log->l_mp, ailp, lip); + break; } if (error) goto out; @@ -4535,6 +4885,12 @@ xlog_recover_cancel_intents( case XFS_LI_RUI: xlog_recover_cancel_rui(log->l_mp, ailp, lip); break; + case XFS_LI_CUI: + xlog_recover_cancel_cui(log->l_mp, ailp, lip); + break; + case XFS_LI_BUI: + xlog_recover_cancel_bui(log->l_mp, ailp, lip); + break; } lip = xfs_trans_ail_cursor_next(ailp, &cur); @@ -4613,6 +4969,7 @@ xlog_recover_process_one_iunlink( if (error) goto fail_iput; + xfs_iflags_clear(ip, XFS_IRECOVERY); ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(VFS_I(ip)->i_mode != 0); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 56e85a6c85c7..fc7873942bea 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -43,6 +43,8 @@ #include "xfs_icache.h" #include "xfs_sysfs.h" #include "xfs_rmap_btree.h" +#include "xfs_refcount_btree.h" +#include "xfs_reflink.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); @@ -684,6 +686,7 @@ xfs_mountfs( xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); xfs_ialloc_compute_maxlevels(mp); xfs_rmapbt_compute_maxlevels(mp); + xfs_refcountbt_compute_maxlevels(mp); xfs_set_maxicount(mp); @@ -923,6 +926,15 @@ xfs_mountfs( } /* + * During the second phase of log recovery, we need iget and + * iput to behave like they do for an active filesystem. + * xfs_fs_drop_inode needs to be able to prevent the deletion + * of inodes before we're done replaying log items on those + * inodes. + */ + mp->m_super->s_flags |= MS_ACTIVE; + + /* * Finish recovering the file system. This part needed to be delayed * until after the root and real-time bitmap inodes were consistently * read in. @@ -974,10 +986,28 @@ xfs_mountfs( if (error) xfs_warn(mp, "Unable to allocate reserve blocks. Continuing without reserve pool."); + + /* Recover any CoW blocks that never got remapped. */ + error = xfs_reflink_recover_cow(mp); + if (error) { + xfs_err(mp, + "Error %d recovering leftover CoW allocations.", error); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + goto out_quota; + } + + /* Reserve AG blocks for future btree expansion. */ + error = xfs_fs_reserve_ag_blocks(mp); + if (error && error != -ENOSPC) + goto out_agresv; } return 0; + out_agresv: + xfs_fs_unreserve_ag_blocks(mp); + out_quota: + xfs_qm_unmount_quotas(mp); out_rtunmount: xfs_rtunmount_inodes(mp); out_rele_rip: @@ -1019,7 +1049,9 @@ xfs_unmountfs( int error; cancel_delayed_work_sync(&mp->m_eofblocks_work); + cancel_delayed_work_sync(&mp->m_cowblocks_work); + xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); xfs_rtunmount_inodes(mp); IRELE(mp->m_rootip); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 041d9493e798..819b80b15bfb 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -124,10 +124,13 @@ typedef struct xfs_mount { uint m_inobt_mnr[2]; /* min inobt btree records */ uint m_rmap_mxr[2]; /* max rmap btree records */ uint m_rmap_mnr[2]; /* min rmap btree records */ + uint m_refc_mxr[2]; /* max refc btree records */ + uint m_refc_mnr[2]; /* min refc btree records */ uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ uint m_in_maxlevels; /* max inobt btree levels. */ uint m_rmap_maxlevels; /* max rmap btree levels */ + uint m_refc_maxlevels; /* max refcount btree level */ xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */ uint m_alloc_set_aside; /* space we can't use */ uint m_ag_max_usable; /* max space per AG */ @@ -161,6 +164,8 @@ typedef struct xfs_mount { struct delayed_work m_reclaim_work; /* background inode reclaim */ struct delayed_work m_eofblocks_work; /* background eof blocks trimming */ + struct delayed_work m_cowblocks_work; /* background cow blocks + trimming */ bool m_update_sb; /* sb needs update in mount */ int64_t m_low_space[XFS_LOWSP_MAX]; /* low free space thresholds */ @@ -399,6 +404,9 @@ typedef struct xfs_perag { struct xfs_ag_resv pag_meta_resv; /* Blocks reserved for just AGFL-based metadata. */ struct xfs_ag_resv pag_agfl_resv; + + /* reference count */ + __uint8_t pagf_refcount_level; } xfs_perag_t; static inline struct xfs_ag_resv * diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 69e2986a3776..0c381d71b242 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -49,6 +49,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key, 4); XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec, 16); + XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_key, 4); + XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec, 12); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key, 20); XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec, 24); XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp, 8); @@ -56,6 +58,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t, 8); XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t, 4); + XFS_CHECK_STRUCT_SIZE(xfs_refcount_ptr_t, 4); XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t, 4); /* dir/attr trees */ diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 0f14b2e4bf6c..93a7aafa56d6 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -114,6 +114,13 @@ xfs_fs_map_blocks( return -ENXIO; /* + * The pNFS block layout spec actually supports reflink like + * functionality, but the Linux pNFS server doesn't implement it yet. + */ + if (xfs_is_reflink_inode(ip)) + return -ENXIO; + + /* * Lock out any other I/O before we flush and invalidate the pagecache, * and then hand out a layout to the remote system. This is very * similar to direct I/O, except that the synchronization is much more diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c new file mode 100644 index 000000000000..fe86a668a57e --- /dev/null +++ b/fs/xfs/xfs_refcount_item.c @@ -0,0 +1,539 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_buf_item.h" +#include "xfs_refcount_item.h" +#include "xfs_log.h" +#include "xfs_refcount.h" + + +kmem_zone_t *xfs_cui_zone; +kmem_zone_t *xfs_cud_zone; + +static inline struct xfs_cui_log_item *CUI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_cui_log_item, cui_item); +} + +void +xfs_cui_item_free( + struct xfs_cui_log_item *cuip) +{ + if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS) + kmem_free(cuip); + else + kmem_zone_free(xfs_cui_zone, cuip); +} + +STATIC void +xfs_cui_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + struct xfs_cui_log_item *cuip = CUI_ITEM(lip); + + *nvecs += 1; + *nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given cui log item. We use only 1 iovec, and we point that + * at the cui_log_format structure embedded in the cui item. + * It is at this point that we assert that all of the extent + * slots in the cui item have been filled. + */ +STATIC void +xfs_cui_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_cui_log_item *cuip = CUI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + ASSERT(atomic_read(&cuip->cui_next_extent) == + cuip->cui_format.cui_nextents); + + cuip->cui_format.cui_type = XFS_LI_CUI; + cuip->cui_format.cui_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUI_FORMAT, &cuip->cui_format, + xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents)); +} + +/* + * Pinning has no meaning for an cui item, so just return. + */ +STATIC void +xfs_cui_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * The unpin operation is the last place an CUI is manipulated in the log. It is + * either inserted in the AIL or aborted in the event of a log I/O error. In + * either case, the CUI transaction has been successfully committed to make it + * this far. Therefore, we expect whoever committed the CUI to either construct + * and commit the CUD or drop the CUD's reference in the event of error. Simply + * drop the log's CUI reference now that the log is done with it. + */ +STATIC void +xfs_cui_item_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_cui_log_item *cuip = CUI_ITEM(lip); + + xfs_cui_release(cuip); +} + +/* + * CUI items have no locking or pushing. However, since CUIs are pulled from + * the AIL when their corresponding CUDs are committed to disk, their situation + * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller + * will eventually flush the log. This should help in getting the CUI out of + * the AIL. + */ +STATIC uint +xfs_cui_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +/* + * The CUI has been either committed or aborted if the transaction has been + * cancelled. If the transaction was cancelled, an CUD isn't going to be + * constructed and thus we free the CUI here directly. + */ +STATIC void +xfs_cui_item_unlock( + struct xfs_log_item *lip) +{ + if (lip->li_flags & XFS_LI_ABORTED) + xfs_cui_item_free(CUI_ITEM(lip)); +} + +/* + * The CUI is logged only once and cannot be moved in the log, so simply return + * the lsn at which it's been logged. + */ +STATIC xfs_lsn_t +xfs_cui_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + return lsn; +} + +/* + * The CUI dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_cui_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all cui log items. + */ +static const struct xfs_item_ops xfs_cui_item_ops = { + .iop_size = xfs_cui_item_size, + .iop_format = xfs_cui_item_format, + .iop_pin = xfs_cui_item_pin, + .iop_unpin = xfs_cui_item_unpin, + .iop_unlock = xfs_cui_item_unlock, + .iop_committed = xfs_cui_item_committed, + .iop_push = xfs_cui_item_push, + .iop_committing = xfs_cui_item_committing, +}; + +/* + * Allocate and initialize an cui item with the given number of extents. + */ +struct xfs_cui_log_item * +xfs_cui_init( + struct xfs_mount *mp, + uint nextents) + +{ + struct xfs_cui_log_item *cuip; + + ASSERT(nextents > 0); + if (nextents > XFS_CUI_MAX_FAST_EXTENTS) + cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), + KM_SLEEP); + else + cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP); + + xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); + cuip->cui_format.cui_nextents = nextents; + cuip->cui_format.cui_id = (uintptr_t)(void *)cuip; + atomic_set(&cuip->cui_next_extent, 0); + atomic_set(&cuip->cui_refcount, 2); + + return cuip; +} + +/* + * Freeing the CUI requires that we remove it from the AIL if it has already + * been placed there. However, the CUI may not yet have been placed in the AIL + * when called by xfs_cui_release() from CUD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the CUI. + */ +void +xfs_cui_release( + struct xfs_cui_log_item *cuip) +{ + if (atomic_dec_and_test(&cuip->cui_refcount)) { + xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); + xfs_cui_item_free(cuip); + } +} + +static inline struct xfs_cud_log_item *CUD_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_cud_log_item, cud_item); +} + +STATIC void +xfs_cud_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_cud_log_format); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given cud log item. We use only 1 iovec, and we point that + * at the cud_log_format structure embedded in the cud item. + * It is at this point that we assert that all of the extent + * slots in the cud item have been filled. + */ +STATIC void +xfs_cud_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_cud_log_item *cudp = CUD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + cudp->cud_format.cud_type = XFS_LI_CUD; + cudp->cud_format.cud_size = 1; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUD_FORMAT, &cudp->cud_format, + sizeof(struct xfs_cud_log_format)); +} + +/* + * Pinning has no meaning for an cud item, so just return. + */ +STATIC void +xfs_cud_item_pin( + struct xfs_log_item *lip) +{ +} + +/* + * Since pinning has no meaning for an cud item, unpinning does + * not either. + */ +STATIC void +xfs_cud_item_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +/* + * There isn't much you can do to push on an cud item. It is simply stuck + * waiting for the log to be flushed to disk. + */ +STATIC uint +xfs_cud_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_PINNED; +} + +/* + * The CUD is either committed or aborted if the transaction is cancelled. If + * the transaction is cancelled, drop our reference to the CUI and free the + * CUD. + */ +STATIC void +xfs_cud_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_cud_log_item *cudp = CUD_ITEM(lip); + + if (lip->li_flags & XFS_LI_ABORTED) { + xfs_cui_release(cudp->cud_cuip); + kmem_zone_free(xfs_cud_zone, cudp); + } +} + +/* + * When the cud item is committed to disk, all we need to do is delete our + * reference to our partner cui item and then free ourselves. Since we're + * freeing ourselves we must return -1 to keep the transaction code from + * further referencing this item. + */ +STATIC xfs_lsn_t +xfs_cud_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_cud_log_item *cudp = CUD_ITEM(lip); + + /* + * Drop the CUI reference regardless of whether the CUD has been + * aborted. Once the CUD transaction is constructed, it is the sole + * responsibility of the CUD to release the CUI (even if the CUI is + * aborted due to log I/O error). + */ + xfs_cui_release(cudp->cud_cuip); + kmem_zone_free(xfs_cud_zone, cudp); + + return (xfs_lsn_t)-1; +} + +/* + * The CUD dependency tracking op doesn't do squat. It can't because + * it doesn't know where the free extent is coming from. The dependency + * tracking has to be handled by the "enclosing" metadata object. For + * example, for inodes, the inode is locked throughout the extent freeing + * so the dependency should be recorded there. + */ +STATIC void +xfs_cud_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all cud log items. + */ +static const struct xfs_item_ops xfs_cud_item_ops = { + .iop_size = xfs_cud_item_size, + .iop_format = xfs_cud_item_format, + .iop_pin = xfs_cud_item_pin, + .iop_unpin = xfs_cud_item_unpin, + .iop_unlock = xfs_cud_item_unlock, + .iop_committed = xfs_cud_item_committed, + .iop_push = xfs_cud_item_push, + .iop_committing = xfs_cud_item_committing, +}; + +/* + * Allocate and initialize an cud item with the given number of extents. + */ +struct xfs_cud_log_item * +xfs_cud_init( + struct xfs_mount *mp, + struct xfs_cui_log_item *cuip) + +{ + struct xfs_cud_log_item *cudp; + + cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP); + xfs_log_item_init(mp, &cudp->cud_item, XFS_LI_CUD, &xfs_cud_item_ops); + cudp->cud_cuip = cuip; + cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; + + return cudp; +} + +/* + * Process a refcount update intent item that was recovered from the log. + * We need to update the refcountbt. + */ +int +xfs_cui_recover( + struct xfs_mount *mp, + struct xfs_cui_log_item *cuip) +{ + int i; + int error = 0; + unsigned int refc_type; + struct xfs_phys_extent *refc; + xfs_fsblock_t startblock_fsb; + bool op_ok; + struct xfs_cud_log_item *cudp; + struct xfs_trans *tp; + struct xfs_btree_cur *rcur = NULL; + enum xfs_refcount_intent_type type; + xfs_fsblock_t firstfsb; + xfs_fsblock_t new_fsb; + xfs_extlen_t new_len; + struct xfs_bmbt_irec irec; + struct xfs_defer_ops dfops; + bool requeue_only = false; + + ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)); + + /* + * First check the validity of the extents described by the + * CUI. If any are bad, then assume that all are bad and + * just toss the CUI. + */ + for (i = 0; i < cuip->cui_format.cui_nextents; i++) { + refc = &cuip->cui_format.cui_extents[i]; + startblock_fsb = XFS_BB_TO_FSB(mp, + XFS_FSB_TO_DADDR(mp, refc->pe_startblock)); + switch (refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) { + case XFS_REFCOUNT_INCREASE: + case XFS_REFCOUNT_DECREASE: + case XFS_REFCOUNT_ALLOC_COW: + case XFS_REFCOUNT_FREE_COW: + op_ok = true; + break; + default: + op_ok = false; + break; + } + if (!op_ok || startblock_fsb == 0 || + refc->pe_len == 0 || + startblock_fsb >= mp->m_sb.sb_dblocks || + refc->pe_len >= mp->m_sb.sb_agblocks || + (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)) { + /* + * This will pull the CUI from the AIL and + * free the memory associated with it. + */ + set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); + xfs_cui_release(cuip); + return -EIO; + } + } + + /* + * Under normal operation, refcount updates are deferred, so we + * wouldn't be adding them directly to a transaction. All + * refcount updates manage reservation usage internally and + * dynamically by deferring work that won't fit in the + * transaction. Normally, any work that needs to be deferred + * gets attached to the same defer_ops that scheduled the + * refcount update. However, we're in log recovery here, so we + * we create our own defer_ops and use that to finish up any + * work that doesn't fit. + */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + if (error) + return error; + cudp = xfs_trans_get_cud(tp, cuip); + + xfs_defer_init(&dfops, &firstfsb); + for (i = 0; i < cuip->cui_format.cui_nextents; i++) { + refc = &cuip->cui_format.cui_extents[i]; + refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; + switch (refc_type) { + case XFS_REFCOUNT_INCREASE: + case XFS_REFCOUNT_DECREASE: + case XFS_REFCOUNT_ALLOC_COW: + case XFS_REFCOUNT_FREE_COW: + type = refc_type; + break; + default: + error = -EFSCORRUPTED; + goto abort_error; + } + if (requeue_only) { + new_fsb = refc->pe_startblock; + new_len = refc->pe_len; + } else + error = xfs_trans_log_finish_refcount_update(tp, cudp, + &dfops, type, refc->pe_startblock, refc->pe_len, + &new_fsb, &new_len, &rcur); + if (error) + goto abort_error; + + /* Requeue what we didn't finish. */ + if (new_len > 0) { + irec.br_startblock = new_fsb; + irec.br_blockcount = new_len; + switch (type) { + case XFS_REFCOUNT_INCREASE: + error = xfs_refcount_increase_extent( + tp->t_mountp, &dfops, &irec); + break; + case XFS_REFCOUNT_DECREASE: + error = xfs_refcount_decrease_extent( + tp->t_mountp, &dfops, &irec); + break; + case XFS_REFCOUNT_ALLOC_COW: + error = xfs_refcount_alloc_cow_extent( + tp->t_mountp, &dfops, + irec.br_startblock, + irec.br_blockcount); + break; + case XFS_REFCOUNT_FREE_COW: + error = xfs_refcount_free_cow_extent( + tp->t_mountp, &dfops, + irec.br_startblock, + irec.br_blockcount); + break; + default: + ASSERT(0); + } + if (error) + goto abort_error; + requeue_only = true; + } + } + + xfs_refcount_finish_one_cleanup(tp, rcur, error); + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) + goto abort_error; + set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); + error = xfs_trans_commit(tp); + return error; + +abort_error: + xfs_refcount_finish_one_cleanup(tp, rcur, error); + xfs_defer_cancel(&dfops); + xfs_trans_cancel(tp); + return error; +} diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h new file mode 100644 index 000000000000..5b74dddfa64b --- /dev/null +++ b/fs/xfs/xfs_refcount_item.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_REFCOUNT_ITEM_H__ +#define __XFS_REFCOUNT_ITEM_H__ + +/* + * There are (currently) two pairs of refcount btree redo item types: + * increase and decrease. The log items for these are CUI (refcount + * update intent) and CUD (refcount update done). The redo item type + * is encoded in the flags field of each xfs_map_extent. + * + * *I items should be recorded in the *first* of a series of rolled + * transactions, and the *D items should be recorded in the same + * transaction that records the associated refcountbt updates. + * + * Should the system crash after the commit of the first transaction + * but before the commit of the final transaction in a series, log + * recovery will use the redo information recorded by the intent items + * to replay the refcountbt metadata updates. + */ + +/* kernel only CUI/CUD definitions */ + +struct xfs_mount; +struct kmem_zone; + +/* + * Max number of extents in fast allocation path. + */ +#define XFS_CUI_MAX_FAST_EXTENTS 16 + +/* + * Define CUI flag bits. Manipulated by set/clear/test_bit operators. + */ +#define XFS_CUI_RECOVERED 1 + +/* + * This is the "refcount update intent" log item. It is used to log + * the fact that some reverse mappings need to change. It is used in + * conjunction with the "refcount update done" log item described + * below. + * + * These log items follow the same rules as struct xfs_efi_log_item; + * see the comments about that structure (in xfs_extfree_item.h) for + * more details. + */ +struct xfs_cui_log_item { + struct xfs_log_item cui_item; + atomic_t cui_refcount; + atomic_t cui_next_extent; + unsigned long cui_flags; /* misc flags */ + struct xfs_cui_log_format cui_format; +}; + +static inline size_t +xfs_cui_log_item_sizeof( + unsigned int nr) +{ + return offsetof(struct xfs_cui_log_item, cui_format) + + xfs_cui_log_format_sizeof(nr); +} + +/* + * This is the "refcount update done" log item. It is used to log the + * fact that some refcountbt updates mentioned in an earlier cui item + * have been performed. + */ +struct xfs_cud_log_item { + struct xfs_log_item cud_item; + struct xfs_cui_log_item *cud_cuip; + struct xfs_cud_log_format cud_format; +}; + +extern struct kmem_zone *xfs_cui_zone; +extern struct kmem_zone *xfs_cud_zone; + +struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint); +struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *, + struct xfs_cui_log_item *); +void xfs_cui_item_free(struct xfs_cui_log_item *); +void xfs_cui_release(struct xfs_cui_log_item *); +int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip); + +#endif /* __XFS_REFCOUNT_ITEM_H__ */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c new file mode 100644 index 000000000000..5965e9455d91 --- /dev/null +++ b/fs/xfs/xfs_reflink.c @@ -0,0 +1,1688 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_ioctl.h" +#include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_icache.h" +#include "xfs_pnfs.h" +#include "xfs_btree.h" +#include "xfs_refcount_btree.h" +#include "xfs_refcount.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_bit.h" +#include "xfs_alloc.h" +#include "xfs_quota_defs.h" +#include "xfs_quota.h" +#include "xfs_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_reflink.h" +#include "xfs_iomap.h" +#include "xfs_rmap_btree.h" +#include "xfs_sb.h" +#include "xfs_ag_resv.h" + +/* + * Copy on Write of Shared Blocks + * + * XFS must preserve "the usual" file semantics even when two files share + * the same physical blocks. This means that a write to one file must not + * alter the blocks in a different file; the way that we'll do that is + * through the use of a copy-on-write mechanism. At a high level, that + * means that when we want to write to a shared block, we allocate a new + * block, write the data to the new block, and if that succeeds we map the + * new block into the file. + * + * XFS provides a "delayed allocation" mechanism that defers the allocation + * of disk blocks to dirty-but-not-yet-mapped file blocks as long as + * possible. This reduces fragmentation by enabling the filesystem to ask + * for bigger chunks less often, which is exactly what we want for CoW. + * + * The delalloc mechanism begins when the kernel wants to make a block + * writable (write_begin or page_mkwrite). If the offset is not mapped, we + * create a delalloc mapping, which is a regular in-core extent, but without + * a real startblock. (For delalloc mappings, the startblock encodes both + * a flag that this is a delalloc mapping, and a worst-case estimate of how + * many blocks might be required to put the mapping into the BMBT.) delalloc + * mappings are a reservation against the free space in the filesystem; + * adjacent mappings can also be combined into fewer larger mappings. + * + * When dirty pages are being written out (typically in writepage), the + * delalloc reservations are converted into real mappings by allocating + * blocks and replacing the delalloc mapping with real ones. A delalloc + * mapping can be replaced by several real ones if the free space is + * fragmented. + * + * We want to adapt the delalloc mechanism for copy-on-write, since the + * write paths are similar. The first two steps (creating the reservation + * and allocating the blocks) are exactly the same as delalloc except that + * the mappings must be stored in a separate CoW fork because we do not want + * to disturb the mapping in the data fork until we're sure that the write + * succeeded. IO completion in this case is the process of removing the old + * mapping from the data fork and moving the new mapping from the CoW fork to + * the data fork. This will be discussed shortly. + * + * For now, unaligned directio writes will be bounced back to the page cache. + * Block-aligned directio writes will use the same mechanism as buffered + * writes. + * + * CoW remapping must be done after the data block write completes, + * because we don't want to destroy the old data fork map until we're sure + * the new block has been written. Since the new mappings are kept in a + * separate fork, we can simply iterate these mappings to find the ones + * that cover the file blocks that we just CoW'd. For each extent, simply + * unmap the corresponding range in the data fork, map the new range into + * the data fork, and remove the extent from the CoW fork. + * + * Since the remapping operation can be applied to an arbitrary file + * range, we record the need for the remap step as a flag in the ioend + * instead of declaring a new IO type. This is required for direct io + * because we only have ioend for the whole dio, and we have to be able to + * remember the presence of unwritten blocks and CoW blocks with a single + * ioend structure. Better yet, the more ground we can cover with one + * ioend, the better. + */ + +/* + * Given an AG extent, find the lowest-numbered run of shared blocks + * within that range and return the range in fbno/flen. If + * find_end_of_shared is true, return the longest contiguous extent of + * shared blocks. If there are no shared extents, fbno and flen will + * be set to NULLAGBLOCK and 0, respectively. + */ +int +xfs_reflink_find_shared( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + xfs_extlen_t aglen, + xfs_agblock_t *fbno, + xfs_extlen_t *flen, + bool find_end_of_shared) +{ + struct xfs_buf *agbp; + struct xfs_btree_cur *cur; + int error; + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error) + return error; + + cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); + + error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, + find_end_of_shared); + + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + + xfs_buf_relse(agbp); + return error; +} + +/* + * Trim the mapping to the next block where there's a change in the + * shared/unshared status. More specifically, this means that we + * find the lowest-numbered extent of shared blocks that coincides with + * the given block mapping. If the shared extent overlaps the start of + * the mapping, trim the mapping to the end of the shared extent. If + * the shared region intersects the mapping, trim the mapping to the + * start of the shared extent. If there are no shared regions that + * overlap, just return the original extent. + */ +int +xfs_reflink_trim_around_shared( + struct xfs_inode *ip, + struct xfs_bmbt_irec *irec, + bool *shared, + bool *trimmed) +{ + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_extlen_t aglen; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int error = 0; + + /* Holes, unwritten, and delalloc extents cannot be shared */ + if (!xfs_is_reflink_inode(ip) || + ISUNWRITTEN(irec) || + irec->br_startblock == HOLESTARTBLOCK || + irec->br_startblock == DELAYSTARTBLOCK) { + *shared = false; + return 0; + } + + trace_xfs_reflink_trim_around_shared(ip, irec); + + agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock); + agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock); + aglen = irec->br_blockcount; + + error = xfs_reflink_find_shared(ip->i_mount, agno, agbno, + aglen, &fbno, &flen, true); + if (error) + return error; + + *shared = *trimmed = false; + if (fbno == NULLAGBLOCK) { + /* No shared blocks at all. */ + return 0; + } else if (fbno == agbno) { + /* + * The start of this extent is shared. Truncate the + * mapping at the end of the shared region so that a + * subsequent iteration starts at the start of the + * unshared region. + */ + irec->br_blockcount = flen; + *shared = true; + if (flen != aglen) + *trimmed = true; + return 0; + } else { + /* + * There's a shared extent midway through this extent. + * Truncate the mapping at the start of the shared + * extent so that a subsequent iteration starts at the + * start of the shared region. + */ + irec->br_blockcount = fbno - agbno; + *trimmed = true; + return 0; + } +} + +/* Create a CoW reservation for a range of blocks within a file. */ +static int +__xfs_reflink_reserve_cow( + struct xfs_inode *ip, + xfs_fileoff_t *offset_fsb, + xfs_fileoff_t end_fsb, + bool *skipped) +{ + struct xfs_bmbt_irec got, prev, imap; + xfs_fileoff_t orig_end_fsb; + int nimaps, eof = 0, error = 0; + bool shared = false, trimmed = false; + xfs_extnum_t idx; + xfs_extlen_t align; + + /* Already reserved? Skip the refcount btree access. */ + xfs_bmap_search_extents(ip, *offset_fsb, XFS_COW_FORK, &eof, &idx, + &got, &prev); + if (!eof && got.br_startoff <= *offset_fsb) { + end_fsb = orig_end_fsb = got.br_startoff + got.br_blockcount; + trace_xfs_reflink_cow_found(ip, &got); + goto done; + } + + /* Read extent from the source file. */ + nimaps = 1; + error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, + &imap, &nimaps, 0); + if (error) + goto out_unlock; + ASSERT(nimaps == 1); + + /* Trim the mapping to the nearest shared extent boundary. */ + error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); + if (error) + goto out_unlock; + + end_fsb = orig_end_fsb = imap.br_startoff + imap.br_blockcount; + + /* Not shared? Just report the (potentially capped) extent. */ + if (!shared) { + *skipped = true; + goto done; + } + + /* + * Fork all the shared blocks from our write offset until the end of + * the extent. + */ + error = xfs_qm_dqattach_locked(ip, 0); + if (error) + goto out_unlock; + + align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip)); + if (align) + end_fsb = roundup_64(end_fsb, align); + +retry: + error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, *offset_fsb, + end_fsb - *offset_fsb, &got, + &prev, &idx, eof); + switch (error) { + case 0: + break; + case -ENOSPC: + case -EDQUOT: + /* retry without any preallocation */ + trace_xfs_reflink_cow_enospc(ip, &imap); + if (end_fsb != orig_end_fsb) { + end_fsb = orig_end_fsb; + goto retry; + } + /*FALLTHRU*/ + default: + goto out_unlock; + } + + if (end_fsb != orig_end_fsb) + xfs_inode_set_cowblocks_tag(ip); + + trace_xfs_reflink_cow_alloc(ip, &got); +done: + *offset_fsb = end_fsb; +out_unlock: + return error; +} + +/* Create a CoW reservation for part of a file. */ +int +xfs_reflink_reserve_cow_range( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb, end_fsb; + bool skipped = false; + int error; + + trace_xfs_reflink_reserve_cow_range(ip, offset, count); + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + end_fsb = XFS_B_TO_FSB(mp, offset + count); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + while (offset_fsb < end_fsb) { + error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb, + &skipped); + if (error) { + trace_xfs_reflink_reserve_cow_range_error(ip, error, + _RET_IP_); + break; + } + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + return error; +} + +/* Allocate all CoW reservations covering a range of blocks in a file. */ +static int +__xfs_reflink_allocate_cow( + struct xfs_inode *ip, + xfs_fileoff_t *offset_fsb, + xfs_fileoff_t end_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + struct xfs_defer_ops dfops; + struct xfs_trans *tp; + xfs_fsblock_t first_block; + xfs_fileoff_t next_fsb; + int nimaps = 1, error; + bool skipped = false; + + xfs_defer_init(&dfops, &first_block); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + next_fsb = *offset_fsb; + error = __xfs_reflink_reserve_cow(ip, &next_fsb, end_fsb, &skipped); + if (error) + goto out_trans_cancel; + + if (skipped) { + *offset_fsb = next_fsb; + goto out_trans_cancel; + } + + xfs_trans_ijoin(tp, ip, 0); + error = xfs_bmapi_write(tp, ip, *offset_fsb, next_fsb - *offset_fsb, + XFS_BMAPI_COWFORK, &first_block, + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), + &imap, &nimaps, &dfops); + if (error) + goto out_trans_cancel; + + /* We might not have been able to map the whole delalloc extent */ + *offset_fsb = min(*offset_fsb + imap.br_blockcount, next_fsb); + + error = xfs_defer_finish(&tp, &dfops, NULL); + if (error) + goto out_trans_cancel; + + error = xfs_trans_commit(tp); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +out_trans_cancel: + xfs_defer_cancel(&dfops); + xfs_trans_cancel(tp); + goto out_unlock; +} + +/* Allocate all CoW reservations covering a part of a file. */ +int +xfs_reflink_allocate_cow_range( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); + int error; + + ASSERT(xfs_is_reflink_inode(ip)); + + trace_xfs_reflink_allocate_cow_range(ip, offset, count); + + /* + * Make sure that the dquots are there. + */ + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + while (offset_fsb < end_fsb) { + error = __xfs_reflink_allocate_cow(ip, &offset_fsb, end_fsb); + if (error) { + trace_xfs_reflink_allocate_cow_range_error(ip, error, + _RET_IP_); + break; + } + } + + return error; +} + +/* + * Find the CoW reservation (and whether or not it needs block allocation) + * for a given byte offset of a file. + */ +bool +xfs_reflink_find_cow_mapping( + struct xfs_inode *ip, + xfs_off_t offset, + struct xfs_bmbt_irec *imap, + bool *need_alloc) +{ + struct xfs_bmbt_irec irec; + struct xfs_ifork *ifp; + struct xfs_bmbt_rec_host *gotp; + xfs_fileoff_t bno; + xfs_extnum_t idx; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); + ASSERT(xfs_is_reflink_inode(ip)); + + /* Find the extent in the CoW fork. */ + ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + bno = XFS_B_TO_FSBT(ip->i_mount, offset); + gotp = xfs_iext_bno_to_ext(ifp, bno, &idx); + if (!gotp) + return false; + + xfs_bmbt_get_all(gotp, &irec); + if (bno >= irec.br_startoff + irec.br_blockcount || + bno < irec.br_startoff) + return false; + + trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE, + &irec); + + /* If it's still delalloc, we must allocate later. */ + *imap = irec; + *need_alloc = !!(isnullstartblock(irec.br_startblock)); + + return true; +} + +/* + * Trim an extent to end at the next CoW reservation past offset_fsb. + */ +int +xfs_reflink_trim_irec_to_next_cow( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + struct xfs_bmbt_irec *imap) +{ + struct xfs_bmbt_irec irec; + struct xfs_ifork *ifp; + struct xfs_bmbt_rec_host *gotp; + xfs_extnum_t idx; + + if (!xfs_is_reflink_inode(ip)) + return 0; + + /* Find the extent in the CoW fork. */ + ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx); + if (!gotp) + return 0; + xfs_bmbt_get_all(gotp, &irec); + + /* This is the extent before; try sliding up one. */ + if (irec.br_startoff < offset_fsb) { + idx++; + if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + return 0; + gotp = xfs_iext_get_ext(ifp, idx); + xfs_bmbt_get_all(gotp, &irec); + } + + if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount) + return 0; + + imap->br_blockcount = irec.br_startoff - imap->br_startoff; + trace_xfs_reflink_trim_irec(ip, imap); + + return 0; +} + +/* + * Cancel all pending CoW reservations for some block range of an inode. + */ +int +xfs_reflink_cancel_cow_blocks( + struct xfs_inode *ip, + struct xfs_trans **tpp, + xfs_fileoff_t offset_fsb, + xfs_fileoff_t end_fsb) +{ + struct xfs_bmbt_irec irec; + xfs_filblks_t count_fsb; + xfs_fsblock_t firstfsb; + struct xfs_defer_ops dfops; + int error = 0; + int nimaps; + + if (!xfs_is_reflink_inode(ip)) + return 0; + + /* Go find the old extent in the CoW fork. */ + while (offset_fsb < end_fsb) { + nimaps = 1; + count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); + error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &irec, + &nimaps, XFS_BMAPI_COWFORK); + if (error) + break; + ASSERT(nimaps == 1); + + trace_xfs_reflink_cancel_cow(ip, &irec); + + if (irec.br_startblock == DELAYSTARTBLOCK) { + /* Free a delayed allocation. */ + xfs_mod_fdblocks(ip->i_mount, irec.br_blockcount, + false); + ip->i_delayed_blks -= irec.br_blockcount; + + /* Remove the mapping from the CoW fork. */ + error = xfs_bunmapi_cow(ip, &irec); + if (error) + break; + } else if (irec.br_startblock == HOLESTARTBLOCK) { + /* empty */ + } else { + xfs_trans_ijoin(*tpp, ip, 0); + xfs_defer_init(&dfops, &firstfsb); + + /* Free the CoW orphan record. */ + error = xfs_refcount_free_cow_extent(ip->i_mount, + &dfops, irec.br_startblock, + irec.br_blockcount); + if (error) + break; + + xfs_bmap_add_free(ip->i_mount, &dfops, + irec.br_startblock, irec.br_blockcount, + NULL); + + /* Update quota accounting */ + xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT, + -(long)irec.br_blockcount); + + /* Roll the transaction */ + error = xfs_defer_finish(tpp, &dfops, ip); + if (error) { + xfs_defer_cancel(&dfops); + break; + } + + /* Remove the mapping from the CoW fork. */ + error = xfs_bunmapi_cow(ip, &irec); + if (error) + break; + } + + /* Roll on... */ + offset_fsb = irec.br_startoff + irec.br_blockcount; + } + + return error; +} + +/* + * Cancel all pending CoW reservations for some byte range of an inode. + */ +int +xfs_reflink_cancel_cow_range( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + struct xfs_trans *tp; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t end_fsb; + int error; + + trace_xfs_reflink_cancel_cow_range(ip, offset, count); + ASSERT(xfs_is_reflink_inode(ip)); + + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + if (count == NULLFILEOFF) + end_fsb = NULLFILEOFF; + else + end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); + + /* Start a rolling transaction to remove the mappings */ + error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, + 0, 0, 0, &tp); + if (error) + goto out; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* Scrape out the old CoW reservations */ + error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb); + if (error) + goto out_cancel; + + error = xfs_trans_commit(tp); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +out_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: + trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_); + return error; +} + +/* + * Remap parts of a file's data fork after a successful CoW. + */ +int +xfs_reflink_end_cow( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + struct xfs_bmbt_irec irec; + struct xfs_bmbt_irec uirec; + struct xfs_trans *tp; + xfs_fileoff_t offset_fsb; + xfs_fileoff_t end_fsb; + xfs_filblks_t count_fsb; + xfs_fsblock_t firstfsb; + struct xfs_defer_ops dfops; + int error; + unsigned int resblks; + xfs_filblks_t ilen; + xfs_filblks_t rlen; + int nimaps; + + trace_xfs_reflink_end_cow(ip, offset, count); + + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); + count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); + + /* Start a rolling transaction to switch the mappings */ + resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, + resblks, 0, 0, &tp); + if (error) + goto out; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* Go find the old extent in the CoW fork. */ + while (offset_fsb < end_fsb) { + /* Read extent from the source file */ + nimaps = 1; + count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb); + error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &irec, + &nimaps, XFS_BMAPI_COWFORK); + if (error) + goto out_cancel; + ASSERT(nimaps == 1); + + ASSERT(irec.br_startblock != DELAYSTARTBLOCK); + trace_xfs_reflink_cow_remap(ip, &irec); + + /* + * We can have a hole in the CoW fork if part of a directio + * write is CoW but part of it isn't. + */ + rlen = ilen = irec.br_blockcount; + if (irec.br_startblock == HOLESTARTBLOCK) + goto next_extent; + + /* Unmap the old blocks in the data fork. */ + while (rlen) { + xfs_defer_init(&dfops, &firstfsb); + error = __xfs_bunmapi(tp, ip, irec.br_startoff, + &rlen, 0, 1, &firstfsb, &dfops); + if (error) + goto out_defer; + + /* + * Trim the extent to whatever got unmapped. + * Remember, bunmapi works backwards. + */ + uirec.br_startblock = irec.br_startblock + rlen; + uirec.br_startoff = irec.br_startoff + rlen; + uirec.br_blockcount = irec.br_blockcount - rlen; + irec.br_blockcount = rlen; + trace_xfs_reflink_cow_remap_piece(ip, &uirec); + + /* Free the CoW orphan record. */ + error = xfs_refcount_free_cow_extent(tp->t_mountp, + &dfops, uirec.br_startblock, + uirec.br_blockcount); + if (error) + goto out_defer; + + /* Map the new blocks into the data fork. */ + error = xfs_bmap_map_extent(tp->t_mountp, &dfops, + ip, &uirec); + if (error) + goto out_defer; + + /* Remove the mapping from the CoW fork. */ + error = xfs_bunmapi_cow(ip, &uirec); + if (error) + goto out_defer; + + error = xfs_defer_finish(&tp, &dfops, ip); + if (error) + goto out_defer; + } + +next_extent: + /* Roll on... */ + offset_fsb = irec.br_startoff + ilen; + } + + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + goto out; + return 0; + +out_defer: + xfs_defer_cancel(&dfops); +out_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: + trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); + return error; +} + +/* + * Free leftover CoW reservations that didn't get cleaned out. + */ +int +xfs_reflink_recover_cow( + struct xfs_mount *mp) +{ + xfs_agnumber_t agno; + int error = 0; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + error = xfs_refcount_recover_cow_leftovers(mp, agno); + if (error) + break; + } + + return error; +} + +/* + * Reflinking (Block) Ranges of Two Files Together + * + * First, ensure that the reflink flag is set on both inodes. The flag is an + * optimization to avoid unnecessary refcount btree lookups in the write path. + * + * Now we can iteratively remap the range of extents (and holes) in src to the + * corresponding ranges in dest. Let drange and srange denote the ranges of + * logical blocks in dest and src touched by the reflink operation. + * + * While the length of drange is greater than zero, + * - Read src's bmbt at the start of srange ("imap") + * - If imap doesn't exist, make imap appear to start at the end of srange + * with zero length. + * - If imap starts before srange, advance imap to start at srange. + * - If imap goes beyond srange, truncate imap to end at the end of srange. + * - Punch (imap start - srange start + imap len) blocks from dest at + * offset (drange start). + * - If imap points to a real range of pblks, + * > Increase the refcount of the imap's pblks + * > Map imap's pblks into dest at the offset + * (drange start + imap start - srange start) + * - Advance drange and srange by (imap start - srange start + imap len) + * + * Finally, if the reflink made dest longer, update both the in-core and + * on-disk file sizes. + * + * ASCII Art Demonstration: + * + * Let's say we want to reflink this source file: + * + * ----SSSSSSS-SSSSS----SSSSSS (src file) + * <--------------------> + * + * into this destination file: + * + * --DDDDDDDDDDDDDDDDDDD--DDD (dest file) + * <--------------------> + * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest. + * Observe that the range has different logical offsets in either file. + * + * Consider that the first extent in the source file doesn't line up with our + * reflink range. Unmapping and remapping are separate operations, so we can + * unmap more blocks from the destination file than we remap. + * + * ----SSSSSSS-SSSSS----SSSSSS + * <-------> + * --DDDDD---------DDDDD--DDD + * <-------> + * + * Now remap the source extent into the destination file: + * + * ----SSSSSSS-SSSSS----SSSSSS + * <-------> + * --DDDDD--SSSSSSSDDDDD--DDD + * <-------> + * + * Do likewise with the second hole and extent in our range. Holes in the + * unmap range don't affect our operation. + * + * ----SSSSSSS-SSSSS----SSSSSS + * <----> + * --DDDDD--SSSSSSS-SSSSS-DDD + * <----> + * + * Finally, unmap and remap part of the third extent. This will increase the + * size of the destination file. + * + * ----SSSSSSS-SSSSS----SSSSSS + * <-----> + * --DDDDD--SSSSSSS-SSSSS----SSS + * <-----> + * + * Once we update the destination file's i_size, we're done. + */ + +/* + * Ensure the reflink bit is set in both inodes. + */ +STATIC int +xfs_reflink_set_inode_flag( + struct xfs_inode *src, + struct xfs_inode *dest) +{ + struct xfs_mount *mp = src->i_mount; + int error; + struct xfs_trans *tp; + + if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest)) + return 0; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) + goto out_error; + + /* Lock both files against IO */ + if (src->i_ino == dest->i_ino) + xfs_ilock(src, XFS_ILOCK_EXCL); + else + xfs_lock_two_inodes(src, dest, XFS_ILOCK_EXCL); + + if (!xfs_is_reflink_inode(src)) { + trace_xfs_reflink_set_inode_flag(src); + xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL); + src->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; + xfs_trans_log_inode(tp, src, XFS_ILOG_CORE); + xfs_ifork_init_cow(src); + } else + xfs_iunlock(src, XFS_ILOCK_EXCL); + + if (src->i_ino == dest->i_ino) + goto commit_flags; + + if (!xfs_is_reflink_inode(dest)) { + trace_xfs_reflink_set_inode_flag(dest); + xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); + dest->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK; + xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); + xfs_ifork_init_cow(dest); + } else + xfs_iunlock(dest, XFS_ILOCK_EXCL); + +commit_flags: + error = xfs_trans_commit(tp); + if (error) + goto out_error; + return error; + +out_error: + trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_); + return error; +} + +/* + * Update destination inode size & cowextsize hint, if necessary. + */ +STATIC int +xfs_reflink_update_dest( + struct xfs_inode *dest, + xfs_off_t newlen, + xfs_extlen_t cowextsize) +{ + struct xfs_mount *mp = dest->i_mount; + struct xfs_trans *tp; + int error; + + if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) + return 0; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); + if (error) + goto out_error; + + xfs_ilock(dest, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL); + + if (newlen > i_size_read(VFS_I(dest))) { + trace_xfs_reflink_update_inode_size(dest, newlen); + i_size_write(VFS_I(dest), newlen); + dest->i_d.di_size = newlen; + } + + if (cowextsize) { + dest->i_d.di_cowextsize = cowextsize; + dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + } + + xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); + + error = xfs_trans_commit(tp); + if (error) + goto out_error; + return error; + +out_error: + trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_); + return error; +} + +/* + * Do we have enough reserve in this AG to handle a reflink? The refcount + * btree already reserved all the space it needs, but the rmap btree can grow + * infinitely, so we won't allow more reflinks when the AG is down to the + * btree reserves. + */ +static int +xfs_reflink_ag_has_free_space( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int error = 0; + + if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + return 0; + + pag = xfs_perag_get(mp, agno); + if (xfs_ag_resv_critical(pag, XFS_AG_RESV_AGFL) || + xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA)) + error = -ENOSPC; + xfs_perag_put(pag); + return error; +} + +/* + * Unmap a range of blocks from a file, then map other blocks into the hole. + * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount). + * The extent irec is mapped into dest at irec->br_startoff. + */ +STATIC int +xfs_reflink_remap_extent( + struct xfs_inode *ip, + struct xfs_bmbt_irec *irec, + xfs_fileoff_t destoff, + xfs_off_t new_isize) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + xfs_fsblock_t firstfsb; + unsigned int resblks; + struct xfs_defer_ops dfops; + struct xfs_bmbt_irec uirec; + bool real_extent; + xfs_filblks_t rlen; + xfs_filblks_t unmap_len; + xfs_off_t newlen; + int error; + + unmap_len = irec->br_startoff + irec->br_blockcount - destoff; + trace_xfs_reflink_punch_range(ip, destoff, unmap_len); + + /* Only remap normal extents. */ + real_extent = (irec->br_startblock != HOLESTARTBLOCK && + irec->br_startblock != DELAYSTARTBLOCK && + !ISUNWRITTEN(irec)); + + /* No reflinking if we're low on space */ + if (real_extent) { + error = xfs_reflink_ag_has_free_space(mp, + XFS_FSB_TO_AGNO(mp, irec->br_startblock)); + if (error) + goto out; + } + + /* Start a rolling transaction to switch the mappings */ + resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + if (error) + goto out; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* If we're not just clearing space, then do we have enough quota? */ + if (real_extent) { + error = xfs_trans_reserve_quota_nblks(tp, ip, + irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS); + if (error) + goto out_cancel; + } + + trace_xfs_reflink_remap(ip, irec->br_startoff, + irec->br_blockcount, irec->br_startblock); + + /* Unmap the old blocks in the data fork. */ + rlen = unmap_len; + while (rlen) { + xfs_defer_init(&dfops, &firstfsb); + error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1, + &firstfsb, &dfops); + if (error) + goto out_defer; + + /* + * Trim the extent to whatever got unmapped. + * Remember, bunmapi works backwards. + */ + uirec.br_startblock = irec->br_startblock + rlen; + uirec.br_startoff = irec->br_startoff + rlen; + uirec.br_blockcount = unmap_len - rlen; + unmap_len = rlen; + + /* If this isn't a real mapping, we're done. */ + if (!real_extent || uirec.br_blockcount == 0) + goto next_extent; + + trace_xfs_reflink_remap(ip, uirec.br_startoff, + uirec.br_blockcount, uirec.br_startblock); + + /* Update the refcount tree */ + error = xfs_refcount_increase_extent(mp, &dfops, &uirec); + if (error) + goto out_defer; + + /* Map the new blocks into the data fork. */ + error = xfs_bmap_map_extent(mp, &dfops, ip, &uirec); + if (error) + goto out_defer; + + /* Update quota accounting. */ + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, + uirec.br_blockcount); + + /* Update dest isize if needed. */ + newlen = XFS_FSB_TO_B(mp, + uirec.br_startoff + uirec.br_blockcount); + newlen = min_t(xfs_off_t, newlen, new_isize); + if (newlen > i_size_read(VFS_I(ip))) { + trace_xfs_reflink_update_inode_size(ip, newlen); + i_size_write(VFS_I(ip), newlen); + ip->i_d.di_size = newlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } + +next_extent: + /* Process all the deferred stuff. */ + error = xfs_defer_finish(&tp, &dfops, ip); + if (error) + goto out_defer; + } + + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + goto out; + return 0; + +out_defer: + xfs_defer_cancel(&dfops); +out_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: + trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_); + return error; +} + +/* + * Iteratively remap one file's extents (and holes) to another's. + */ +STATIC int +xfs_reflink_remap_blocks( + struct xfs_inode *src, + xfs_fileoff_t srcoff, + struct xfs_inode *dest, + xfs_fileoff_t destoff, + xfs_filblks_t len, + xfs_off_t new_isize) +{ + struct xfs_bmbt_irec imap; + int nimaps; + int error = 0; + xfs_filblks_t range_len; + + /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ + while (len) { + trace_xfs_reflink_remap_blocks_loop(src, srcoff, len, + dest, destoff); + /* Read extent from the source file */ + nimaps = 1; + xfs_ilock(src, XFS_ILOCK_EXCL); + error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0); + xfs_iunlock(src, XFS_ILOCK_EXCL); + if (error) + goto err; + ASSERT(nimaps == 1); + + trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE, + &imap); + + /* Translate imap into the destination file. */ + range_len = imap.br_startoff + imap.br_blockcount - srcoff; + imap.br_startoff += destoff - srcoff; + + /* Clear dest from destoff to the end of imap and map it in. */ + error = xfs_reflink_remap_extent(dest, &imap, destoff, + new_isize); + if (error) + goto err; + + if (fatal_signal_pending(current)) { + error = -EINTR; + goto err; + } + + /* Advance drange/srange */ + srcoff += range_len; + destoff += range_len; + len -= range_len; + } + + return 0; + +err: + trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_); + return error; +} + +/* + * Read a page's worth of file data into the page cache. Return the page + * locked. + */ +static struct page * +xfs_get_page( + struct inode *inode, + xfs_off_t offset) +{ + struct address_space *mapping; + struct page *page; + pgoff_t n; + + n = offset >> PAGE_SHIFT; + mapping = inode->i_mapping; + page = read_mapping_page(mapping, n, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + lock_page(page); + return page; +} + +/* + * Compare extents of two files to see if they are the same. + */ +static int +xfs_compare_extents( + struct inode *src, + xfs_off_t srcoff, + struct inode *dest, + xfs_off_t destoff, + xfs_off_t len, + bool *is_same) +{ + xfs_off_t src_poff; + xfs_off_t dest_poff; + void *src_addr; + void *dest_addr; + struct page *src_page; + struct page *dest_page; + xfs_off_t cmp_len; + bool same; + int error; + + error = -EINVAL; + same = true; + while (len) { + src_poff = srcoff & (PAGE_SIZE - 1); + dest_poff = destoff & (PAGE_SIZE - 1); + cmp_len = min(PAGE_SIZE - src_poff, + PAGE_SIZE - dest_poff); + cmp_len = min(cmp_len, len); + ASSERT(cmp_len > 0); + + trace_xfs_reflink_compare_extents(XFS_I(src), srcoff, cmp_len, + XFS_I(dest), destoff); + + src_page = xfs_get_page(src, srcoff); + if (IS_ERR(src_page)) { + error = PTR_ERR(src_page); + goto out_error; + } + dest_page = xfs_get_page(dest, destoff); + if (IS_ERR(dest_page)) { + error = PTR_ERR(dest_page); + unlock_page(src_page); + put_page(src_page); + goto out_error; + } + src_addr = kmap_atomic(src_page); + dest_addr = kmap_atomic(dest_page); + + flush_dcache_page(src_page); + flush_dcache_page(dest_page); + + if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) + same = false; + + kunmap_atomic(dest_addr); + kunmap_atomic(src_addr); + unlock_page(dest_page); + unlock_page(src_page); + put_page(dest_page); + put_page(src_page); + + if (!same) + break; + + srcoff += cmp_len; + destoff += cmp_len; + len -= cmp_len; + } + + *is_same = same; + return 0; + +out_error: + trace_xfs_reflink_compare_extents_error(XFS_I(dest), error, _RET_IP_); + return error; +} + +/* + * Link a range of blocks from one file to another. + */ +int +xfs_reflink_remap_range( + struct xfs_inode *src, + xfs_off_t srcoff, + struct xfs_inode *dest, + xfs_off_t destoff, + xfs_off_t len, + unsigned int flags) +{ + struct xfs_mount *mp = src->i_mount; + xfs_fileoff_t sfsbno, dfsbno; + xfs_filblks_t fsblen; + int error; + xfs_extlen_t cowextsize; + bool is_same; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return -EOPNOTSUPP; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* Don't reflink realtime inodes */ + if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) + return -EINVAL; + + if (flags & ~XFS_REFLINK_ALL) + return -EINVAL; + + trace_xfs_reflink_remap_range(src, srcoff, len, dest, destoff); + + /* Lock both files against IO */ + if (src->i_ino == dest->i_ino) { + xfs_ilock(src, XFS_IOLOCK_EXCL); + xfs_ilock(src, XFS_MMAPLOCK_EXCL); + } else { + xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); + } + + /* + * Check that the extents are the same. + */ + if (flags & XFS_REFLINK_DEDUPE) { + is_same = false; + error = xfs_compare_extents(VFS_I(src), srcoff, VFS_I(dest), + destoff, len, &is_same); + if (error) + goto out_error; + if (!is_same) { + error = -EBADE; + goto out_error; + } + } + + error = xfs_reflink_set_inode_flag(src, dest); + if (error) + goto out_error; + + /* + * Invalidate the page cache so that we can clear any CoW mappings + * in the destination file. + */ + truncate_inode_pages_range(&VFS_I(dest)->i_data, destoff, + PAGE_ALIGN(destoff + len) - 1); + + dfsbno = XFS_B_TO_FSBT(mp, destoff); + sfsbno = XFS_B_TO_FSBT(mp, srcoff); + fsblen = XFS_B_TO_FSB(mp, len); + error = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen, + destoff + len); + if (error) + goto out_error; + + /* + * Carry the cowextsize hint from src to dest if we're sharing the + * entire source file to the entire destination file, the source file + * has a cowextsize hint, and the destination file does not. + */ + cowextsize = 0; + if (srcoff == 0 && len == i_size_read(VFS_I(src)) && + (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) && + destoff == 0 && len >= i_size_read(VFS_I(dest)) && + !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) + cowextsize = src->i_d.di_cowextsize; + + error = xfs_reflink_update_dest(dest, destoff + len, cowextsize); + if (error) + goto out_error; + +out_error: + xfs_iunlock(src, XFS_MMAPLOCK_EXCL); + xfs_iunlock(src, XFS_IOLOCK_EXCL); + if (src->i_ino != dest->i_ino) { + xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); + xfs_iunlock(dest, XFS_IOLOCK_EXCL); + } + if (error) + trace_xfs_reflink_remap_range_error(dest, error, _RET_IP_); + return error; +} + +/* + * The user wants to preemptively CoW all shared blocks in this file, + * which enables us to turn off the reflink flag. Iterate all + * extents which are not prealloc/delalloc to see which ranges are + * mentioned in the refcount tree, then read those blocks into the + * pagecache, dirty them, fsync them back out, and then we can update + * the inode flag. What happens if we run out of memory? :) + */ +STATIC int +xfs_reflink_dirty_extents( + struct xfs_inode *ip, + xfs_fileoff_t fbno, + xfs_filblks_t end, + xfs_off_t isize) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_extlen_t aglen; + xfs_agblock_t rbno; + xfs_extlen_t rlen; + xfs_off_t fpos; + xfs_off_t flen; + struct xfs_bmbt_irec map[2]; + int nmaps; + int error = 0; + + while (end - fbno > 0) { + nmaps = 1; + /* + * Look for extents in the file. Skip holes, delalloc, or + * unwritten extents; they can't be reflinked. + */ + error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0); + if (error) + goto out; + if (nmaps == 0) + break; + if (map[0].br_startblock == HOLESTARTBLOCK || + map[0].br_startblock == DELAYSTARTBLOCK || + ISUNWRITTEN(&map[0])) + goto next; + + map[1] = map[0]; + while (map[1].br_blockcount) { + agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock); + aglen = map[1].br_blockcount; + + error = xfs_reflink_find_shared(mp, agno, agbno, aglen, + &rbno, &rlen, true); + if (error) + goto out; + if (rbno == NULLAGBLOCK) + break; + + /* Dirty the pages */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + fpos = XFS_FSB_TO_B(mp, map[1].br_startoff + + (rbno - agbno)); + flen = XFS_FSB_TO_B(mp, rlen); + if (fpos + flen > isize) + flen = isize - fpos; + error = iomap_file_dirty(VFS_I(ip), fpos, flen, + &xfs_iomap_ops); + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (error) + goto out; + + map[1].br_blockcount -= (rbno - agbno + rlen); + map[1].br_startoff += (rbno - agbno + rlen); + map[1].br_startblock += (rbno - agbno + rlen); + } + +next: + fbno = map[0].br_startoff + map[0].br_blockcount; + } +out: + return error; +} + +/* Clear the inode reflink flag if there are no shared extents. */ +int +xfs_reflink_clear_inode_flag( + struct xfs_inode *ip, + struct xfs_trans **tpp) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t fbno; + xfs_filblks_t end; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_extlen_t aglen; + xfs_agblock_t rbno; + xfs_extlen_t rlen; + struct xfs_bmbt_irec map; + int nmaps; + int error = 0; + + ASSERT(xfs_is_reflink_inode(ip)); + + fbno = 0; + end = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip))); + while (end - fbno > 0) { + nmaps = 1; + /* + * Look for extents in the file. Skip holes, delalloc, or + * unwritten extents; they can't be reflinked. + */ + error = xfs_bmapi_read(ip, fbno, end - fbno, &map, &nmaps, 0); + if (error) + return error; + if (nmaps == 0) + break; + if (map.br_startblock == HOLESTARTBLOCK || + map.br_startblock == DELAYSTARTBLOCK || + ISUNWRITTEN(&map)) + goto next; + + agno = XFS_FSB_TO_AGNO(mp, map.br_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, map.br_startblock); + aglen = map.br_blockcount; + + error = xfs_reflink_find_shared(mp, agno, agbno, aglen, + &rbno, &rlen, false); + if (error) + return error; + /* Is there still a shared block here? */ + if (rbno != NULLAGBLOCK) + return 0; +next: + fbno = map.br_startoff + map.br_blockcount; + } + + /* + * We didn't find any shared blocks so turn off the reflink flag. + * First, get rid of any leftover CoW mappings. + */ + error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF); + if (error) + return error; + + /* Clear the inode flag. */ + trace_xfs_reflink_unset_inode_flag(ip); + ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; + xfs_inode_clear_cowblocks_tag(ip); + xfs_trans_ijoin(*tpp, ip, 0); + xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); + + return error; +} + +/* + * Clear the inode reflink flag if there are no shared extents and the size + * hasn't changed. + */ +STATIC int +xfs_reflink_try_clear_inode_flag( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error = 0; + + /* Start a rolling transaction to remove the mappings */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_reflink_clear_inode_flag(ip, &tp); + if (error) + goto cancel; + + error = xfs_trans_commit(tp); + if (error) + goto out; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +cancel: + xfs_trans_cancel(tp); +out: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * Pre-COW all shared blocks within a given byte range of a file and turn off + * the reflink flag if we unshare all of the file's blocks. + */ +int +xfs_reflink_unshare( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t fbno; + xfs_filblks_t end; + xfs_off_t isize; + int error; + + if (!xfs_is_reflink_inode(ip)) + return 0; + + trace_xfs_reflink_unshare(ip, offset, len); + + inode_dio_wait(VFS_I(ip)); + + /* Try to CoW the selected ranges */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + fbno = XFS_B_TO_FSBT(mp, offset); + isize = i_size_read(VFS_I(ip)); + end = XFS_B_TO_FSB(mp, offset + len); + error = xfs_reflink_dirty_extents(ip, fbno, end, isize); + if (error) + goto out_unlock; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + /* Wait for the IO to finish */ + error = filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (error) + goto out; + + /* Turn off the reflink flag if possible. */ + error = xfs_reflink_try_clear_inode_flag(ip); + if (error) + goto out; + + return 0; + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out: + trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); + return error; +} + +/* + * Does this inode have any real CoW reservations? + */ +bool +xfs_reflink_has_real_cow_blocks( + struct xfs_inode *ip) +{ + struct xfs_bmbt_irec irec; + struct xfs_ifork *ifp; + struct xfs_bmbt_rec_host *gotp; + xfs_extnum_t idx; + + if (!xfs_is_reflink_inode(ip)) + return false; + + /* Go find the old extent in the CoW fork. */ + ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + gotp = xfs_iext_bno_to_ext(ifp, 0, &idx); + while (gotp) { + xfs_bmbt_get_all(gotp, &irec); + + if (!isnullstartblock(irec.br_startblock)) + return true; + + /* Roll on... */ + idx++; + if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + break; + gotp = xfs_iext_get_ext(ifp, idx); + } + + return false; +} diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h new file mode 100644 index 000000000000..5dc3c8ac12aa --- /dev/null +++ b/fs/xfs/xfs_reflink.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_REFLINK_H +#define __XFS_REFLINK_H 1 + +extern int xfs_reflink_find_shared(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, + xfs_extlen_t *flen, bool find_maximal); +extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, + struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed); + +extern int xfs_reflink_reserve_cow_range(struct xfs_inode *ip, + xfs_off_t offset, xfs_off_t count); +extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, + xfs_off_t offset, xfs_off_t count); +extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, + struct xfs_bmbt_irec *imap, bool *need_alloc); +extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap); + +extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, + struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, + xfs_fileoff_t end_fsb); +extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t count); +extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t count); +extern int xfs_reflink_recover_cow(struct xfs_mount *mp); +#define XFS_REFLINK_DEDUPE 1 /* only reflink if contents match */ +#define XFS_REFLINK_ALL (XFS_REFLINK_DEDUPE) +extern int xfs_reflink_remap_range(struct xfs_inode *src, xfs_off_t srcoff, + struct xfs_inode *dest, xfs_off_t destoff, xfs_off_t len, + unsigned int flags); +extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip, + struct xfs_trans **tpp); +extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len); + +extern bool xfs_reflink_has_real_cow_blocks(struct xfs_inode *ip); + +#endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 0432a459871c..73c827831551 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -441,8 +441,11 @@ xfs_rui_recover( XFS_FSB_TO_DADDR(mp, rmap->me_startblock)); switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { case XFS_RMAP_EXTENT_MAP: + case XFS_RMAP_EXTENT_MAP_SHARED: case XFS_RMAP_EXTENT_UNMAP: + case XFS_RMAP_EXTENT_UNMAP_SHARED: case XFS_RMAP_EXTENT_CONVERT: + case XFS_RMAP_EXTENT_CONVERT_SHARED: case XFS_RMAP_EXTENT_ALLOC: case XFS_RMAP_EXTENT_FREE: op_ok = true; @@ -481,12 +484,21 @@ xfs_rui_recover( case XFS_RMAP_EXTENT_MAP: type = XFS_RMAP_MAP; break; + case XFS_RMAP_EXTENT_MAP_SHARED: + type = XFS_RMAP_MAP_SHARED; + break; case XFS_RMAP_EXTENT_UNMAP: type = XFS_RMAP_UNMAP; break; + case XFS_RMAP_EXTENT_UNMAP_SHARED: + type = XFS_RMAP_UNMAP_SHARED; + break; case XFS_RMAP_EXTENT_CONVERT: type = XFS_RMAP_CONVERT; break; + case XFS_RMAP_EXTENT_CONVERT_SHARED: + type = XFS_RMAP_CONVERT_SHARED; + break; case XFS_RMAP_EXTENT_ALLOC: type = XFS_RMAP_ALLOC; break; diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 6e812fe0fd43..12d48cd8f8a4 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -62,6 +62,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) { "ibt2", XFSSTAT_END_IBT_V2 }, { "fibt2", XFSSTAT_END_FIBT_V2 }, { "rmapbt", XFSSTAT_END_RMAP_V2 }, + { "refcntbt", XFSSTAT_END_REFCOUNT }, /* we print both series of quota information together */ { "qm", XFSSTAT_END_QM }, }; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 657865f51e78..79ad2e69fc33 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -213,7 +213,23 @@ struct xfsstats { __uint32_t xs_rmap_2_alloc; __uint32_t xs_rmap_2_free; __uint32_t xs_rmap_2_moves; -#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_RMAP_V2+6) +#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + 15) + __uint32_t xs_refcbt_2_lookup; + __uint32_t xs_refcbt_2_compare; + __uint32_t xs_refcbt_2_insrec; + __uint32_t xs_refcbt_2_delrec; + __uint32_t xs_refcbt_2_newroot; + __uint32_t xs_refcbt_2_killroot; + __uint32_t xs_refcbt_2_increment; + __uint32_t xs_refcbt_2_decrement; + __uint32_t xs_refcbt_2_lshift; + __uint32_t xs_refcbt_2_rshift; + __uint32_t xs_refcbt_2_split; + __uint32_t xs_refcbt_2_join; + __uint32_t xs_refcbt_2_alloc; + __uint32_t xs_refcbt_2_free; + __uint32_t xs_refcbt_2_moves; +#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_REFCOUNT + 6) __uint32_t xs_qm_dqreclaims; __uint32_t xs_qm_dqreclaim_misses; __uint32_t xs_qm_dquot_dups; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 2d092f9577ca..ade4691e3f74 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -47,6 +47,9 @@ #include "xfs_sysfs.h" #include "xfs_ondisk.h" #include "xfs_rmap_item.h" +#include "xfs_refcount_item.h" +#include "xfs_bmap_item.h" +#include "xfs_reflink.h" #include <linux/namei.h> #include <linux/init.h> @@ -936,6 +939,7 @@ xfs_fs_destroy_inode( struct inode *inode) { struct xfs_inode *ip = XFS_I(inode); + int error; trace_xfs_destroy_inode(ip); @@ -943,6 +947,14 @@ xfs_fs_destroy_inode( XFS_STATS_INC(ip->i_mount, vn_rele); XFS_STATS_INC(ip->i_mount, vn_remove); + if (xfs_is_reflink_inode(ip)) { + error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); + if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) + xfs_warn(ip->i_mount, +"Error %d while evicting CoW blocks for inode %llu.", + error, ip->i_ino); + } + xfs_inactive(ip); ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); @@ -1006,6 +1018,16 @@ xfs_fs_drop_inode( { struct xfs_inode *ip = XFS_I(inode); + /* + * If this unlinked inode is in the middle of recovery, don't + * drop the inode just yet; log recovery will take care of + * that. See the comment for this inode flag. + */ + if (ip->i_flags & XFS_IRECOVERY) { + ASSERT(ip->i_mount->m_log->l_flags & XLOG_RECOVERY_NEEDED); + return 0; + } + return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE); } @@ -1296,10 +1318,31 @@ xfs_fs_remount( xfs_restore_resvblks(mp); xfs_log_work_queue(mp); xfs_queue_eofblocks(mp); + + /* Recover any CoW blocks that never got remapped. */ + error = xfs_reflink_recover_cow(mp); + if (error) { + xfs_err(mp, + "Error %d recovering leftover CoW allocations.", error); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + + /* Create the per-AG metadata reservation pool .*/ + error = xfs_fs_reserve_ag_blocks(mp); + if (error && error != -ENOSPC) + return error; } /* rw -> ro */ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { + /* Free the per-AG metadata reservation pool. */ + error = xfs_fs_unreserve_ag_blocks(mp); + if (error) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; + } + /* * Before we sync the metadata, we need to free up the reserve * block pool so that the used block count in the superblock on @@ -1490,6 +1533,7 @@ xfs_fs_fill_super( atomic_set(&mp->m_active_trans, 0); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); + INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); mp->m_kobj.kobject.kset = xfs_kset; mp->m_super = sb; @@ -1572,6 +1616,9 @@ xfs_fs_fill_super( "DAX unsupported by block device. Turning off DAX."); mp->m_flags &= ~XFS_MOUNT_DAX; } + if (xfs_sb_version_hasreflink(&mp->m_sb)) + xfs_alert(mp, + "DAX and reflink have not been tested together!"); } if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { @@ -1585,6 +1632,10 @@ xfs_fs_fill_super( "EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!"); } + if (xfs_sb_version_hasreflink(&mp->m_sb)) + xfs_alert(mp, + "EXPERIMENTAL reflink feature enabled. Use at your own risk!"); + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; @@ -1788,8 +1839,38 @@ xfs_init_zones(void) if (!xfs_rui_zone) goto out_destroy_rud_zone; + xfs_cud_zone = kmem_zone_init(sizeof(struct xfs_cud_log_item), + "xfs_cud_item"); + if (!xfs_cud_zone) + goto out_destroy_rui_zone; + + xfs_cui_zone = kmem_zone_init( + xfs_cui_log_item_sizeof(XFS_CUI_MAX_FAST_EXTENTS), + "xfs_cui_item"); + if (!xfs_cui_zone) + goto out_destroy_cud_zone; + + xfs_bud_zone = kmem_zone_init(sizeof(struct xfs_bud_log_item), + "xfs_bud_item"); + if (!xfs_bud_zone) + goto out_destroy_cui_zone; + + xfs_bui_zone = kmem_zone_init( + xfs_bui_log_item_sizeof(XFS_BUI_MAX_FAST_EXTENTS), + "xfs_bui_item"); + if (!xfs_bui_zone) + goto out_destroy_bud_zone; + return 0; + out_destroy_bud_zone: + kmem_zone_destroy(xfs_bud_zone); + out_destroy_cui_zone: + kmem_zone_destroy(xfs_cui_zone); + out_destroy_cud_zone: + kmem_zone_destroy(xfs_cud_zone); + out_destroy_rui_zone: + kmem_zone_destroy(xfs_rui_zone); out_destroy_rud_zone: kmem_zone_destroy(xfs_rud_zone); out_destroy_icreate_zone: @@ -1832,6 +1913,10 @@ xfs_destroy_zones(void) * destroy caches. */ rcu_barrier(); + kmem_zone_destroy(xfs_bui_zone); + kmem_zone_destroy(xfs_bud_zone); + kmem_zone_destroy(xfs_cui_zone); + kmem_zone_destroy(xfs_cud_zone); kmem_zone_destroy(xfs_rui_zone); kmem_zone_destroy(xfs_rud_zone); kmem_zone_destroy(xfs_icreate_zone); @@ -1885,6 +1970,8 @@ init_xfs_fs(void) xfs_extent_free_init_defer_op(); xfs_rmap_update_init_defer_op(); + xfs_refcount_update_init_defer_op(); + xfs_bmap_update_init_defer_op(); xfs_dir_startup(); diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index aed74d3f8da9..afe1f66aaa69 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -184,6 +184,15 @@ static struct ctl_table xfs_table[] = { .extra1 = &xfs_params.eofb_timer.min, .extra2 = &xfs_params.eofb_timer.max, }, + { + .procname = "speculative_cow_prealloc_lifetime", + .data = &xfs_params.cowb_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.cowb_timer.min, + .extra2 = &xfs_params.cowb_timer.max, + }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS { diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index ffef45375754..984a3499cfe3 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -48,6 +48,7 @@ typedef struct xfs_param { xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ + xfs_sysctl_val_t cowb_timer; /* Interval between cowb scan wakeups */ } xfs_param_t; /* diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 16093c7dacde..ad188d3a83f3 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -39,6 +39,7 @@ struct xfs_buf_log_format; struct xfs_inode_log_format; struct xfs_bmbt_irec; struct xfs_btree_cur; +struct xfs_refcount_irec; DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -135,6 +136,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_cowblocks); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_cowblocks); DECLARE_EVENT_CLASS(xfs_ag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), @@ -268,10 +271,10 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, __field(unsigned long, caller_ip) ), TP_fast_assign( - struct xfs_ifork *ifp = (state & BMAP_ATTRFORK) ? - ip->i_afp : &ip->i_df; + struct xfs_ifork *ifp; struct xfs_bmbt_irec r; + ifp = xfs_iext_state_to_fork(ip, state); xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r); __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; @@ -686,6 +689,9 @@ DEFINE_INODE_EVENT(xfs_dquot_dqdetach); DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); +DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); DEFINE_INODE_EVENT(xfs_filemap_fault); DEFINE_INODE_EVENT(xfs_filemap_pmd_fault); @@ -2581,10 +2587,20 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_delete); DEFINE_AG_ERROR_EVENT(xfs_rmap_insert_error); DEFINE_AG_ERROR_EVENT(xfs_rmap_delete_error); DEFINE_AG_ERROR_EVENT(xfs_rmap_update_error); + +DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_candidate); +DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_query); +DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_candidate); +DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range); DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result); DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result); DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result); +/* deferred bmbt updates */ +#define DEFINE_BMAP_DEFERRED_EVENT DEFINE_RMAP_DEFERRED_EVENT +DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_defer); +DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_deferred); + /* per-AG reservation */ DECLARE_EVENT_CLASS(xfs_ag_resv_class, TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type resv, @@ -2639,6 +2655,728 @@ DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed); DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error); DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error); +/* refcount tracepoint classes */ + +/* reuse the discard trace class for agbno/aglen-based traces */ +#define DEFINE_AG_EXTENT_EVENT(name) DEFINE_DISCARD_EVENT(name) + +/* ag btree lookup tracepoint class */ +#define XFS_AG_BTREE_CMP_FORMAT_STR \ + { XFS_LOOKUP_EQ, "eq" }, \ + { XFS_LOOKUP_LE, "le" }, \ + { XFS_LOOKUP_GE, "ge" } +DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_lookup_t dir), + TP_ARGS(mp, agno, agbno, dir), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_lookup_t, dir) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->dir = dir; + ), + TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __print_symbolic(__entry->dir, XFS_AG_BTREE_CMP_FORMAT_STR), + __entry->dir) +) + +#define DEFINE_AG_BTREE_LOOKUP_EVENT(name) \ +DEFINE_EVENT(xfs_ag_btree_lookup_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_lookup_t dir), \ + TP_ARGS(mp, agno, agbno, dir)) + +/* single-rcext tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_extent_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *irec), + TP_ARGS(mp, agno, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startblock = irec->rc_startblock; + __entry->blockcount = irec->rc_blockcount; + __entry->refcount = irec->rc_refcount; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startblock, + __entry->blockcount, + __entry->refcount) +) + +#define DEFINE_REFCOUNT_EXTENT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + struct xfs_refcount_irec *irec), \ + TP_ARGS(mp, agno, irec)) + +/* single-rcext and an agbno tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *irec, xfs_agblock_t agbno), + TP_ARGS(mp, agno, irec, agbno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + __field(xfs_agblock_t, agbno) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startblock = irec->rc_startblock; + __entry->blockcount = irec->rc_blockcount; + __entry->refcount = irec->rc_refcount; + __entry->agbno = agbno; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startblock, + __entry->blockcount, + __entry->refcount, + __entry->agbno) +) + +#define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_extent_at_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + struct xfs_refcount_irec *irec, xfs_agblock_t agbno), \ + TP_ARGS(mp, agno, irec, agbno)) + +/* double-rcext tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2), + TP_ARGS(mp, agno, i1, i2), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, i1_startblock) + __field(xfs_extlen_t, i1_blockcount) + __field(xfs_nlink_t, i1_refcount) + __field(xfs_agblock_t, i2_startblock) + __field(xfs_extlen_t, i2_blockcount) + __field(xfs_nlink_t, i2_refcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->i1_startblock = i1->rc_startblock; + __entry->i1_blockcount = i1->rc_blockcount; + __entry->i1_refcount = i1->rc_refcount; + __entry->i2_startblock = i2->rc_startblock; + __entry->i2_blockcount = i2->rc_blockcount; + __entry->i2_refcount = i2->rc_refcount; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " + "agbno %u len %u refcount %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->i1_startblock, + __entry->i1_blockcount, + __entry->i1_refcount, + __entry->i2_startblock, + __entry->i2_blockcount, + __entry->i2_refcount) +) + +#define DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_double_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2), \ + TP_ARGS(mp, agno, i1, i2)) + +/* double-rcext and an agbno tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, + xfs_agblock_t agbno), + TP_ARGS(mp, agno, i1, i2, agbno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, i1_startblock) + __field(xfs_extlen_t, i1_blockcount) + __field(xfs_nlink_t, i1_refcount) + __field(xfs_agblock_t, i2_startblock) + __field(xfs_extlen_t, i2_blockcount) + __field(xfs_nlink_t, i2_refcount) + __field(xfs_agblock_t, agbno) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->i1_startblock = i1->rc_startblock; + __entry->i1_blockcount = i1->rc_blockcount; + __entry->i1_refcount = i1->rc_refcount; + __entry->i2_startblock = i2->rc_startblock; + __entry->i2_blockcount = i2->rc_blockcount; + __entry->i2_refcount = i2->rc_refcount; + __entry->agbno = agbno; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " + "agbno %u len %u refcount %u @ agbno %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->i1_startblock, + __entry->i1_blockcount, + __entry->i1_refcount, + __entry->i2_startblock, + __entry->i2_blockcount, + __entry->i2_refcount, + __entry->agbno) +) + +#define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_double_extent_at_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \ + xfs_agblock_t agbno), \ + TP_ARGS(mp, agno, i1, i2, agbno)) + +/* triple-rcext tracepoint class */ +DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, + struct xfs_refcount_irec *i3), + TP_ARGS(mp, agno, i1, i2, i3), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, i1_startblock) + __field(xfs_extlen_t, i1_blockcount) + __field(xfs_nlink_t, i1_refcount) + __field(xfs_agblock_t, i2_startblock) + __field(xfs_extlen_t, i2_blockcount) + __field(xfs_nlink_t, i2_refcount) + __field(xfs_agblock_t, i3_startblock) + __field(xfs_extlen_t, i3_blockcount) + __field(xfs_nlink_t, i3_refcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->i1_startblock = i1->rc_startblock; + __entry->i1_blockcount = i1->rc_blockcount; + __entry->i1_refcount = i1->rc_refcount; + __entry->i2_startblock = i2->rc_startblock; + __entry->i2_blockcount = i2->rc_blockcount; + __entry->i2_refcount = i2->rc_refcount; + __entry->i3_startblock = i3->rc_startblock; + __entry->i3_blockcount = i3->rc_blockcount; + __entry->i3_refcount = i3->rc_refcount; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " + "agbno %u len %u refcount %u -- " + "agbno %u len %u refcount %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->i1_startblock, + __entry->i1_blockcount, + __entry->i1_refcount, + __entry->i2_startblock, + __entry->i2_blockcount, + __entry->i2_refcount, + __entry->i3_startblock, + __entry->i3_blockcount, + __entry->i3_refcount) +); + +#define DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \ + struct xfs_refcount_irec *i3), \ + TP_ARGS(mp, agno, i1, i2, i3)) + +/* refcount btree tracepoints */ +DEFINE_BUSY_EVENT(xfs_refcountbt_alloc_block); +DEFINE_BUSY_EVENT(xfs_refcountbt_free_block); +DEFINE_AG_BTREE_LOOKUP_EVENT(xfs_refcount_lookup); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_insert); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_delete); +DEFINE_AG_ERROR_EVENT(xfs_refcount_insert_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_delete_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_update_error); + +/* refcount adjustment tracepoints */ +DEFINE_AG_EXTENT_EVENT(xfs_refcount_increase); +DEFINE_AG_EXTENT_EVENT(xfs_refcount_decrease); +DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_increase); +DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_decrease); +DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(xfs_refcount_merge_center_extents); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_modify_extent); +DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_recover_extent); +DEFINE_REFCOUNT_EXTENT_AT_EVENT(xfs_refcount_split_extent); +DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_left_extent); +DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_right_extent); +DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_left_extent); +DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_right_extent); +DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_cow_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_center_extents_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_modify_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_split_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_left_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_right_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_find_left_extent_error); +DEFINE_AG_ERROR_EVENT(xfs_refcount_find_right_extent_error); + +/* reflink helpers */ +DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared); +DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared_result); +DEFINE_AG_ERROR_EVENT(xfs_refcount_find_shared_error); +#define DEFINE_REFCOUNT_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT +DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_defer); +DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred); + +TRACE_EVENT(xfs_refcount_finish_one_leftover, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + int type, xfs_agblock_t agbno, xfs_extlen_t len, + xfs_agblock_t new_agbno, xfs_extlen_t new_len), + TP_ARGS(mp, agno, type, agbno, len, new_agbno, new_len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, type) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(xfs_agblock_t, new_agbno) + __field(xfs_extlen_t, new_len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->type = type; + __entry->agbno = agbno; + __entry->len = len; + __entry->new_agbno = new_agbno; + __entry->new_len = new_len; + ), + TP_printk("dev %d:%d type %d agno %u agbno %u len %u new_agbno %u new_len %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->agno, + __entry->agbno, + __entry->len, + __entry->new_agbno, + __entry->new_len) +); + +/* simple inode-based error/%ip tracepoint class */ +DECLARE_EVENT_CLASS(xfs_inode_error_class, + TP_PROTO(struct xfs_inode *ip, int error, unsigned long caller_ip), + TP_ARGS(ip, error, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, error) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->error = error; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino %llx error %d caller %ps", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->error, + (char *)__entry->caller_ip) +); + +#define DEFINE_INODE_ERROR_EVENT(name) \ +DEFINE_EVENT(xfs_inode_error_class, name, \ + TP_PROTO(struct xfs_inode *ip, int error, \ + unsigned long caller_ip), \ + TP_ARGS(ip, error, caller_ip)) + +/* reflink allocator */ +TRACE_EVENT(xfs_bmap_remap_alloc, + TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t fsbno, + xfs_extlen_t len), + TP_ARGS(ip, fsbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, fsbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->fsbno = fsbno; + __entry->len = len; + ), + TP_printk("dev %d:%d ino 0x%llx fsbno 0x%llx len %x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->fsbno, + __entry->len) +); +DEFINE_INODE_ERROR_EVENT(xfs_bmap_remap_alloc_error); + +/* reflink tracepoint classes */ + +/* two-file io tracepoint class */ +DECLARE_EVENT_CLASS(xfs_double_io_class, + TP_PROTO(struct xfs_inode *src, xfs_off_t soffset, xfs_off_t len, + struct xfs_inode *dest, xfs_off_t doffset), + TP_ARGS(src, soffset, len, dest, doffset), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, src_ino) + __field(loff_t, src_isize) + __field(loff_t, src_disize) + __field(loff_t, src_offset) + __field(size_t, len) + __field(xfs_ino_t, dest_ino) + __field(loff_t, dest_isize) + __field(loff_t, dest_disize) + __field(loff_t, dest_offset) + ), + TP_fast_assign( + __entry->dev = VFS_I(src)->i_sb->s_dev; + __entry->src_ino = src->i_ino; + __entry->src_isize = VFS_I(src)->i_size; + __entry->src_disize = src->i_d.di_size; + __entry->src_offset = soffset; + __entry->len = len; + __entry->dest_ino = dest->i_ino; + __entry->dest_isize = VFS_I(dest)->i_size; + __entry->dest_disize = dest->i_d.di_size; + __entry->dest_offset = doffset; + ), + TP_printk("dev %d:%d count %zd " + "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx -> " + "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->len, + __entry->src_ino, + __entry->src_isize, + __entry->src_disize, + __entry->src_offset, + __entry->dest_ino, + __entry->dest_isize, + __entry->dest_disize, + __entry->dest_offset) +) + +#define DEFINE_DOUBLE_IO_EVENT(name) \ +DEFINE_EVENT(xfs_double_io_class, name, \ + TP_PROTO(struct xfs_inode *src, xfs_off_t soffset, xfs_off_t len, \ + struct xfs_inode *dest, xfs_off_t doffset), \ + TP_ARGS(src, soffset, len, dest, doffset)) + +/* two-file vfs io tracepoint class */ +DECLARE_EVENT_CLASS(xfs_double_vfs_io_class, + TP_PROTO(struct inode *src, u64 soffset, u64 len, + struct inode *dest, u64 doffset), + TP_ARGS(src, soffset, len, dest, doffset), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, src_ino) + __field(loff_t, src_isize) + __field(loff_t, src_offset) + __field(size_t, len) + __field(unsigned long, dest_ino) + __field(loff_t, dest_isize) + __field(loff_t, dest_offset) + ), + TP_fast_assign( + __entry->dev = src->i_sb->s_dev; + __entry->src_ino = src->i_ino; + __entry->src_isize = i_size_read(src); + __entry->src_offset = soffset; + __entry->len = len; + __entry->dest_ino = dest->i_ino; + __entry->dest_isize = i_size_read(dest); + __entry->dest_offset = doffset; + ), + TP_printk("dev %d:%d count %zd " + "ino 0x%lx isize 0x%llx offset 0x%llx -> " + "ino 0x%lx isize 0x%llx offset 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->len, + __entry->src_ino, + __entry->src_isize, + __entry->src_offset, + __entry->dest_ino, + __entry->dest_isize, + __entry->dest_offset) +) + +#define DEFINE_DOUBLE_VFS_IO_EVENT(name) \ +DEFINE_EVENT(xfs_double_vfs_io_class, name, \ + TP_PROTO(struct inode *src, u64 soffset, u64 len, \ + struct inode *dest, u64 doffset), \ + TP_ARGS(src, soffset, len, dest, doffset)) + +/* CoW write tracepoint */ +DECLARE_EVENT_CLASS(xfs_copy_on_write_class, + TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, + xfs_extlen_t len, xfs_fsblock_t new_pblk), + TP_ARGS(ip, lblk, pblk, len, new_pblk), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fileoff_t, lblk) + __field(xfs_fsblock_t, pblk) + __field(xfs_extlen_t, len) + __field(xfs_fsblock_t, new_pblk) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->lblk = lblk; + __entry->pblk = pblk; + __entry->len = len; + __entry->new_pblk = new_pblk; + ), + TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx pblk 0x%llx " + "len 0x%x new_pblk %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->lblk, + __entry->pblk, + __entry->len, + __entry->new_pblk) +) + +#define DEFINE_COW_EVENT(name) \ +DEFINE_EVENT(xfs_copy_on_write_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, \ + xfs_extlen_t len, xfs_fsblock_t new_pblk), \ + TP_ARGS(ip, lblk, pblk, len, new_pblk)) + +/* inode/irec events */ +DECLARE_EVENT_CLASS(xfs_inode_irec_class, + TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), + TP_ARGS(ip, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fileoff_t, lblk) + __field(xfs_extlen_t, len) + __field(xfs_fsblock_t, pblk) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->lblk = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->pblk = irec->br_startblock; + ), + TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->lblk, + __entry->len, + __entry->pblk) +); +#define DEFINE_INODE_IREC_EVENT(name) \ +DEFINE_EVENT(xfs_inode_irec_class, name, \ + TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, irec)) + +/* refcount/reflink tracepoint definitions */ + +/* reflink tracepoints */ +DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag); +DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag); +DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size); +DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap); +TRACE_EVENT(xfs_reflink_remap_blocks_loop, + TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset, + xfs_filblks_t len, struct xfs_inode *dest, + xfs_fileoff_t doffset), + TP_ARGS(src, soffset, len, dest, doffset), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, src_ino) + __field(xfs_fileoff_t, src_lblk) + __field(xfs_filblks_t, len) + __field(xfs_ino_t, dest_ino) + __field(xfs_fileoff_t, dest_lblk) + ), + TP_fast_assign( + __entry->dev = VFS_I(src)->i_sb->s_dev; + __entry->src_ino = src->i_ino; + __entry->src_lblk = soffset; + __entry->len = len; + __entry->dest_ino = dest->i_ino; + __entry->dest_lblk = doffset; + ), + TP_printk("dev %d:%d len 0x%llx " + "ino 0x%llx offset 0x%llx blocks -> " + "ino 0x%llx offset 0x%llx blocks", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->len, + __entry->src_ino, + __entry->src_lblk, + __entry->dest_ino, + __entry->dest_lblk) +); +TRACE_EVENT(xfs_reflink_punch_range, + TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, + xfs_extlen_t len), + TP_ARGS(ip, lblk, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fileoff_t, lblk) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->lblk = lblk; + __entry->len = len; + ), + TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->lblk, + __entry->len) +); +TRACE_EVENT(xfs_reflink_remap, + TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, + xfs_extlen_t len, xfs_fsblock_t new_pblk), + TP_ARGS(ip, lblk, len, new_pblk), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fileoff_t, lblk) + __field(xfs_extlen_t, len) + __field(xfs_fsblock_t, new_pblk) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->lblk = lblk; + __entry->len = len; + __entry->new_pblk = new_pblk; + ), + TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x new_pblk %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->lblk, + __entry->len, + __entry->new_pblk) +); +DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_reflink_main_loop_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_read_iomap_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error); + +/* dedupe tracepoints */ +DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error); + +/* ioctl tracepoints */ +DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_reflink); +DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_clone_range); +DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_file_extent_same); +TRACE_EVENT(xfs_ioctl_clone, + TP_PROTO(struct inode *src, struct inode *dest), + TP_ARGS(src, dest), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, src_ino) + __field(loff_t, src_isize) + __field(unsigned long, dest_ino) + __field(loff_t, dest_isize) + ), + TP_fast_assign( + __entry->dev = src->i_sb->s_dev; + __entry->src_ino = src->i_ino; + __entry->src_isize = i_size_read(src); + __entry->dest_ino = dest->i_ino; + __entry->dest_isize = i_size_read(dest); + ), + TP_printk("dev %d:%d " + "ino 0x%lx isize 0x%llx -> " + "ino 0x%lx isize 0x%llx\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->src_ino, + __entry->src_isize, + __entry->dest_ino, + __entry->dest_isize) +); + +/* unshare tracepoints */ +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare); +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cow_eof_block); +DEFINE_PAGE_EVENT(xfs_reflink_unshare_page); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_cow_eof_block_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_dirty_page_error); + +/* copy on write */ +DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); + +DEFINE_RW_EVENT(xfs_reflink_reserve_cow_range); +DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range); + +DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write); +DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping); +DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec); + +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); +DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_piece); + +DEFINE_INODE_ERROR_EVENT(xfs_reflink_reserve_cow_range_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); + +DEFINE_COW_EVENT(xfs_reflink_fork_buf); +DEFINE_COW_EVENT(xfs_reflink_finish_fork_buf); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_fork_buf_error); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_finish_fork_buf_error); + +DEFINE_INODE_EVENT(xfs_reflink_cancel_pending_cow); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow); +DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_pending_cow_error); + +/* rmap swapext tracepoints */ +DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap); +DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece); +DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index e2bf86aad33d..61b7fbdd3ebd 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -36,6 +36,11 @@ struct xfs_busy_extent; struct xfs_rud_log_item; struct xfs_rui_log_item; struct xfs_btree_cur; +struct xfs_cui_log_item; +struct xfs_cud_log_item; +struct xfs_defer_ops; +struct xfs_bui_log_item; +struct xfs_bud_log_item; typedef struct xfs_log_item { struct list_head li_ail; /* AIL pointers */ @@ -248,4 +253,28 @@ int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp, xfs_fsblock_t startblock, xfs_filblks_t blockcount, xfs_exntst_t state, struct xfs_btree_cur **pcur); +/* refcount updates */ +enum xfs_refcount_intent_type; + +void xfs_refcount_update_init_defer_op(void); +struct xfs_cud_log_item *xfs_trans_get_cud(struct xfs_trans *tp, + struct xfs_cui_log_item *cuip); +int xfs_trans_log_finish_refcount_update(struct xfs_trans *tp, + struct xfs_cud_log_item *cudp, struct xfs_defer_ops *dfops, + enum xfs_refcount_intent_type type, xfs_fsblock_t startblock, + xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb, + xfs_extlen_t *new_len, struct xfs_btree_cur **pcur); + +/* mapping updates */ +enum xfs_bmap_intent_type; + +void xfs_bmap_update_init_defer_op(void); +struct xfs_bud_log_item *xfs_trans_get_bud(struct xfs_trans *tp, + struct xfs_bui_log_item *buip); +int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp, + struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops, + enum xfs_bmap_intent_type type, struct xfs_inode *ip, + int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock, + xfs_filblks_t blockcount, xfs_exntst_t state); + #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c new file mode 100644 index 000000000000..6408e7d7c08c --- /dev/null +++ b/fs/xfs/xfs_trans_bmap.c @@ -0,0 +1,249 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_bmap_item.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_inode.h" + +/* + * This routine is called to allocate a "bmap update done" + * log item. + */ +struct xfs_bud_log_item * +xfs_trans_get_bud( + struct xfs_trans *tp, + struct xfs_bui_log_item *buip) +{ + struct xfs_bud_log_item *budp; + + budp = xfs_bud_init(tp->t_mountp, buip); + xfs_trans_add_item(tp, &budp->bud_item); + return budp; +} + +/* + * Finish an bmap update and log it to the BUD. Note that the + * transaction is marked dirty regardless of whether the bmap update + * succeeds or fails to support the BUI/BUD lifecycle rules. + */ +int +xfs_trans_log_finish_bmap_update( + struct xfs_trans *tp, + struct xfs_bud_log_item *budp, + struct xfs_defer_ops *dop, + enum xfs_bmap_intent_type type, + struct xfs_inode *ip, + int whichfork, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state) +{ + int error; + + error = xfs_bmap_finish_one(tp, dop, ip, type, whichfork, startoff, + startblock, blockcount, state); + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the BUI and frees the BUD + * 2.) shuts down the filesystem + */ + tp->t_flags |= XFS_TRANS_DIRTY; + budp->bud_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + return error; +} + +/* Sort bmap intents by inode. */ +static int +xfs_bmap_update_diff_items( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_bmap_intent *ba; + struct xfs_bmap_intent *bb; + + ba = container_of(a, struct xfs_bmap_intent, bi_list); + bb = container_of(b, struct xfs_bmap_intent, bi_list); + return ba->bi_owner->i_ino - bb->bi_owner->i_ino; +} + +/* Get an BUI. */ +STATIC void * +xfs_bmap_update_create_intent( + struct xfs_trans *tp, + unsigned int count) +{ + struct xfs_bui_log_item *buip; + + ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); + ASSERT(tp != NULL); + + buip = xfs_bui_init(tp->t_mountp); + ASSERT(buip != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &buip->bui_item); + return buip; +} + +/* Set the map extent flags for this mapping. */ +static void +xfs_trans_set_bmap_flags( + struct xfs_map_extent *bmap, + enum xfs_bmap_intent_type type, + int whichfork, + xfs_exntst_t state) +{ + bmap->me_flags = 0; + switch (type) { + case XFS_BMAP_MAP: + case XFS_BMAP_UNMAP: + bmap->me_flags = type; + break; + default: + ASSERT(0); + } + if (state == XFS_EXT_UNWRITTEN) + bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; + if (whichfork == XFS_ATTR_FORK) + bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; +} + +/* Log bmap updates in the intent item. */ +STATIC void +xfs_bmap_update_log_item( + struct xfs_trans *tp, + void *intent, + struct list_head *item) +{ + struct xfs_bui_log_item *buip = intent; + struct xfs_bmap_intent *bmap; + uint next_extent; + struct xfs_map_extent *map; + + bmap = container_of(item, struct xfs_bmap_intent, bi_list); + + tp->t_flags |= XFS_TRANS_DIRTY; + buip->bui_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + /* + * atomic_inc_return gives us the value after the increment; + * we want to use it as an array index so we need to subtract 1 from + * it. + */ + next_extent = atomic_inc_return(&buip->bui_next_extent) - 1; + ASSERT(next_extent < buip->bui_format.bui_nextents); + map = &buip->bui_format.bui_extents[next_extent]; + map->me_owner = bmap->bi_owner->i_ino; + map->me_startblock = bmap->bi_bmap.br_startblock; + map->me_startoff = bmap->bi_bmap.br_startoff; + map->me_len = bmap->bi_bmap.br_blockcount; + xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork, + bmap->bi_bmap.br_state); +} + +/* Get an BUD so we can process all the deferred rmap updates. */ +STATIC void * +xfs_bmap_update_create_done( + struct xfs_trans *tp, + void *intent, + unsigned int count) +{ + return xfs_trans_get_bud(tp, intent); +} + +/* Process a deferred rmap update. */ +STATIC int +xfs_bmap_update_finish_item( + struct xfs_trans *tp, + struct xfs_defer_ops *dop, + struct list_head *item, + void *done_item, + void **state) +{ + struct xfs_bmap_intent *bmap; + int error; + + bmap = container_of(item, struct xfs_bmap_intent, bi_list); + error = xfs_trans_log_finish_bmap_update(tp, done_item, dop, + bmap->bi_type, + bmap->bi_owner, bmap->bi_whichfork, + bmap->bi_bmap.br_startoff, + bmap->bi_bmap.br_startblock, + bmap->bi_bmap.br_blockcount, + bmap->bi_bmap.br_state); + kmem_free(bmap); + return error; +} + +/* Abort all pending BUIs. */ +STATIC void +xfs_bmap_update_abort_intent( + void *intent) +{ + xfs_bui_release(intent); +} + +/* Cancel a deferred rmap update. */ +STATIC void +xfs_bmap_update_cancel_item( + struct list_head *item) +{ + struct xfs_bmap_intent *bmap; + + bmap = container_of(item, struct xfs_bmap_intent, bi_list); + kmem_free(bmap); +} + +static const struct xfs_defer_op_type xfs_bmap_update_defer_type = { + .type = XFS_DEFER_OPS_TYPE_BMAP, + .max_items = XFS_BUI_MAX_FAST_EXTENTS, + .diff_items = xfs_bmap_update_diff_items, + .create_intent = xfs_bmap_update_create_intent, + .abort_intent = xfs_bmap_update_abort_intent, + .log_item = xfs_bmap_update_log_item, + .create_done = xfs_bmap_update_create_done, + .finish_item = xfs_bmap_update_finish_item, + .cancel_item = xfs_bmap_update_cancel_item, +}; + +/* Register the deferred op type. */ +void +xfs_bmap_update_init_defer_op(void) +{ + xfs_defer_init_op_type(&xfs_bmap_update_defer_type); +} diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c new file mode 100644 index 000000000000..94c1877af834 --- /dev/null +++ b/fs/xfs/xfs_trans_refcount.c @@ -0,0 +1,264 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_refcount_item.h" +#include "xfs_alloc.h" +#include "xfs_refcount.h" + +/* + * This routine is called to allocate a "refcount update done" + * log item. + */ +struct xfs_cud_log_item * +xfs_trans_get_cud( + struct xfs_trans *tp, + struct xfs_cui_log_item *cuip) +{ + struct xfs_cud_log_item *cudp; + + cudp = xfs_cud_init(tp->t_mountp, cuip); + xfs_trans_add_item(tp, &cudp->cud_item); + return cudp; +} + +/* + * Finish an refcount update and log it to the CUD. Note that the + * transaction is marked dirty regardless of whether the refcount + * update succeeds or fails to support the CUI/CUD lifecycle rules. + */ +int +xfs_trans_log_finish_refcount_update( + struct xfs_trans *tp, + struct xfs_cud_log_item *cudp, + struct xfs_defer_ops *dop, + enum xfs_refcount_intent_type type, + xfs_fsblock_t startblock, + xfs_extlen_t blockcount, + xfs_fsblock_t *new_fsb, + xfs_extlen_t *new_len, + struct xfs_btree_cur **pcur) +{ + int error; + + error = xfs_refcount_finish_one(tp, dop, type, startblock, + blockcount, new_fsb, new_len, pcur); + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the CUI and frees the CUD + * 2.) shuts down the filesystem + */ + tp->t_flags |= XFS_TRANS_DIRTY; + cudp->cud_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + return error; +} + +/* Sort refcount intents by AG. */ +static int +xfs_refcount_update_diff_items( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_mount *mp = priv; + struct xfs_refcount_intent *ra; + struct xfs_refcount_intent *rb; + + ra = container_of(a, struct xfs_refcount_intent, ri_list); + rb = container_of(b, struct xfs_refcount_intent, ri_list); + return XFS_FSB_TO_AGNO(mp, ra->ri_startblock) - + XFS_FSB_TO_AGNO(mp, rb->ri_startblock); +} + +/* Get an CUI. */ +STATIC void * +xfs_refcount_update_create_intent( + struct xfs_trans *tp, + unsigned int count) +{ + struct xfs_cui_log_item *cuip; + + ASSERT(tp != NULL); + ASSERT(count > 0); + + cuip = xfs_cui_init(tp->t_mountp, count); + ASSERT(cuip != NULL); + + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &cuip->cui_item); + return cuip; +} + +/* Set the phys extent flags for this reverse mapping. */ +static void +xfs_trans_set_refcount_flags( + struct xfs_phys_extent *refc, + enum xfs_refcount_intent_type type) +{ + refc->pe_flags = 0; + switch (type) { + case XFS_REFCOUNT_INCREASE: + case XFS_REFCOUNT_DECREASE: + case XFS_REFCOUNT_ALLOC_COW: + case XFS_REFCOUNT_FREE_COW: + refc->pe_flags |= type; + break; + default: + ASSERT(0); + } +} + +/* Log refcount updates in the intent item. */ +STATIC void +xfs_refcount_update_log_item( + struct xfs_trans *tp, + void *intent, + struct list_head *item) +{ + struct xfs_cui_log_item *cuip = intent; + struct xfs_refcount_intent *refc; + uint next_extent; + struct xfs_phys_extent *ext; + + refc = container_of(item, struct xfs_refcount_intent, ri_list); + + tp->t_flags |= XFS_TRANS_DIRTY; + cuip->cui_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + /* + * atomic_inc_return gives us the value after the increment; + * we want to use it as an array index so we need to subtract 1 from + * it. + */ + next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1; + ASSERT(next_extent < cuip->cui_format.cui_nextents); + ext = &cuip->cui_format.cui_extents[next_extent]; + ext->pe_startblock = refc->ri_startblock; + ext->pe_len = refc->ri_blockcount; + xfs_trans_set_refcount_flags(ext, refc->ri_type); +} + +/* Get an CUD so we can process all the deferred refcount updates. */ +STATIC void * +xfs_refcount_update_create_done( + struct xfs_trans *tp, + void *intent, + unsigned int count) +{ + return xfs_trans_get_cud(tp, intent); +} + +/* Process a deferred refcount update. */ +STATIC int +xfs_refcount_update_finish_item( + struct xfs_trans *tp, + struct xfs_defer_ops *dop, + struct list_head *item, + void *done_item, + void **state) +{ + struct xfs_refcount_intent *refc; + xfs_fsblock_t new_fsb; + xfs_extlen_t new_aglen; + int error; + + refc = container_of(item, struct xfs_refcount_intent, ri_list); + error = xfs_trans_log_finish_refcount_update(tp, done_item, dop, + refc->ri_type, + refc->ri_startblock, + refc->ri_blockcount, + &new_fsb, &new_aglen, + (struct xfs_btree_cur **)state); + /* Did we run out of reservation? Requeue what we didn't finish. */ + if (!error && new_aglen > 0) { + ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE || + refc->ri_type == XFS_REFCOUNT_DECREASE); + refc->ri_startblock = new_fsb; + refc->ri_blockcount = new_aglen; + return -EAGAIN; + } + kmem_free(refc); + return error; +} + +/* Clean up after processing deferred refcounts. */ +STATIC void +xfs_refcount_update_finish_cleanup( + struct xfs_trans *tp, + void *state, + int error) +{ + struct xfs_btree_cur *rcur = state; + + xfs_refcount_finish_one_cleanup(tp, rcur, error); +} + +/* Abort all pending CUIs. */ +STATIC void +xfs_refcount_update_abort_intent( + void *intent) +{ + xfs_cui_release(intent); +} + +/* Cancel a deferred refcount update. */ +STATIC void +xfs_refcount_update_cancel_item( + struct list_head *item) +{ + struct xfs_refcount_intent *refc; + + refc = container_of(item, struct xfs_refcount_intent, ri_list); + kmem_free(refc); +} + +static const struct xfs_defer_op_type xfs_refcount_update_defer_type = { + .type = XFS_DEFER_OPS_TYPE_REFCOUNT, + .max_items = XFS_CUI_MAX_FAST_EXTENTS, + .diff_items = xfs_refcount_update_diff_items, + .create_intent = xfs_refcount_update_create_intent, + .abort_intent = xfs_refcount_update_abort_intent, + .log_item = xfs_refcount_update_log_item, + .create_done = xfs_refcount_update_create_done, + .finish_item = xfs_refcount_update_finish_item, + .finish_cleanup = xfs_refcount_update_finish_cleanup, + .cancel_item = xfs_refcount_update_cancel_item, +}; + +/* Register the deferred op type. */ +void +xfs_refcount_update_init_defer_op(void) +{ + xfs_defer_init_op_type(&xfs_refcount_update_defer_type); +} diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c index 5a50ef881568..9ead064b5e90 100644 --- a/fs/xfs/xfs_trans_rmap.c +++ b/fs/xfs/xfs_trans_rmap.c @@ -48,12 +48,21 @@ xfs_trans_set_rmap_flags( case XFS_RMAP_MAP: rmap->me_flags |= XFS_RMAP_EXTENT_MAP; break; + case XFS_RMAP_MAP_SHARED: + rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED; + break; case XFS_RMAP_UNMAP: rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP; break; + case XFS_RMAP_UNMAP_SHARED: + rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED; + break; case XFS_RMAP_CONVERT: rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT; break; + case XFS_RMAP_CONVERT_SHARED: + rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED; + break; case XFS_RMAP_ALLOC: rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC; break; |