From c2b93e0699723700f886ce17bb65ffd771195a6d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 7 May 2013 11:28:31 -0400 Subject: cifs: only set ops for inodes in I_NEW state It's generally not safe to reset the inode ops once they've been set. In the case where the inode was originally thought to be a directory and then later found to be a DFS referral, this can lead to an oops when we try to trigger an inode op on it after changing the ops to the blank referral operations. Cc: Reported-and-Tested-by: Sachin Prabhu Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index fc3025199cb3..20efd81266c6 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -171,7 +171,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL) inode->i_flags |= S_AUTOMOUNT; - cifs_set_ops(inode); + if (inode->i_state & I_NEW) + cifs_set_ops(inode); } void -- cgit v1.2.3 From 3fab70c165795431f00ddf9be8b84ddd07bd1f8f Mon Sep 17 00:00:00 2001 From: Lingzhu Xiang Date: Fri, 10 May 2013 18:29:21 +0800 Subject: efivarfs: Never return ENOENT from firmware again Previously in 1fa7e69 efi_status_to_err() translated firmware status EFI_NOT_FOUND to -EIO instead of -ENOENT for efivarfs operations to avoid confusion. After refactoring in e14ab23, it is also used in other places where the translation may be unnecessary. So move the translation to efivarfs specific code. Also return EOF for reading zero-length files, which is what users would expect. Cc: Josh Boyer Cc: Jeremy Kerr Cc: Lee, Chun-Yi Cc: Andy Whitcroft Signed-off-by: Lingzhu Xiang Signed-off-by: Matt Fleming --- fs/efivarfs/file.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index bfb531564319..8dd524f32284 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -44,8 +44,11 @@ static ssize_t efivarfs_file_write(struct file *file, bytes = efivar_entry_set_get_size(var, attributes, &datasize, data, &set); - if (!set && bytes) + if (!set && bytes) { + if (bytes == -ENOENT) + bytes = -EIO; goto out; + } if (bytes == -ENOENT) { drop_nlink(inode); @@ -76,7 +79,14 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf, int err; err = efivar_entry_size(var, &datasize); - if (err) + + /* + * efivarfs represents uncommitted variables with + * zero-length files. Reading them should return EOF. + */ + if (err == -ENOENT) + return 0; + else if (err) return err; data = kmalloc(datasize + sizeof(attributes), GFP_KERNEL); -- cgit v1.2.3 From de82b923012ff8790bcfff381eb3ca9069d00f49 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 14 May 2013 11:25:39 -0400 Subject: fuse: allocate for_background dio requests based on io->async state Commit 8b41e671 introduced explicit background checking for fuse_req structures with BUG_ON() checks for the appropriate type of request in in the associated send functions. Commit bcba24cc introduced the ability to send dio requests as background requests but does not update the request allocation based on the type of I/O request. As a result, a BUG_ON() triggers in the fuse_request_send_background() background path if an async I/O is sent. Allocate a request based on the async state of the fuse_io_priv to avoid the BUG. Signed-off-by: Brian Foster Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d1c9b85b3f58..fe191325fefa 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1278,7 +1278,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, iov_iter_init(&ii, iov, nr_segs, count, 0); - req = fuse_get_req(fc, fuse_iter_npages(&ii)); + if (io->async) + req = fuse_get_req_for_background(fc, fuse_iter_npages(&ii)); + else + req = fuse_get_req(fc, fuse_iter_npages(&ii)); if (IS_ERR(req)) return PTR_ERR(req); @@ -1314,7 +1317,11 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, break; if (count) { fuse_put_request(fc, req); - req = fuse_get_req(fc, fuse_iter_npages(&ii)); + if (io->async) + req = fuse_get_req_for_background(fc, + fuse_iter_npages(&ii)); + else + req = fuse_get_req(fc, fuse_iter_npages(&ii)); if (IS_ERR(req)) break; } -- cgit v1.2.3 From 3634a6327815d39dd93e5c44a602daae91c66297 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 17 May 2013 09:30:32 -0400 Subject: fuse: truncate pagecache range on hole punch fuse supports hole punch via the fallocate() FALLOC_FL_PUNCH_HOLE interface. When a hole punch is passed through, the page cache is not cleared and thus allows reading stale data from the cache. This is easily demonstrable (using FOPEN_KEEP_CACHE) by reading a smallish random data file into cache, punching a hole and creating a copy of the file. Drop caches or remount and observe that the original file no longer matches the file copied after the hole punch. The original file contains a zeroed range and the latter file contains stale data. Protect against writepage requests in progress and punch out the associated page cache range after a successful client fs hole punch. Signed-off-by: Brian Foster Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index fe191325fefa..a200a2d80377 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -16,6 +16,7 @@ #include #include #include +#include static const struct file_operations fuse_direct_io_file_operations; @@ -2453,6 +2454,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, loff_t length) { struct fuse_file *ff = file->private_data; + struct inode *inode = file->f_inode; struct fuse_conn *fc = ff->fc; struct fuse_req *req; struct fuse_fallocate_in inarg = { @@ -2466,9 +2468,16 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (fc->no_fallocate) return -EOPNOTSUPP; + if (mode & FALLOC_FL_PUNCH_HOLE) { + mutex_lock(&inode->i_mutex); + fuse_set_nowrite(inode); + } + req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } req->in.h.opcode = FUSE_FALLOCATE; req->in.h.nodeid = ff->nodeid; @@ -2483,6 +2492,15 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, } fuse_put_request(fc, req); +out: + if (mode & FALLOC_FL_PUNCH_HOLE) { + if (!err) + truncate_pagecache_range(inode, offset, + offset + length - 1); + fuse_release_nowrite(inode); + mutex_unlock(&inode->i_mutex); + } + return err; } -- cgit v1.2.3 From bee6c307800bbb26ba1a855b1841c2f0c4b7622a Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 17 May 2013 15:27:34 -0400 Subject: fuse: update inode size and invalidate attributes on fallocate An fallocate request without FALLOC_FL_KEEP_SIZE set can extend the size of a file. Update the inode size after a successful fallocate. Also invalidate the inode attributes after a successful fallocate to ensure we pick up the latest attribute values (i.e., i_blocks). Signed-off-by: Brian Foster Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a200a2d80377..d9f467907791 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2492,11 +2492,20 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, } fuse_put_request(fc, req); + if (err) + goto out; + + /* we could have extended the file */ + if (!(mode & FALLOC_FL_KEEP_SIZE)) + fuse_write_update_size(inode, offset + length); + + if (mode & FALLOC_FL_PUNCH_HOLE) + truncate_pagecache_range(inode, offset, offset + length - 1); + + fuse_invalidate_attr(inode); + out: if (mode & FALLOC_FL_PUNCH_HOLE) { - if (!err) - truncate_pagecache_range(inode, offset, - offset + length - 1); fuse_release_nowrite(inode); mutex_unlock(&inode->i_mutex); } -- cgit v1.2.3 From 774d5f14ee1ecac55f42a84ff35eb00b896b00b6 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Mon, 20 May 2013 14:13:50 -0400 Subject: NFSv4.1 Fix a pNFS session draining deadlock On a CB_RECALL the callback service thread flushes the inode using filemap_flush prior to scheduling the state manager thread to return the delegation. When pNFS is used and I/O has not yet gone to the data server servicing the inode, a LAYOUTGET can preceed the I/O. Unlike the async filemap_flush call, the LAYOUTGET must proceed to completion. If the state manager starts to recover data while the inode flush is sending the LAYOUTGET, a deadlock occurs as the callback service thread holds the single callback session slot until the flushing is done which blocks the state manager thread, and the state manager thread has set the session draining bit which puts the inode flush LAYOUTGET RPC to sleep on the forechannel slot table waitq. Separate the draining of the back channel from the draining of the fore channel by moving the NFS4_SESSION_DRAINING bit from session scope into the fore and back slot tables. Drain the back channel first allowing the LAYOUTGET call to proceed (and fail) so the callback service thread frees the callback slot. Then proceed with draining the forechannel. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 2 +- fs/nfs/callback_xdr.c | 2 +- fs/nfs/nfs4proc.c | 2 +- fs/nfs/nfs4session.c | 4 ++-- fs/nfs/nfs4session.h | 13 ++++++++----- fs/nfs/nfs4state.c | 15 +++++++-------- 6 files changed, 20 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index a13d26ede254..0bc27684ebfa 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -414,7 +414,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, spin_lock(&tbl->slot_tbl_lock); /* state manager is resetting the session */ - if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { + if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { spin_unlock(&tbl->slot_tbl_lock); status = htonl(NFS4ERR_DELAY); /* Return NFS4ERR_BADSESSION if we're draining the session diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 59461c957d9d..a35582c9d444 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -763,7 +763,7 @@ static void nfs4_callback_free_slot(struct nfs4_session *session) * A single slot, so highest used slotid is either 0 or -1 */ tbl->highest_used_slotid = NFS4_NO_SLOT; - nfs4_session_drain_complete(session, tbl); + nfs4_slot_tbl_drain_complete(tbl); spin_unlock(&tbl->slot_tbl_lock); } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8fbc10054115..4e2fe714d5c2 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -572,7 +572,7 @@ int nfs41_setup_sequence(struct nfs4_session *session, task->tk_timeout = 0; spin_lock(&tbl->slot_tbl_lock); - if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && + if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state) && !args->sa_privileged) { /* The state manager will wait until the slot table is empty */ dprintk("%s session is draining\n", __func__); diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index ebda5f4a031b..c4e225e4a9af 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -73,7 +73,7 @@ void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot) tbl->highest_used_slotid = new_max; else { tbl->highest_used_slotid = NFS4_NO_SLOT; - nfs4_session_drain_complete(tbl->session, tbl); + nfs4_slot_tbl_drain_complete(tbl); } } dprintk("%s: slotid %u highest_used_slotid %d\n", __func__, @@ -226,7 +226,7 @@ static bool nfs41_assign_slot(struct rpc_task *task, void *pslot) struct nfs4_slot *slot = pslot; struct nfs4_slot_table *tbl = slot->table; - if (nfs4_session_draining(tbl->session) && !args->sa_privileged) + if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) return false; slot->generation = tbl->generation; args->sa_slot = slot; diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 6f3cb39386d4..ff7d9f0f8a65 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -25,6 +25,10 @@ struct nfs4_slot { }; /* Sessions */ +enum nfs4_slot_tbl_state { + NFS4_SLOT_TBL_DRAINING, +}; + #define SLOT_TABLE_SZ DIV_ROUND_UP(NFS4_MAX_SLOT_TABLE, 8*sizeof(long)) struct nfs4_slot_table { struct nfs4_session *session; /* Parent session */ @@ -43,6 +47,7 @@ struct nfs4_slot_table { unsigned long generation; /* Generation counter for target_highest_slotid */ struct completion complete; + unsigned long slot_tbl_state; }; /* @@ -68,7 +73,6 @@ struct nfs4_session { enum nfs4_session_state { NFS4_SESSION_INITING, - NFS4_SESSION_DRAINING, }; #if defined(CONFIG_NFS_V4_1) @@ -88,12 +92,11 @@ extern void nfs4_destroy_session(struct nfs4_session *session); extern int nfs4_init_session(struct nfs_server *server); extern int nfs4_init_ds_session(struct nfs_client *, unsigned long); -extern void nfs4_session_drain_complete(struct nfs4_session *session, - struct nfs4_slot_table *tbl); +extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl); -static inline bool nfs4_session_draining(struct nfs4_session *session) +static inline bool nfs4_slot_tbl_draining(struct nfs4_slot_table *tbl) { - return !!test_bit(NFS4_SESSION_DRAINING, &session->session_state); + return !!test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); } bool nfs41_wake_and_assign_slot(struct nfs4_slot_table *tbl, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 300d17d85c0e..1fab140764c4 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -241,7 +241,7 @@ static void nfs4_end_drain_session(struct nfs_client *clp) if (ses == NULL) return; tbl = &ses->fc_slot_table; - if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { + if (test_and_clear_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { spin_lock(&tbl->slot_tbl_lock); nfs41_wake_slot_table(tbl); spin_unlock(&tbl->slot_tbl_lock); @@ -251,15 +251,15 @@ static void nfs4_end_drain_session(struct nfs_client *clp) /* * Signal state manager thread if session fore channel is drained */ -void nfs4_session_drain_complete(struct nfs4_session *session, - struct nfs4_slot_table *tbl) +void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl) { - if (nfs4_session_draining(session)) + if (nfs4_slot_tbl_draining(tbl)) complete(&tbl->complete); } -static int nfs4_wait_on_slot_tbl(struct nfs4_slot_table *tbl) +static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl) { + set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state); spin_lock(&tbl->slot_tbl_lock); if (tbl->highest_used_slotid != NFS4_NO_SLOT) { INIT_COMPLETION(tbl->complete); @@ -275,13 +275,12 @@ static int nfs4_begin_drain_session(struct nfs_client *clp) struct nfs4_session *ses = clp->cl_session; int ret = 0; - set_bit(NFS4_SESSION_DRAINING, &ses->session_state); /* back channel */ - ret = nfs4_wait_on_slot_tbl(&ses->bc_slot_table); + ret = nfs4_drain_slot_tbl(&ses->bc_slot_table); if (ret) return ret; /* fore channel */ - return nfs4_wait_on_slot_tbl(&ses->fc_slot_table); + return nfs4_drain_slot_tbl(&ses->fc_slot_table); } static void nfs41_finish_session_reset(struct nfs_client *clp) -- cgit v1.2.3 From 83c168bf8017212a9d502536f9dcd0b54d24e330 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 15 May 2013 22:00:10 -0400 Subject: NFS: Fix SETCLIENTID fallback if GSS is not available Commit 79d852bf "NFS: Retry SETCLIENTID with AUTH_SYS instead of AUTH_NONE" did not take into account commit 23631227 "NFSv4: Fix the fallback to AUTH_NULL if krb5i is not available". Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 947b0c908aa9..4cbad5d6b276 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -203,7 +203,7 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); if (error == -EINVAL) - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_NULL); + error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); if (error < 0) goto error; -- cgit v1.2.3 From 62106e96279f66c52e2123782f9420af9dbe8cbe Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 7 May 2013 11:28:31 -0400 Subject: cifs: only set ops for inodes in I_NEW state It's generally not safe to reset the inode ops once they've been set. In the case where the inode was originally thought to be a directory and then later found to be a DFS referral, this can lead to an oops when we try to trigger an inode op on it after changing the ops to the blank referral operations. Cc: Reported-and-Tested-by: Sachin Prabhu Signed-off-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Steve French --- fs/cifs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index fc3025199cb3..20efd81266c6 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -171,7 +171,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL) inode->i_flags |= S_AUTOMOUNT; - cifs_set_ops(inode); + if (inode->i_state & I_NEW) + cifs_set_ops(inode); } void -- cgit v1.2.3 From 166faf21bd14bc5c5295a44874bf7f3930c30b20 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 24 May 2013 07:40:04 -0400 Subject: cifs: fix potential buffer overrun when composing a new options string Consider the case where we have a very short ip= string in the original mount options, and when we chase a referral we end up with a very long IPv6 address. Be sure to allow for that possibility when estimating the size of the string to allocate. Cc: Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifs_dfs_ref.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 8e33ec65847b..8e5a1817496d 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "cifsglob.h" #include "cifsproto.h" #include "cifsfs.h" @@ -150,7 +151,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata, * assuming that we have 'unc=' and 'ip=' in * the original sb_mountdata */ - md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12; + md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12 + + INET6_ADDRSTRLEN; mountdata = kzalloc(md_len+1, GFP_KERNEL); if (mountdata == NULL) { rc = -ENOMEM; -- cgit v1.2.3 From 539673fff76af73c3aee96e0de10edcc97d84db3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 24 May 2013 07:40:04 -0400 Subject: cifs: allow sec=none mounts to work against servers that don't support extended security In the case of sec=none, we're not sending a username or password, so there's little benefit to mandating NTLMSSP auth. Allow it to use unencapsulated auth in that case. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 99eeaa17ee00..0a7fdc31f253 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1061,6 +1061,7 @@ static int cifs_parse_security_flavors(char *value, #endif case Opt_sec_none: vol->nullauth = 1; + vol->secFlg |= CIFSSEC_MAY_NTLM; break; default: cifs_dbg(VFS, "bad security option: %s\n", value); -- cgit v1.2.3 From 37d4f99b55d46d9f71f4769faf74c95adb2c1daf Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 24 May 2013 07:40:05 -0400 Subject: cifs: fix error handling when calling cifs_parse_devname When we allowed separate unc= and prefixpath= mount options, we could ignore EINVAL errors from cifs_parse_devname. Now that they are deprecated, we need to check for that as well and fail the mount if it's malformed. Also fix a later error message that refers to the unc= option. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0a7fdc31f253..5b97e56ddbca 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1258,14 +1258,18 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->backupuid_specified = false; /* no backup intent for a user */ vol->backupgid_specified = false; /* no backup intent for a group */ - /* - * For now, we ignore -EINVAL errors under the assumption that the - * unc= and prefixpath= options will be usable. - */ - if (cifs_parse_devname(devname, vol) == -ENOMEM) { - printk(KERN_ERR "CIFS: Unable to allocate memory to parse " - "device string.\n"); - goto out_nomem; + switch (cifs_parse_devname(devname, vol)) { + case 0: + break; + case -ENOMEM: + cifs_dbg(VFS, "Unable to allocate memory for devname.\n"); + goto cifs_parse_mount_err; + case -EINVAL: + cifs_dbg(VFS, "Malformed UNC in devname.\n"); + goto cifs_parse_mount_err; + default: + cifs_dbg(VFS, "Unknown error parsing devname.\n"); + goto cifs_parse_mount_err; } while ((data = strsep(&options, separator)) != NULL) { @@ -1827,7 +1831,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } #endif if (!vol->UNC) { - cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string or in unc= option!\n"); + cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string!\n"); goto cifs_parse_mount_err; } -- cgit v1.2.3 From 9c9c29e1af2ff0459087876e3800078555794b60 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 24 May 2013 07:40:05 -0400 Subject: cifs: stop printing the unc= option in /proc/mounts Since we no longer recognize that option, stop printing it out. The devicename is now the canonical source for this info. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 72e4efee1389..3752b9f6d9e4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -372,9 +372,6 @@ cifs_show_options(struct seq_file *s, struct dentry *root) cifs_show_security(s, tcon->ses->server); cifs_show_cache_flavor(s, cifs_sb); - seq_printf(s, ",unc="); - seq_escape(s, tcon->treeName, " \t\n\\"); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) seq_printf(s, ",multiuser"); else if (tcon->ses->user_name) -- cgit v1.2.3 From d9deef0a3f38bcc1c155e8d9a8b522404e5e648c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 24 May 2013 07:40:06 -0400 Subject: cifs: fix composing of mount options for DFS referrals With the change to ignore the unc= and prefixpath= mount options, there is no longer any need to add them to the options string when mounting. By the same token, we now need to build a device name that includes the prefixpath when mounting. To make things neater, the delimiters on the devicename are changed to '/' since that's preferred when mounting anyway. v2: fix some comments and don't bother looking at whether there is a prepath in the ref->node_name when deciding whether to pass a prepath to cifs_build_devname. v3: rebase on top of potential buffer overrun fix for stable Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifs_dfs_ref.c | 141 +++++++++++++++++++++++++------------------------ fs/cifs/dns_resolve.c | 4 +- 2 files changed, 73 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 8e5a1817496d..58df174deb10 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -49,58 +49,74 @@ void cifs_dfs_release_automount_timer(void) } /** - * cifs_get_share_name - extracts share name from UNC - * @node_name: pointer to UNC string + * cifs_build_devname - build a devicename from a UNC and optional prepath + * @nodename: pointer to UNC string + * @prepath: pointer to prefixpath (or NULL if there isn't one) * - * Extracts sharename form full UNC. - * i.e. strips from UNC trailing path that is not part of share - * name and fixup missing '\' in the beginning of DFS node refferal - * if necessary. - * Returns pointer to share name on success or ERR_PTR on error. - * Caller is responsible for freeing returned string. + * Build a new cifs devicename after chasing a DFS referral. Allocate a buffer + * big enough to hold the final thing. Copy the UNC from the nodename, and + * concatenate the prepath onto the end of it if there is one. + * + * Returns pointer to the built string, or a ERR_PTR. Caller is responsible + * for freeing the returned string. */ -static char *cifs_get_share_name(const char *node_name) +static char * +cifs_build_devname(char *nodename, const char *prepath) { - int len; - char *UNC; - char *pSep; - - len = strlen(node_name); - UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */, - GFP_KERNEL); - if (!UNC) - return ERR_PTR(-ENOMEM); + size_t pplen; + size_t unclen; + char *dev; + char *pos; + + /* skip over any preceding delimiters */ + nodename += strspn(nodename, "\\"); + if (!*nodename) + return ERR_PTR(-EINVAL); - /* get share name and server name */ - if (node_name[1] != '\\') { - UNC[0] = '\\'; - strncpy(UNC+1, node_name, len); - len++; - UNC[len] = 0; - } else { - strncpy(UNC, node_name, len); - UNC[len] = 0; - } + /* get length of UNC and set pos to last char */ + unclen = strlen(nodename); + pos = nodename + unclen - 1; - /* find server name end */ - pSep = memchr(UNC+2, '\\', len-2); - if (!pSep) { - cifs_dbg(VFS, "%s: no server name end in node name: %s\n", - __func__, node_name); - kfree(UNC); - return ERR_PTR(-EINVAL); + /* trim off any trailing delimiters */ + while (*pos == '\\') { + --pos; + --unclen; } - /* find sharename end */ - pSep++; - pSep = memchr(UNC+(pSep-UNC), '\\', len-(pSep-UNC)); - if (pSep) { - /* trim path up to sharename end - * now we have share name in UNC */ - *pSep = 0; + /* allocate a buffer: + * +2 for preceding "//" + * +1 for delimiter between UNC and prepath + * +1 for trailing NULL + */ + pplen = prepath ? strlen(prepath) : 0; + dev = kmalloc(2 + unclen + 1 + pplen + 1, GFP_KERNEL); + if (!dev) + return ERR_PTR(-ENOMEM); + + pos = dev; + /* add the initial "//" */ + *pos = '/'; + ++pos; + *pos = '/'; + ++pos; + + /* copy in the UNC portion from referral */ + memcpy(pos, nodename, unclen); + pos += unclen; + + /* copy the prefixpath remainder (if there is one) */ + if (pplen) { + *pos = '/'; + ++pos; + memcpy(pos, prepath, pplen); + pos += pplen; } - return UNC; + /* NULL terminator */ + *pos = '\0'; + + convert_delimiter(dev, '/'); + return dev; } @@ -124,6 +140,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata, { int rc; char *mountdata = NULL; + const char *prepath = NULL; int md_len; char *tkn_e; char *srvIP = NULL; @@ -133,7 +150,10 @@ char *cifs_compose_mount_options(const char *sb_mountdata, if (sb_mountdata == NULL) return ERR_PTR(-EINVAL); - *devname = cifs_get_share_name(ref->node_name); + if (strlen(fullpath) - ref->path_consumed) + prepath = fullpath + ref->path_consumed; + + *devname = cifs_build_devname(ref->node_name, prepath); if (IS_ERR(*devname)) { rc = PTR_ERR(*devname); *devname = NULL; @@ -147,13 +167,14 @@ char *cifs_compose_mount_options(const char *sb_mountdata, goto compose_mount_options_err; } - /* md_len = strlen(...) + 12 for 'sep+prefixpath=' - * assuming that we have 'unc=' and 'ip=' in - * the original sb_mountdata + /* + * In most cases, we'll be building a shorter string than the original, + * but we do have to assume that the address in the ip= option may be + * much longer than the original. Add the max length of an address + * string to the length of the original string to allow for worst case. */ - md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12 + - INET6_ADDRSTRLEN; - mountdata = kzalloc(md_len+1, GFP_KERNEL); + md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN; + mountdata = kzalloc(md_len + 1, GFP_KERNEL); if (mountdata == NULL) { rc = -ENOMEM; goto compose_mount_options_err; @@ -197,26 +218,6 @@ char *cifs_compose_mount_options(const char *sb_mountdata, strncat(mountdata, &sep, 1); strcat(mountdata, "ip="); strcat(mountdata, srvIP); - strncat(mountdata, &sep, 1); - strcat(mountdata, "unc="); - strcat(mountdata, *devname); - - /* find & copy prefixpath */ - tkn_e = strchr(ref->node_name + 2, '\\'); - if (tkn_e == NULL) { - /* invalid unc, missing share name*/ - rc = -EINVAL; - goto compose_mount_options_err; - } - - tkn_e = strchr(tkn_e + 1, '\\'); - if (tkn_e || (strlen(fullpath) - ref->path_consumed)) { - strncat(mountdata, &sep, 1); - strcat(mountdata, "prefixpath="); - if (tkn_e) - strcat(mountdata, tkn_e + 1); - strcat(mountdata, fullpath + ref->path_consumed); - } /*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/ /*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/ diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index e7512e497611..7ede7306599f 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -34,7 +34,7 @@ /** * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address. - * @unc: UNC path specifying the server + * @unc: UNC path specifying the server (with '/' as delimiter) * @ip_addr: Where to return the IP address. * * The IP address will be returned in string form, and the caller is @@ -64,7 +64,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) hostname = unc + 2; /* Search for server name delimiter */ - sep = memchr(hostname, '\\', len); + sep = memchr(hostname, '/', len); if (sep) len = sep - hostname; else -- cgit v1.2.3 From e9b376671910d105c5e61103111b96209c729529 Mon Sep 17 00:00:00 2001 From: Vahram Martirosyan Date: Fri, 24 May 2013 13:57:12 +0500 Subject: jfs: Several bugs in jfs_freeze() and jfs_unfreeze() The mentioned functions do not pay attention to the error codes returned by the functions updateSuper(), lmLogInit() and lmLogShutdown(). It brings to system crash later when writing to log. The patch adds corresponding code to check and return the error codes and to print correct error messages in case of errors. Found by Linux File System Verification project (linuxtesting.org). Signed-off-by: Vahram Martirosyan Reviewed-by: Gu Zheng Signed-off-by: Dave Kleikamp --- fs/jfs/super.c | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 2003e830ed1c..788e0a9c1fb0 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -611,11 +611,28 @@ static int jfs_freeze(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; + int rc = 0; if (!(sb->s_flags & MS_RDONLY)) { txQuiesce(sb); - lmLogShutdown(log); - updateSuper(sb, FM_CLEAN); + rc = lmLogShutdown(log); + if (rc) { + jfs_error(sb, "jfs_freeze: lmLogShutdown failed"); + + /* let operations fail rather than hang */ + txResume(sb); + + return rc; + } + rc = updateSuper(sb, FM_CLEAN); + if (rc) { + jfs_err("jfs_freeze: updateSuper failed\n"); + /* + * Don't fail here. Everything succeeded except + * marking the superblock clean, so there's really + * no harm in leaving it frozen for now. + */ + } } return 0; } @@ -627,13 +644,18 @@ static int jfs_unfreeze(struct super_block *sb) int rc = 0; if (!(sb->s_flags & MS_RDONLY)) { - updateSuper(sb, FM_MOUNT); - if ((rc = lmLogInit(log))) - jfs_err("jfs_unlock failed with return code %d", rc); - else - txResume(sb); + rc = updateSuper(sb, FM_MOUNT); + if (rc) { + jfs_error(sb, "jfs_unfreeze: updateSuper failed"); + goto out; + } + rc = lmLogInit(log); + if (rc) + jfs_error(sb, "jfs_unfreeze: lmLogInit failed"); +out: + txResume(sb); } - return 0; + return rc; } static struct dentry *jfs_do_mount(struct file_system_type *fs_type, -- cgit v1.2.3 From 95bbb82f60c80808e5a49d8233c2de8451901531 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Thu, 23 May 2013 16:14:19 +0800 Subject: fs/jfs: Add check if journaling to disk has been disabled in lbmRead() Signed-off-by: Gu Zheng Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_logmgr.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index c57499dca89c..360d27c48887 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2009,7 +2009,13 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; - submit_bio(READ_SYNC, bio); + /*check if journaling to disk has been disabled*/ + if (log->no_integrity) { + bio->bi_size = 0; + lbmIODone(bio, 0); + } else { + submit_bio(READ_SYNC, bio); + } wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD)); -- cgit v1.2.3 From 480d7467e4aaa3dc38088baf56bc3eb3599f5d26 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 20 May 2013 09:51:08 +1000 Subject: xfs: fix sub-page blocksize data integrity writes FSX on 512 byte block size filesystems has been failing for some time with corrupted data. The fault dates back to the change in the writeback data integrity algorithm that uses a mark-and-sweep approach to avoid data writeback livelocks. Unfortunately, a side effect of this mark-and-sweep approach is that each page will only be written once for a data integrity sync, and there is a condition in writeback in XFS where a page may require two writeback attempts to be fully written. As a result of the high level change, we now only get a partial page writeback during the integrity sync because the first pass through writeback clears the mark left on the page index to tell writeback that the page needs writeback.... The cause is writing a partial page in the clustering code. This can happen when a mapping boundary falls in the middle of a page - we end up writing back the first part of the page that the mapping covers, but then never revisit the page to have the remainder mapped and written. The fix is simple - if the mapping boundary falls inside a page, then simple abort clustering without touching the page. This means that the next ->writepage entry that write_cache_pages() will make is the page we aborted on, and xfs_vm_writepage() will map all sections of the page correctly. This behaviour is also optimal for non-data integrity writes, as it results in contiguous sequential writeback of the file rather than missing small holes and having to write them a "random" writes in a future pass. With this fix, all the fsx tests in xfstests now pass on a 512 byte block size filesystem on a 4k page machine. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Ben Myers (cherry picked from commit 49b137cbbcc836ef231866c137d24f42c42bb483) --- fs/xfs/xfs_aops.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 2b2691b73428..41a695048be7 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -725,6 +725,25 @@ xfs_convert_page( (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, i_size_read(inode)); + /* + * If the current map does not span the entire page we are about to try + * to write, then give up. The only way we can write a page that spans + * multiple mappings in a single writeback iteration is via the + * xfs_vm_writepage() function. Data integrity writeback requires the + * entire page to be written in a single attempt, otherwise the part of + * the page we don't write here doesn't get written as part of the data + * integrity sync. + * + * For normal writeback, we also don't attempt to write partial pages + * here as it simply means that write_cache_pages() will see it under + * writeback and ignore the page until some point in the future, at + * which time this will be the only page in the file that needs + * writeback. Hence for more optimal IO patterns, we should always + * avoid partial page writeback due to multiple mappings on a page here. + */ + if (!xfs_imap_valid(inode, imap, end_offset)) + goto fail_unlock_page; + len = 1 << inode->i_blkbits; p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), PAGE_CACHE_SIZE); -- cgit v1.2.3 From 7031d0e1c46e2b1c869458233dd216cb72af41b2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 20 May 2013 09:51:09 +1000 Subject: xfs: fix rounding in xfs_free_file_space The offset passed into xfs_free_file_space() needs to be rounded down to a certain size, but the rounding mask is built by a 32 bit variable. Hence the mask will always mask off the upper 32 bits of the offset and lead to incorrect writeback and invalidation ranges. This is not actually exposed as a bug because we writeback and invalidate from the rounded offset to the end of the file, and hence the offset we are actually punching a hole out of will always be covered by the code. This needs fixing, however, if we ever want to use exact ranges for writeback/invalidation here... Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Ben Myers (cherry picked from commit 28ca489c63e9aceed8801d2f82d731b3c9aa50f5) --- fs/xfs/xfs_vnodeops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 1501f4fa51a6..0176bb21f09a 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1453,7 +1453,7 @@ xfs_free_file_space( xfs_mount_t *mp; int nimap; uint resblks; - uint rounding; + xfs_off_t rounding; int rt; xfs_fileoff_t startoffset_fsb; xfs_trans_t *tp; @@ -1482,7 +1482,7 @@ xfs_free_file_space( inode_dio_wait(VFS_I(ip)); } - rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); ioffset = offset & ~(rounding - 1); error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset, -1); -- cgit v1.2.3 From 509e708a8929c5b75a16c985c03db5329e09cad4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 20 May 2013 09:51:10 +1000 Subject: xfs: Don't reference the EFI after it is freed Checking the EFI for whether it is being released from recovery after we've already released the known active reference is a mistake worthy of a brown paper bag. Fix the (now) obvious use after free that it can cause. Reported-by: Dave Jones Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Ben Myers (cherry picked from commit 52c24ad39ff02d7bd73c92eb0c926fb44984a41d) --- fs/xfs/xfs_extfree_item.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index c0f375087efc..452920a3f03f 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -305,11 +305,12 @@ xfs_efi_release(xfs_efi_log_item_t *efip, { ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) { - __xfs_efi_release(efip); - /* recovery needs us to drop the EFI reference, too */ if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) __xfs_efi_release(efip); + + __xfs_efi_release(efip); + /* efip may now have been freed, do not reference it again. */ } } -- cgit v1.2.3 From b17cb364dbbbf65add79f1610599d01bcb6851f9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 20 May 2013 09:51:12 +1000 Subject: xfs: fix missing KM_NOFS tags to keep lockdep happy There are several places where we use KM_SLEEP allocation contexts and use the fact that they are called from transaction context to add KM_NOFS where appropriate. Unfortunately, there are several places where the code makes this assumption but can be called from outside transaction context but with filesystem locks held. These places need explicit KM_NOFS annotations to avoid lockdep complaining about reclaim contexts. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit ac14876cf9255175bf3bdad645bf8aa2b8fb2d7c) --- fs/xfs/xfs_buf.c | 2 +- fs/xfs/xfs_da_btree.c | 6 ++++-- fs/xfs/xfs_dir2_leaf.c | 2 +- fs/xfs/xfs_log_cil.c | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 82b70bda9f47..0d2554299688 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1649,7 +1649,7 @@ xfs_alloc_buftarg( { xfs_buftarg_t *btp; - btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); + btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 9b26a99ebfe9..41ea7e14a7b6 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2464,7 +2464,8 @@ xfs_buf_map_from_irec( ASSERT(nirecs >= 1); if (nirecs > 1) { - map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_SLEEP); + map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), + KM_SLEEP | KM_NOFS); if (!map) return ENOMEM; *mapp = map; @@ -2520,7 +2521,8 @@ xfs_dabuf_map( * Optimize the one-block case. */ if (nfsb != 1) - irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_SLEEP); + irecs = kmem_zalloc(sizeof(irec) * nfsb, + KM_SLEEP | KM_NOFS); nirecs = nfsb; error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs, diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 721ba2fe8e54..da71a1819d78 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -1336,7 +1336,7 @@ xfs_dir2_leaf_getdents( mp->m_sb.sb_blocksize); map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) + (length * sizeof(struct xfs_bmbt_irec)), - KM_SLEEP); + KM_SLEEP | KM_NOFS); map_info->map_size = length; /* diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index e3d0b85d852b..d0833b54e55d 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -139,7 +139,7 @@ xlog_cil_prepare_log_vecs( new_lv = kmem_zalloc(sizeof(*new_lv) + niovecs * sizeof(struct xfs_log_iovec), - KM_SLEEP); + KM_SLEEP|KM_NOFS); /* The allocated iovec region lies beyond the log vector. */ new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1]; -- cgit v1.2.3 From 7ced60cae46cb37273a03c196e6f473b089bd8e1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 20 May 2013 09:51:13 +1000 Subject: xfs: xfs_da3_node_read_verify() doesn't handle XFS_ATTR3_LEAF_MAGIC Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit 72916fb8cbcf0c2928f56cdc2fbe8c7bf5517758) --- fs/xfs/xfs_da_btree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 41ea7e14a7b6..0b8b2a13cd24 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -270,6 +270,7 @@ xfs_da3_node_read_verify( break; return; case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: bp->b_ops = &xfs_attr3_leaf_buf_ops; bp->b_ops->verify_read(bp); return; -- cgit v1.2.3 From cf257abf02709dba3cc745d950f144ce49432b4f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 20 May 2013 09:51:14 +1000 Subject: xfs: xfs_attr_shortform_allfit() does not handle attr3 format. xfstests generic/117 fails with: XFS: Assertion failed: leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) indicating a function that does not handle the attr3 format correctly. Fix it. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit b38958d715316031fe9ea0cc6c22043072a55f49) --- fs/xfs/xfs_attr_leaf.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 08d5457c948e..8eeb88fb3201 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -931,20 +931,22 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) */ int xfs_attr_shortform_allfit( - struct xfs_buf *bp, - struct xfs_inode *dp) + struct xfs_buf *bp, + struct xfs_inode *dp) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; xfs_attr_leaf_name_local_t *name_loc; - int bytes, i; + struct xfs_attr3_icleaf_hdr leafhdr; + int bytes; + int i; leaf = bp->b_addr; - ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); + xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + entry = xfs_attr3_leaf_entryp(leaf); - entry = &leaf->entries[0]; bytes = sizeof(struct xfs_attr_sf_hdr); - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + for (i = 0; i < leafhdr.count; entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) continue; /* don't copy partial entries */ if (!(entry->flags & XFS_ATTR_LOCAL)) @@ -954,15 +956,15 @@ xfs_attr_shortform_allfit( return(0); if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) return(0); - bytes += sizeof(struct xfs_attr_sf_entry)-1 + bytes += sizeof(struct xfs_attr_sf_entry) - 1 + name_loc->namelen + be16_to_cpu(name_loc->valuelen); } if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) - return(-1); - return(xfs_attr_shortform_bytesfit(dp, bytes)); + return -1; + return xfs_attr_shortform_bytesfit(dp, bytes); } /* -- cgit v1.2.3 From 7ae077802c9f12959a81fa1a16c1ec2842dbae05 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 20 May 2013 09:51:16 +1000 Subject: xfs: remote attribute lookups require the value length When reading a remote attribute, to correctly calculate the length of the data buffer for CRC enable filesystems, we need to know the length of the attribute data. We get this information when we look up the attribute, but we don't store it in the args structure along with the other remote attr information we get from the lookup. Add this information to the args structure so we can use it appropriately. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit e461fcb194172b3f709e0b478d2ac1bdac7ab9a3) --- fs/xfs/xfs_attr_leaf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 8eeb88fb3201..0bce1b348580 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -2332,9 +2332,10 @@ xfs_attr3_leaf_lookup_int( if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; + args->valuelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, - be32_to_cpu(name_rmt->valuelen)); + args->valuelen); return XFS_ERROR(EEXIST); } } -- cgit v1.2.3 From 7b92d03c3239f43e5b86c9cc9630f026d36ee995 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 24 May 2013 15:55:08 -0700 Subject: fat: fix possible overflow for fat_clusters Intermediate value of fat_clusters can be overflowed on 32bits arch. Reported-by: Krzysztof Strasburger Signed-off-by: OGAWA Hirofumi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/inode.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fat/inode.c b/fs/fat/inode.c index dfce656ddb33..5d4513cb1b3c 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1229,6 +1229,19 @@ static int fat_read_root(struct inode *inode) return 0; } +static unsigned long calc_fat_clusters(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + + /* Divide first to avoid overflow */ + if (sbi->fat_bits != 12) { + unsigned long ent_per_sec = sb->s_blocksize * 8 / sbi->fat_bits; + return ent_per_sec * sbi->fat_length; + } + + return sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits; +} + /* * Read the super block of an MS-DOS FS. */ @@ -1434,7 +1447,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, sbi->dirty = b->fat16.state & FAT_STATE_DIRTY; /* check that FAT table does not overflow */ - fat_clusters = sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits; + fat_clusters = calc_fat_clusters(sb); total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT); if (total_clusters > MAX_FAT(sb)) { if (!silent) -- cgit v1.2.3 From afe1bb73f8ed588ab6268c27c5a447fe0484e48f Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 24 May 2013 15:55:12 -0700 Subject: ocfs2: unlock rw lock if inode lock failed In ocfs2_file_aio_write(), it does ocfs2_rw_lock() first and then ocfs2_inode_lock(). But if ocfs2_inode_lock() failed, it goes to out_sems without unlocking rw lock. This will cause a bug in ocfs2_lock_res_free() when testing res->l_ex_holders, which is increased in __ocfs2_cluster_lock() and decreased in __ocfs2_cluster_unlock(). Signed-off-by: Joseph Qi Cc: Joel Becker Cc: Mark Fasheh Cc: Li Zefan Cc: "Duyongfeng (B)" Acked-by: Sunil Mushran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 8a7509f9e6f5..ff54014a24ec 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2288,7 +2288,7 @@ relock: ret = ocfs2_inode_lock(inode, NULL, 1); if (ret < 0) { mlog_errno(ret); - goto out_sems; + goto out; } ocfs2_inode_unlock(inode, 1); -- cgit v1.2.3 From fb09c3733a94b5f1dba50359d09c9e217c763fb9 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 24 May 2013 15:55:16 -0700 Subject: hfs: avoid crash in hfs_bnode_create Commit 634725a92938 ("hfs: cleanup HFS+ prints") removed the BUG_ON in hfs_bnode_create in hfsplus. This patch removes it from the hfs version and avoids an fsfuzzer crash. Signed-off-by: Jeff Mahoney Acked-by: Jeff Mahoney Signed-off-by: Jiri Slaby Cc: Vyacheslav Dubeyko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfs/bnode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index f3b1a15ccd59..d3fa6bd9503e 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -415,7 +415,11 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num) spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, num); spin_unlock(&tree->hash_lock); - BUG_ON(node); + if (node) { + pr_crit("new node %u already hashed?\n", num); + WARN_ON(1); + return node; + } node = __hfs_bnode_create(tree, num); if (!node) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 6900807c6b95dcb004902302b8ac5dbfbf6feb89 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 24 May 2013 15:55:24 -0700 Subject: aio: fix io_getevents documentation In reviewing man pages, I noticed that io_getevents is documented to update the timeout that gets passed into the library call. This doesn't happen in kernel space or in the library (even though it's documented to do so in both places). Unless there is objection, I'd like to fix the comments/docs to match the code (I will also update the man page upon consensus). Signed-off-by: Jeff Moyer Signed-off-by: Benjamin LaHaise Acked-by: Cyril Hrubis Acked-by: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index c5b1a8c10411..3fcdd73b6f1b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1299,8 +1299,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, * < min_nr if the timeout specified by timeout has elapsed * before sufficient events are available, where timeout == NULL * specifies an infinite timeout. Note that the timeout pointed to by - * timeout is relative and will be updated if not NULL and the - * operation blocks. Will fail with -ENOSYS if not implemented. + * timeout is relative. Will fail with -ENOSYS if not implemented. */ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, long, min_nr, -- cgit v1.2.3 From 136e8770cd5d1fe38b3c613100dd6dc4db6d4fa6 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 24 May 2013 15:55:29 -0700 Subject: nilfs2: fix issue of nilfs_set_page_dirty() for page at EOF boundary nilfs2: fix issue of nilfs_set_page_dirty for page at EOF boundary DESCRIPTION: There are use-cases when NILFS2 file system (formatted with block size lesser than 4 KB) can be remounted in RO mode because of encountering of "broken bmap" issue. The issue was reported by Anthony Doggett : "The machine I've been trialling nilfs on is running Debian Testing, Linux version 3.2.0-4-686-pae (debian-kernel@lists.debian.org) (gcc version 4.6.3 (Debian 4.6.3-14) ) #1 SMP Debian 3.2.35-2), but I've also reproduced it (identically) with Debian Unstable amd64 and Debian Experimental (using the 3.8-trunk kernel). The problematic partitions were formatted with "mkfs.nilfs2 -b 1024 -B 8192"." SYMPTOMS: (1) System log contains error messages likewise: [63102.496756] nilfs_direct_assign: invalid pointer: 0 [63102.496786] NILFS error (device dm-17): nilfs_bmap_assign: broken bmap (inode number=28) [63102.496798] [63102.524403] Remounting filesystem read-only (2) The NILFS2 file system is remounted in RO mode. REPRODUSING PATH: (1) Create volume group with name "unencrypted" by means of vgcreate utility. (2) Run script (prepared by Anthony Doggett ): ----------------[BEGIN SCRIPT]-------------------- VG=unencrypted lvcreate --size 2G --name ntest $VG mkfs.nilfs2 -b 1024 -B 8192 /dev/mapper/$VG-ntest mkdir /var/tmp/n mkdir /var/tmp/n/ntest mount /dev/mapper/$VG-ntest /var/tmp/n/ntest mkdir /var/tmp/n/ntest/thedir cd /var/tmp/n/ntest/thedir sleep 2 date darcs init sleep 2 dmesg|tail -n 5 date darcs whatsnew || true date sleep 2 dmesg|tail -n 5 ----------------[END SCRIPT]-------------------- REPRODUCIBILITY: 100% INVESTIGATION: As it was discovered, the issue takes place during segment construction after executing such sequence of user-space operations: open("_darcs/index", O_RDWR|O_CREAT|O_NOCTTY, 0666) = 7 fstat(7, {st_mode=S_IFREG|0644, st_size=0, ...}) = 0 ftruncate(7, 60) The error message "NILFS error (device dm-17): nilfs_bmap_assign: broken bmap (inode number=28)" takes place because of trying to get block number for third block of the file with logical offset #3072 bytes. As it is possible to see from above output, the file has 60 bytes of the whole size. So, it is enough one block (1 KB in size) allocation for the whole file. Trying to operate with several blocks instead of one takes place because of discovering several dirty buffers for this file in nilfs_segctor_scan_file() method. The root cause of this issue is in nilfs_set_page_dirty function which is called just before writing to an mmapped page. When nilfs_page_mkwrite function handles a page at EOF boundary, it fills hole blocks only inside EOF through __block_page_mkwrite(). The __block_page_mkwrite() function calls set_page_dirty() after filling hole blocks, thus nilfs_set_page_dirty function (= a_ops->set_page_dirty) is called. However, the current implementation of nilfs_set_page_dirty() wrongly marks all buffers dirty even for page at EOF boundary. As a result, buffers outside EOF are inconsistently marked dirty and queued for write even though they are not mapped with nilfs_get_block function. FIX: This modifies nilfs_set_page_dirty() not to mark hole blocks dirty. Thanks to Vyacheslav Dubeyko for his effort on analysis and proposals for this issue. Signed-off-by: Ryusuke Konishi Reported-by: Anthony Doggett Reported-by: Vyacheslav Dubeyko Cc: Vyacheslav Dubeyko Tested-by: Ryusuke Konishi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/inode.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 689fb608648e..bccfec8343c5 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -219,13 +219,32 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc) static int nilfs_set_page_dirty(struct page *page) { - int ret = __set_page_dirty_buffers(page); + int ret = __set_page_dirty_nobuffers(page); - if (ret) { + if (page_has_buffers(page)) { struct inode *inode = page->mapping->host; - unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits); + unsigned nr_dirty = 0; + struct buffer_head *bh, *head; - nilfs_set_file_dirty(inode, nr_dirty); + /* + * This page is locked by callers, and no other thread + * concurrently marks its buffers dirty since they are + * only dirtied through routines in fs/buffer.c in + * which call sites of mark_buffer_dirty are protected + * by page lock. + */ + bh = head = page_buffers(page); + do { + /* Do not mark hole blocks dirty */ + if (buffer_dirty(bh) || !buffer_mapped(bh)) + continue; + + set_buffer_dirty(bh); + nr_dirty++; + } while (bh = bh->b_this_page, bh != head); + + if (nr_dirty) + nilfs_set_file_dirty(inode, nr_dirty); } return ret; } -- cgit v1.2.3 From b4ca2b4b577c3530e34dcfaafccb2cc680ce95d1 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 24 May 2013 15:55:34 -0700 Subject: ocfs2: goto out_unlock if ocfs2_get_clusters_nocache() failed in ocfs2_fiemap() Last time we found there is lock/unlock bug in ocfs2_file_aio_write, and then we did a thorough search for all lock resources in ocfs2_inode_info, including rw, inode and open lockres and found this bug. My kernel version is 3.0.13, and it is also in the lastest version 3.9. In ocfs2_fiemap, once ocfs2_get_clusters_nocache failed, it should goto out_unlock instead of out, because we need release buffer head, up read alloc sem and unlock inode. Signed-off-by: Joseph Qi Reviewed-by: Jie Liu Cc: Mark Fasheh Cc: Joel Becker Acked-by: Sunil Mushran Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/extent_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 1c39efb71bab..2487116d0d33 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -790,7 +790,7 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, &hole_size, &rec, &is_last); if (ret) { mlog_errno(ret); - goto out; + goto out_unlock; } if (rec.e_blkno == 0ULL) { -- cgit v1.2.3 From 03e04f048d2774aabd126fbad84729d4ba9dc40a Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Fri, 24 May 2013 15:55:38 -0700 Subject: aio: fix kioctx not being freed after cancellation at exit time The recent changes overhauling fs/aio.c introduced a bug that results in the kioctx not being freed when outstanding kiocbs are cancelled at exit_aio() time. Specifically, a kiocb that is cancelled has its completion events discarded by batch_complete_aio(), which then fails to wake up the process stuck in free_ioctx(). Fix this by modifying the wait_event() condition in free_ioctx() appropriately. This patch was tested with the cancel operation in the thread based code posted yesterday. [akpm@linux-foundation.org: fix build] Signed-off-by: Benjamin LaHaise Signed-off-by: Kent Overstreet Cc: Kent Overstreet Cc: Josh Boyer Cc: Zach Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 3fcdd73b6f1b..7fe5bdee1630 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -307,7 +307,9 @@ static void free_ioctx(struct kioctx *ctx) kunmap_atomic(ring); while (atomic_read(&ctx->reqs_active) > 0) { - wait_event(ctx->wait, head != ctx->tail); + wait_event(ctx->wait, + head != ctx->tail || + atomic_read(&ctx->reqs_active) <= 0); avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; -- cgit v1.2.3 From f448badd34700ae728a32ba024249626d49c10e1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 29 May 2013 15:36:40 -0400 Subject: NFSv4: Fix a thinko in nfs4_try_open_cached We need to pass the full open mode flags to nfs_may_open() when doing a delegated open. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4e2fe714d5c2..d7ba5616989c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1078,7 +1078,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) struct nfs4_state *state = opendata->state; struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_delegation *delegation; - int open_mode = opendata->o_arg.open_flags & (O_EXCL|O_TRUNC); + int open_mode = opendata->o_arg.open_flags; fmode_t fmode = opendata->o_arg.fmode; nfs4_stateid stateid; int ret = -EAGAIN; -- cgit v1.2.3 From eb54d43707c69340581940e1fcaecb4d7d17b814 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 14 May 2013 14:37:56 -0400 Subject: NFS: Fix security flavor negotiation with legacy binary mounts Darrick J. Wong reports: > I have a kvm-based testing setup that netboots VMs over NFS, the > client end of which seems to have broken somehow in 3.10-rc1. The > server's exports file looks like this: > > /storage/mtr/x64 192.168.122.0/24(ro,sync,no_root_squash,no_subtree_check) > > On the client end (inside the VM), the initrd runs the following > command to try to mount the rootfs over NFS: > > # mount -o nolock -o ro -o retrans=10 192.168.122.1:/storage/mtr/x64/ /root > > (Note: This is the busybox mount command.) > > The mount fails with -EINVAL. Commit 4580a92d44 "NFS: Use server-recommended security flavor by default (NFSv3)" introduced a behavior regression for NFS mounts done via a legacy binary mount(2) call. Ensure that a default security flavor is specified for legacy binary mount requests, since they do not invoke nfs_select_flavor() in the kernel. Busybox uses klibc's nfsmount command, which performs NFS mounts using the legacy binary mount data format. /sbin/mount.nfs is not affected by this regression. Reported-by: Darrick J. Wong Signed-off-by: Chuck Lever Tested-by: Darrick J. Wong Acked-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index a366107a7331..2d7525fbcf25 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1942,6 +1942,7 @@ static int nfs23_validate_mount_data(void *options, args->namlen = data->namlen; args->bsize = data->bsize; + args->auth_flavors[0] = RPC_AUTH_UNIX; if (data->flags & NFS_MOUNT_SECFLAVOUR) args->auth_flavors[0] = data->pseudoflavor; if (!args->nfs_server.hostname) @@ -2637,6 +2638,7 @@ static int nfs4_validate_mount_data(void *options, goto out_no_address; args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); + args->auth_flavors[0] = RPC_AUTH_UNIX; if (data->auth_flavourlen) { if (data->auth_flavourlen > 1) goto out_inval_auth; -- cgit v1.2.3 From 08fb39051f5581df45ae2a20c6cf2d0c4cddf7c2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 21 May 2013 18:02:00 +1000 Subject: xfs: avoid nesting transactions in xfs_qm_scall_setqlim() Lockdep reports: ============================================= [ INFO: possible recursive locking detected ] 3.9.0+ #3 Not tainted --------------------------------------------- setquota/28368 is trying to acquire lock: (sb_internal){++++.?}, at: [] xfs_trans_alloc+0x26/0x50 but task is already holding lock: (sb_internal){++++.?}, at: [] xfs_trans_alloc+0x26/0x50 from xfs_qm_scall_setqlim()->xfs_dqread() when a dquot needs to be allocated. xfs_qm_scall_setqlim() is starting a transaction and then not passing it into xfs_qm_dqet() and so it starts it's own transaction when allocating the dquot. Splat! Fix this by not allocating the dquot in xfs_qm_scall_setqlim() inside the setqlim transaction. This requires getting the dquot first (and allocating it if necessary) then dropping and relocking the dquot before joining it to the setqlim transaction. Reported-by: Michael L. Semon Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit f648167f3ac79018c210112508c732ea9bf67c7b) --- fs/xfs/xfs_qm_syscalls.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index c41190cad6e9..6cdf6ffc36a1 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -489,31 +489,36 @@ xfs_qm_scall_setqlim( if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) return 0; - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); - error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp), - 0, 0, XFS_DEFAULT_LOG_COUNT); - if (error) { - xfs_trans_cancel(tp, 0); - return (error); - } - /* * We don't want to race with a quotaoff so take the quotaoff lock. - * (We don't hold an inode lock, so there's nothing else to stop - * a quotaoff from happening). (XXXThis doesn't currently happen - * because we take the vfslock before calling xfs_qm_sysent). + * We don't hold an inode lock, so there's nothing else to stop + * a quotaoff from happening. */ mutex_lock(&q->qi_quotaofflock); /* - * Get the dquot (locked), and join it to the transaction. - * Allocate the dquot if this doesn't exist. + * Get the dquot (locked) before we start, as we need to do a + * transaction to allocate it if it doesn't exist. Once we have the + * dquot, unlock it so we can start the next transaction safely. We hold + * a reference to the dquot, so it's safe to do this unlock/lock without + * it being reclaimed in the mean time. */ - if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp); + if (error) { ASSERT(error != ENOENT); goto out_unlock; } + xfs_dqunlock(dqp); + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); + error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_rele; + } + + xfs_dqlock(dqp); xfs_trans_dqjoin(tp, dqp); ddq = &dqp->q_core; @@ -621,9 +626,10 @@ xfs_qm_scall_setqlim( xfs_trans_log_dquot(tp, dqp); error = xfs_trans_commit(tp, 0); - xfs_qm_dqrele(dqp); - out_unlock: +out_rele: + xfs_qm_dqrele(dqp); +out_unlock: mutex_unlock(&q->qi_quotaofflock); return error; } -- cgit v1.2.3 From 2962f5a5dcc56f69cbf62121a7be67cc15d6940b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 27 May 2013 16:38:25 +1000 Subject: xfs: kill suid/sgid through the truncate path. XFS has failed to kill suid/sgid bits correctly when truncating files of non-zero size since commit c4ed4243 ("xfs: split xfs_setattr") introduced in the 3.1 kernel. Fix it. Fix it. cc: stable kernel Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Ben Myers (cherry picked from commit 56c19e89b38618390addfc743d822f99519055c6) --- fs/xfs/xfs_iops.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index d82efaa2ac73..ca9ecaa81112 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -455,6 +455,28 @@ xfs_vn_getattr( return 0; } +static void +xfs_setattr_mode( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct inode *inode = VFS_I(ip); + umode_t mode = iattr->ia_mode; + + ASSERT(tp); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + + ip->i_d.di_mode &= S_IFMT; + ip->i_d.di_mode |= mode & ~S_IFMT; + + inode->i_mode &= S_IFMT; + inode->i_mode |= mode & ~S_IFMT; +} + int xfs_setattr_nonsize( struct xfs_inode *ip, @@ -606,18 +628,8 @@ xfs_setattr_nonsize( /* * Change file access modes. */ - if (mask & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) - mode &= ~S_ISGID; - - ip->i_d.di_mode &= S_IFMT; - ip->i_d.di_mode |= mode & ~S_IFMT; - - inode->i_mode &= S_IFMT; - inode->i_mode |= mode & ~S_IFMT; - } + if (mask & ATTR_MODE) + xfs_setattr_mode(tp, ip, iattr); /* * Change file access or modified times. @@ -714,9 +726,8 @@ xfs_setattr_size( return XFS_ERROR(error); ASSERT(S_ISREG(ip->i_d.di_mode)); - ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| - ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID| - ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); + ASSERT((mask & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| + ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); if (!(flags & XFS_ATTR_NOLOCK)) { lock_flags |= XFS_IOLOCK_EXCL; @@ -860,6 +871,12 @@ xfs_setattr_size( xfs_inode_clear_eofblocks_tag(ip); } + /* + * Change file access modes. + */ + if (mask & ATTR_MODE) + xfs_setattr_mode(tp, ip, iattr); + if (mask & ATTR_CTIME) { inode->i_ctime = iattr->ia_ctime; ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; -- cgit v1.2.3 From 7d2ffe80aa000a149246b3745968634192eb5358 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 27 May 2013 16:38:23 +1000 Subject: xfs: fix split buffer vector log recovery support A long time ago in a galaxy far away.... .. the was a commit made to fix some ilinux specific "fragmented buffer" log recovery problem: http://oss.sgi.com/cgi-bin/gitweb.cgi?p=archive/xfs-import.git;a=commitdiff;h=b29c0bece51da72fb3ff3b61391a391ea54e1603 That problem occurred when a contiguous dirty region of a buffer was split across across two pages of an unmapped buffer. It's been a long time since that has been done in XFS, and the changes to log the entire inode buffers for CRC enabled filesystems has re-introduced that corner case. And, of course, it turns out that the above commit didn't actually fix anything - it just ensured that log recovery is guaranteed to fail when this situation occurs. And now for the gory details. xfstest xfs/085 is failing with this assert: XFS (vdb): bad number of regions (0) in inode log format XFS: Assertion failed: 0, file: fs/xfs/xfs_log_recover.c, line: 1583 Largely undocumented factoid #1: Log recovery depends on all log buffer format items starting with this format: struct foo_log_format { __uint16_t type; __uint16_t size; .... As recoery uses the size field and assumptions about 32 bit alignment in decoding format items. So don't pay much attention to the fact log recovery thinks that it decoding an inode log format item - it just uses them to determine what the size of the item is. But why would it see a log format item with a zero size? Well, luckily enough xfs_logprint uses the same code and gives the same error, so with a bit of gdb magic, it turns out that it isn't a log format that is being decoded. What logprint tells us is this: Oper (130): tid: a0375e1a len: 28 clientid: TRANS flags: none BUF: #regs: 2 start blkno: 144 (0x90) len: 16 bmap size: 2 flags: 0x4000 Oper (131): tid: a0375e1a len: 4096 clientid: TRANS flags: none BUF DATA ---------------------------------------------------------------------------- Oper (132): tid: a0375e1a len: 4096 clientid: TRANS flags: none xfs_logprint: unknown log operation type (4e49) ********************************************************************** * ERROR: data block=2 * ********************************************************************** That we've got a buffer format item (oper 130) that has two regions; the format item itself and one dirty region. The subsequent region after the buffer format item and it's data is them what we are tripping over, and the first bytes of it at an inode magic number. Not a log opheader like there is supposed to be. That means there's a problem with the buffer format item. It's dirty data region is 4096 bytes, and it contains - you guessed it - initialised inodes. But inode buffers are 8k, not 4k, and we log them in their entirety. So something is wrong here. The buffer format item contains: (gdb) p /x *(struct xfs_buf_log_format *)in_f $22 = {blf_type = 0x123c, blf_size = 0x2, blf_flags = 0x4000, blf_len = 0x10, blf_blkno = 0x90, blf_map_size = 0x2, blf_data_map = {0xffffffff, 0xffffffff, .... }} Two regions, and a signle dirty contiguous region of 64 bits. 64 * 128 = 8k, so this should be followed by a single 8k region of data. And the blf_flags tell us that the type of buffer is a XFS_BLFT_DINO_BUF. It contains inodes. And because it doesn't have the XFS_BLF_INODE_BUF flag set, that means it's an inode allocation buffer. So, it should be followed by 8k of inode data. But we know that the next region has a header of: (gdb) p /x *ohead $25 = {oh_tid = 0x1a5e37a0, oh_len = 0x100000, oh_clientid = 0x69, oh_flags = 0x0, oh_res2 = 0x0} and so be32_to_cpu(oh_len) = 0x1000 = 4096 bytes. It's simply not long enough to hold all the logged data. There must be another region. There is - there's a following opheader for another 4k of data that contains the other half of the inode cluster data - the one we assert fail on because it's not a log format header. So why is the second part of the data not being accounted to the correct buffer log format structure? It took a little more work with gdb to work out that the buffer log format structure was both expecting it to be there but hadn't accounted for it. It was at that point I went to the kernel code, as clearly this wasn't a bug in xfs_logprint and the kernel was writing bad stuff to the log. First port of call was the buffer item formatting code, and the discontiguous memory/contiguous dirty region handling code immediately stood out. I've wondered for a long time why the code had this comment in it: vecp->i_addr = xfs_buf_offset(bp, buffer_offset); vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; /* * You would think we need to bump the nvecs here too, but we do not * this number is used by recovery, and it gets confused by the boundary * split here * nvecs++; */ vecp++; And it didn't account for the extra vector pointer. The case being handled here is that a contiguous dirty region lies across a boundary that cannot be memcpy()d across, and so has to be split into two separate operations for xlog_write() to perform. What this code assumes is that what is written to the log is two consecutive blocks of data that are accounted in the buf log format item as the same contiguous dirty region and so will get decoded as such by the log recovery code. The thing is, xlog_write() knows nothing about this, and so just does it's normal thing of adding an opheader for each vector. That means the 8k region gets written to the log as two separate regions of 4k each, but because nvecs has not been incremented, the buf log format item accounts for only one of them. Hence when we come to log recovery, we process the first 4k region and then expect to come across a new item that starts with a log format structure of some kind that tells us whenteh next data is going to be. Instead, we hit raw buffer data and things go bad real quick. So, the commit from 2002 that commented out nvecs++ is just plain wrong. It breaks log recovery completely, and it would seem the only reason this hasn't been since then is that we don't log large contigous regions of multi-page unmapped buffers very often. Never would be a closer estimate, at least until the CRC code came along.... So, lets fix that by restoring the nvecs accounting for the extra region when we hit this case..... .... and there's the problemin log recovery it is apparently working around: XFS: Assertion failed: i == item->ri_total, file: fs/xfs/xfs_log_recover.c, line: 2135 Yup, xlog_recover_do_reg_buffer() doesn't handle contigous dirty regions being broken up into multiple regions by the log formatting code. That's an easy fix, though - if the number of contiguous dirty bits exceeds the length of the region being copied out of the log, only account for the number of dirty bits that region covers, and then loop again and copy more from the next region. It's a 2 line fix. Now xfstests xfs/085 passes, we have one less piece of mystery code, and one more important piece of knowledge about how to structure new log format items.. Signed-off-by: Dave Chinner Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers (cherry picked from commit 709da6a61aaf12181a8eea8443919ae5fc1b731d) --- fs/xfs/xfs_buf_item.c | 7 +------ fs/xfs/xfs_log_recover.c | 11 +++++++++++ 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index cf263476d6b4..4ec431777048 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -262,12 +262,7 @@ xfs_buf_item_format_segment( vecp->i_addr = xfs_buf_offset(bp, buffer_offset); vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; -/* - * You would think we need to bump the nvecs here too, but we do not - * this number is used by recovery, and it gets confused by the boundary - * split here - * nvecs++; - */ + nvecs++; vecp++; first_bit = next_bit; last_bit = next_bit; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 93f03ec17eec..d9e4d3c3991a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2096,6 +2096,17 @@ xlog_recover_do_reg_buffer( ASSERT(BBTOB(bp->b_io_length) >= ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); + /* + * The dirty regions logged in the buffer, even though + * contiguous, may span multiple chunks. This is because the + * dirty region may span a physical page boundary in a buffer + * and hence be split into two separate vectors for writing into + * the log. Hence we need to trim nbits back to the length of + * the current region being copied out of the log. + */ + if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) + nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; + /* * Do a sanity check if this is a dquot buffer. Just checking * the first dquot in the buffer should do. XXXThis is -- cgit v1.2.3 From 1de09d1ae48152e56399aba0bfd984fb0ddae6b0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 27 May 2013 16:38:20 +1000 Subject: xfs: fix incorrect remote symlink block count When CRCs are enabled, the number of blocks needed to hold a remote symlink on a 1k block size filesystem may be 2 instead of 1. The transaction reservation for the allocated blocks was not taking this into account and only allocating one block. Hence when trying to read or invalidate such symlinks, we are mapping a hole where there should be a block and things go bad at that point. Fix the reservation to use the correct block count, clean up the block count calculation similar to the remote attribute calculation, and add a debug guard to detect when we don't write the entire symlink to disk. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Reviewed-by: Brian Foster Signed-off-by: Ben Myers (cherry picked from commit 321a95839e65db3759a07a3655184b0283af90fe) --- fs/xfs/xfs_symlink.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 5f234389327c..195a403e1522 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -56,16 +56,9 @@ xfs_symlink_blocks( struct xfs_mount *mp, int pathlen) { - int fsblocks = 0; - int len = pathlen; + int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize); - do { - fsblocks++; - len -= XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize); - } while (len > 0); - - ASSERT(fsblocks <= XFS_SYMLINK_MAPS); - return fsblocks; + return (pathlen + buflen - 1) / buflen; } static int @@ -405,7 +398,7 @@ xfs_symlink( if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version)) fs_blocks = 0; else - fs_blocks = XFS_B_TO_FSB(mp, pathlen); + fs_blocks = xfs_symlink_blocks(mp, pathlen); resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); @@ -512,7 +505,7 @@ xfs_symlink( cur_chunk = target_path; offset = 0; for (n = 0; n < nmaps; n++) { - char *buf; + char *buf; d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); @@ -525,9 +518,7 @@ xfs_symlink( bp->b_ops = &xfs_symlink_buf_ops; byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); - if (pathlen < byte_cnt) { - byte_cnt = pathlen; - } + byte_cnt = min(byte_cnt, pathlen); buf = bp->b_addr; buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, @@ -542,6 +533,7 @@ xfs_symlink( xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - (char *)bp->b_addr); } + ASSERT(pathlen == 0); } /* -- cgit v1.2.3 From e7927e879d12d27aa06b9bbed57cc32dcd7d17fd Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 27 May 2013 16:38:26 +1000 Subject: xfs: add fsgeom flag for v5 superblock support. Currently userspace has no way of determining that a filesystem is CRC enabled. Add a flag to the XFS_IOC_FSGEOMETRY ioctl output to indicate that the filesystem has v5 superblock support enabled. This will allow xfs_info to correctly report the state of the filesystem. Signed-off-by: Dave Chinner Reviewed-by: Eric Sandeen Reviewed-by: Brian Foster Signed-off-by: Ben Myers (cherry picked from commit 74137fff067961c9aca1e14d073805c3de8549bd) --- fs/xfs/xfs_fs.h | 1 + fs/xfs/xfs_fsops.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 6dda3f949b04..d04695545397 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ #define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ +#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ /* diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 87595b211da1..3c3644ea825b 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -99,7 +99,9 @@ xfs_fs_geometry( (xfs_sb_version_hasattr2(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) | (xfs_sb_version_hasprojid32bit(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_PROJID32 : 0); + XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) | + (xfs_sb_version_hascrc(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_V5SB : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; -- cgit v1.2.3 From 7c9950fd2ac97431230544142d5e652e1b948372 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 27 May 2013 16:38:24 +1000 Subject: xfs: disable swap extents ioctl on CRC enabled filesystems Currently, swapping extents from one inode to another is a simple act of switching data and attribute forks from one inode to another. This, unfortunately in no longer so simple with CRC enabled filesystems as there is owner information embedded into the BMBT blocks that are swapped between inodes. Hence swapping the forks between inodes results in the inodes having mapping blocks that point to the wrong owner and hence are considered corrupt. To fix this we need an extent tree block or record based swap algorithm so that the BMBT block owner information can be updated atomically in the swap transaction. This is a significant piece of new work, so for the moment simply don't allow swap extent operations to succeed on CRC enabled filesystems. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Reviewed-by: Brian Foster Signed-off-by: Ben Myers (cherry picked from commit 02f75405a75eadfb072609f6bf839e027de6a29a) --- fs/xfs/xfs_dfrag.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index f852b082a084..c407e1ccff43 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -219,6 +219,14 @@ xfs_swap_extents( int taforkblks = 0; __uint64_t tmp; + /* + * We have no way of updating owner information in the BMBT blocks for + * each inode on CRC enabled filesystems, so to avoid corrupting the + * this metadata we simply don't allow extent swaps to occur. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + return XFS_ERROR(EINVAL); + tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); if (!tempifp) { error = XFS_ERROR(ENOMEM); -- cgit v1.2.3 From e400d27d1690d609f203f2d7d8efebc98cbc3089 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 28 May 2013 18:37:17 +1000 Subject: xfs: fix dir3 freespace block corruption When the directory freespace index grows to a second block (2017 4k data blocks in the directory), the initialisation of the second new block header goes wrong. The write verifier fires a corruption error indicating that the block number in the header is zero. This was being tripped by xfs/110. The problem is that the initialisation of the new block is done just fine in xfs_dir3_free_get_buf(), but the caller then users a dirv2 structure to zero on-disk header fields that xfs_dir3_free_get_buf() has already zeroed. These lined up with the block number in the dir v3 header format. While looking at this, I noticed that the struct xfs_dir3_free_hdr() had 4 bytes of padding in it that wasn't defined as padding or being zeroed by the initialisation. Add a pad field declaration and fully zero the on disk and in-core headers in xfs_dir3_free_get_buf() so that this is never an issue in the future. Note that this doesn't change the on-disk layout, just makes the 32 bits of padding in the layout explicit. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit 5ae6e6a401957698f2bd8c9f4a86d86d02199fea) --- fs/xfs/xfs_dir2_format.h | 1 + fs/xfs/xfs_dir2_node.c | 13 ++++++------- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h index a3b1bd841a80..995f1f505a52 100644 --- a/fs/xfs/xfs_dir2_format.h +++ b/fs/xfs/xfs_dir2_format.h @@ -715,6 +715,7 @@ struct xfs_dir3_free_hdr { __be32 firstdb; /* db of first entry */ __be32 nvalid; /* count of valid entries */ __be32 nused; /* count of used entries */ + __be32 pad; /* 64 bit alignment. */ }; struct xfs_dir3_free { diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 5246de4912d4..2226a00acd15 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -263,18 +263,19 @@ xfs_dir3_free_get_buf( * Initialize the new block to be empty, and remember * its first slot as our empty slot. */ - hdr.magic = XFS_DIR2_FREE_MAGIC; - hdr.firstdb = 0; - hdr.nused = 0; - hdr.nvalid = 0; + memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr)); + memset(&hdr, 0, sizeof(hdr)); + if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; hdr.magic = XFS_DIR3_FREE_MAGIC; + hdr3->hdr.blkno = cpu_to_be64(bp->b_bn); hdr3->hdr.owner = cpu_to_be64(dp->i_ino); uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid); - } + } else + hdr.magic = XFS_DIR2_FREE_MAGIC; xfs_dir3_free_hdr_to_disk(bp->b_addr, &hdr); *bpp = bp; return 0; @@ -1921,8 +1922,6 @@ xfs_dir2_node_addname_int( */ freehdr.firstdb = (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * xfs_dir3_free_max_bests(mp); - free->hdr.nvalid = 0; - free->hdr.nused = 0; } else { free = fbp->b_addr; bests = xfs_dir3_free_bests_p(mp, free); -- cgit v1.2.3 From 9531e2de6b7f04bd734b4bbc1e16a6955121615a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 21 May 2013 18:02:01 +1000 Subject: xfs: remote attribute allocation may be contiguous When CRCs are enabled, there may be multiple allocations made if the headers cause a length overflow. This, however, does not mean that the number of headers required increases, as the second and subsequent extents may be contiguous with the previous extent. Hence when we map the extents to write the attribute data, we may end up with less extents than allocations made. Hence the assertion that we consume the number of headers we calculated in the allocation loop is incorrect and needs to be removed. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit 90253cf142469a40f89f989904abf0a1e500e1a6) --- fs/xfs/xfs_attr_remote.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index dee84466dcc9..aad95b08e76b 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -359,6 +359,11 @@ xfs_attr_rmtval_set( * into requiring more blocks. e.g. for 512 byte blocks, we'll * spill for another block every 9 headers we require in this * loop. + * + * Note that this can result in contiguous allocation of blocks, + * so we don't use all the space we allocate for headers as we + * have one less header for each contiguous allocation that + * occurs in the map/write loop below. */ if (crcs && blkcnt == 0) { int total_len; @@ -439,7 +444,6 @@ xfs_attr_rmtval_set( lblkno += map.br_blockcount; } ASSERT(valuelen == 0); - ASSERT(hdrcnt == 0); return 0; } -- cgit v1.2.3 From 551b382f5368900d6d82983505cb52553c946a2b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 21 May 2013 18:02:02 +1000 Subject: xfs: remote attribute read too short Reading a maximally size remote attribute fails when CRCs are enabled with this verification error: XFS (vdb): remote attribute header does not match required off/len/owner) There are two reasons for this, the first being that the length of the buffer being read is determined from the args->rmtblkcnt which doesn't take into account CRC headers. Hence the mapped length ends up being too short and so we need to calculate it directly from the value length. The second is that the byte count of valid data within a buffer is capped by the length of the data and so doesn't take into account that the buffer might be longer due to headers. Hence we need to calculate the data space in the buffer first before calculating the actual byte count of data. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit 913e96bc292e1bb248854686c79d6545ef3ee720) --- fs/xfs/xfs_attr_remote.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index aad95b08e76b..bcdc07c4e8f4 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -52,9 +52,11 @@ xfs_attr3_rmt_blocks( struct xfs_mount *mp, int attrlen) { - int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, - mp->m_sb.sb_blocksize); - return (attrlen + buflen - 1) / buflen; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize); + return (attrlen + buflen - 1) / buflen; + } + return XFS_B_TO_FSB(mp, attrlen); } static bool @@ -206,8 +208,9 @@ xfs_attr_rmtval_get( while (valuelen > 0) { nmap = ATTR_RMTVALUE_MAPSIZE; + blkcnt = xfs_attr3_rmt_blocks(mp, valuelen); error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, map, &nmap, + blkcnt, map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return error; @@ -227,8 +230,8 @@ xfs_attr_rmtval_get( if (error) return error; - byte_cnt = min_t(int, valuelen, BBTOB(bp->b_length)); - byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, byte_cnt); + byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, BBTOB(bp->b_length)); + byte_cnt = min_t(int, valuelen, byte_cnt); src = bp->b_addr; if (xfs_sb_version_hascrc(&mp->m_sb)) { -- cgit v1.2.3 From 26f714450c3907ce07c41a0bd1bea40368e0b4da Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 21 May 2013 18:02:03 +1000 Subject: xfs: remote attribute tail zeroing does too much When an attribute data does not fill then entire remote block, we zero the remaining part of the buffer. This, however, needs to take into account that the buffer has a header, and so the offset where zeroing starts and the length of zeroing need to take this into account. Otherwise we end up with zeros over the end of the attribute value when CRCs are enabled. While there, make sure we only ask to map an extent that covers the remaining range of the attribute, rather than asking every time for the full length of remote data. If the remote attribute blocks are contiguous with other parts of the attribute tree, it will map those blocks as well and we can potentially zero them incorrectly. We can also get buffer size mistmatches when trying to read or remove the remote attribute, and this can lead to not finding the correct buffer when looking it up in cache. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit 4af3644c9a53eb2f1ecf69cc53576561b64be4c6) --- fs/xfs/xfs_attr_remote.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index bcdc07c4e8f4..e207bf0004b6 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -296,10 +296,7 @@ xfs_attr_rmtval_set( * and we may not need that many, so we have to handle this when * allocating the blocks below. */ - if (!crcs) - blkcnt = XFS_B_TO_FSB(mp, args->valuelen); - else - blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, XFS_ATTR_FORK); @@ -394,8 +391,11 @@ xfs_attr_rmtval_set( */ lblkno = args->rmtblkno; valuelen = args->valuelen; + blkcnt = args->rmtblkcnt; while (valuelen > 0) { int byte_cnt; + int hdr_size; + int dblkcnt; char *buf; /* @@ -404,7 +404,7 @@ xfs_attr_rmtval_set( xfs_bmap_init(args->flist, args->firstblock); nmap = 1; error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, &map, &nmap, + blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return(error); @@ -413,26 +413,25 @@ xfs_attr_rmtval_set( (map.br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0); + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0); if (!bp) return ENOMEM; bp->b_ops = &xfs_attr3_rmt_buf_ops; - - byte_cnt = BBTOB(bp->b_length); - byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, byte_cnt); - if (valuelen < byte_cnt) - byte_cnt = valuelen; - buf = bp->b_addr; - buf += xfs_attr3_rmt_hdr_set(mp, dp->i_ino, offset, + + byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, BBTOB(bp->b_length)); + byte_cnt = min_t(int, valuelen, byte_cnt); + hdr_size = xfs_attr3_rmt_hdr_set(mp, dp->i_ino, offset, byte_cnt, bp); - memcpy(buf, src, byte_cnt); + ASSERT(hdr_size + byte_cnt <= BBTOB(bp->b_length)); - if (byte_cnt < BBTOB(bp->b_length)) - xfs_buf_zero(bp, byte_cnt, - BBTOB(bp->b_length) - byte_cnt); + memcpy(buf + hdr_size, src, byte_cnt); + + if (byte_cnt + hdr_size < BBTOB(bp->b_length)) + xfs_buf_zero(bp, byte_cnt + hdr_size, + BBTOB(bp->b_length) - byte_cnt - hdr_size); error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ xfs_buf_relse(bp); @@ -442,9 +441,9 @@ xfs_attr_rmtval_set( src += byte_cnt; valuelen -= byte_cnt; offset += byte_cnt; - hdrcnt--; lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; } ASSERT(valuelen == 0); return 0; -- cgit v1.2.3 From 58a72281555bf301f6dff24db2db205c87ef8db1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 21 May 2013 18:02:04 +1000 Subject: xfs: correctly map remote attr buffers during removal If we don't map the buffers correctly (same as for get/set operations) then the incore buffer lookup will fail. If a block number matches but a length is wrong, then debug kernels will ASSERT fail in _xfs_buf_find() due to the length mismatch. Ensure that we map the buffers correctly by basing the length of the buffer on the attribute data length rather than the remote block count. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit 6863ef8449f1908c19f43db572e4474f24a1e9da) --- fs/xfs/xfs_attr_remote.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index e207bf0004b6..d8bcb2d742d1 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -468,19 +468,25 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) mp = args->dp->i_mount; /* - * Roll through the "value", invalidating the attribute value's - * blocks. + * Roll through the "value", invalidating the attribute value's blocks. + * Note that args->rmtblkcnt is the minimum number of data blocks we'll + * see for a CRC enabled remote attribute. Each extent will have a + * header, and so we may have more blocks than we realise here. If we + * fail to map the blocks correctly, we'll have problems with the buffer + * lookups. */ lblkno = args->rmtblkno; - valuelen = args->rmtblkcnt; + valuelen = args->valuelen; + blkcnt = xfs_attr3_rmt_blocks(mp, valuelen); while (valuelen > 0) { + int dblkcnt; + /* * Try to remember where we decided to put the value. */ nmap = 1; error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, &map, &nmap, - XFS_BMAPI_ATTRFORK); + blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return(error); ASSERT(nmap == 1); @@ -488,28 +494,31 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) (map.br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); /* * If the "remote" value is in the cache, remove it. */ - bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); + bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); if (bp) { xfs_buf_stale(bp); xfs_buf_relse(bp); bp = NULL; } - valuelen -= map.br_blockcount; + valuelen -= XFS_ATTR3_RMT_BUF_SPACE(mp, + XFS_FSB_TO_B(mp, map.br_blockcount)); lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + blkcnt = max(blkcnt, xfs_attr3_rmt_blocks(mp, valuelen)); } /* * Keep de-allocating extents until the remote-value region is gone. */ + blkcnt = lblkno - args->rmtblkno; lblkno = args->rmtblkno; - blkcnt = args->rmtblkcnt; done = 0; while (!done) { xfs_bmap_init(args->flist, args->firstblock); -- cgit v1.2.3 From 9e80c76205b46b338cb56c336148f54b2326342f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 21 May 2013 18:02:05 +1000 Subject: xfs: fully initialise temp leaf in xfs_attr3_leaf_unbalance xfs_attr3_leaf_unbalance() uses a temporary buffer for recombining the entries in two leaves when the destination leaf requires compaction. The temporary buffer ends up being copied back over the original destination buffer, so the header in the temporary buffer needs to contain all the information that is in the destination buffer. To make sure the temporary buffer is fully initialised, once we've set up the temporary incore header appropriately, write is back to the temporary buffer before starting to move entries around. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit 8517de2a81da830f5d90da66b4799f4040c76dc9) --- fs/xfs/xfs_attr_leaf.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 0bce1b348580..79ece72976ae 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -2181,14 +2181,24 @@ xfs_attr3_leaf_unbalance( struct xfs_attr_leafblock *tmp_leaf; struct xfs_attr3_icleaf_hdr tmphdr; - tmp_leaf = kmem_alloc(state->blocksize, KM_SLEEP); - memset(tmp_leaf, 0, state->blocksize); - memset(&tmphdr, 0, sizeof(tmphdr)); + tmp_leaf = kmem_zalloc(state->blocksize, KM_SLEEP); + + /* + * Copy the header into the temp leaf so that all the stuff + * not in the incore header is present and gets copied back in + * once we've moved all the entries. + */ + memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf)); + memset(&tmphdr, 0, sizeof(tmphdr)); tmphdr.magic = savehdr.magic; tmphdr.forw = savehdr.forw; tmphdr.back = savehdr.back; tmphdr.firstused = state->blocksize; + + /* write the header to the temp buffer to initialise it */ + xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr); + if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, drop_blk->bp, &drophdr)) { xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, -- cgit v1.2.3 From 634fd5322a3e6ae632dcf5f20eebc0583ba50838 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 21 May 2013 18:02:06 +1000 Subject: xfs: fully initialise temp leaf in xfs_attr3_leaf_compact xfs_attr3_leaf_compact() uses a temporary buffer for compacting the the entries in a leaf. It copies the the original buffer into the temporary buffer, then zeros the original buffer completely. It then copies the entries back into the original buffer. However, the original buffer has not been correctly initialised, and so the movement of the entries goes horribly wrong. Make sure the zeroed destination buffer is fully initialised, and once we've set up the destination incore header appropriately, write is back to the buffer before starting to move entries around. While debugging this, the _d/_s prefixes weren't sufficient to remind me what buffer was what, so rename then all _src/_dst. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit d4c712bcf26a25c2b67c90e44e0b74c7993b5334) --- fs/xfs/xfs_attr_leaf.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 79ece72976ae..5b03d15b707b 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -1445,11 +1445,12 @@ xfs_attr3_leaf_add_work( STATIC void xfs_attr3_leaf_compact( struct xfs_da_args *args, - struct xfs_attr3_icleaf_hdr *ichdr_d, + struct xfs_attr3_icleaf_hdr *ichdr_dst, struct xfs_buf *bp) { - xfs_attr_leafblock_t *leaf_s, *leaf_d; - struct xfs_attr3_icleaf_hdr ichdr_s; + struct xfs_attr_leafblock *leaf_src; + struct xfs_attr_leafblock *leaf_dst; + struct xfs_attr3_icleaf_hdr ichdr_src; struct xfs_trans *trans = args->trans; struct xfs_mount *mp = trans->t_mountp; char *tmpbuffer; @@ -1457,29 +1458,38 @@ xfs_attr3_leaf_compact( trace_xfs_attr_leaf_compact(args); tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); - ASSERT(tmpbuffer != NULL); memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); memset(bp->b_addr, 0, XFS_LBSIZE(mp)); + leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; + leaf_dst = bp->b_addr; /* - * Copy basic information + * Copy the on-disk header back into the destination buffer to ensure + * all the information in the header that is not part of the incore + * header structure is preserved. */ - leaf_s = (xfs_attr_leafblock_t *)tmpbuffer; - leaf_d = bp->b_addr; - ichdr_s = *ichdr_d; /* struct copy */ - ichdr_d->firstused = XFS_LBSIZE(mp); - ichdr_d->usedbytes = 0; - ichdr_d->count = 0; - ichdr_d->holes = 0; - ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_s); - ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base; + memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src)); + + /* Initialise the incore headers */ + ichdr_src = *ichdr_dst; /* struct copy */ + ichdr_dst->firstused = XFS_LBSIZE(mp); + ichdr_dst->usedbytes = 0; + ichdr_dst->count = 0; + ichdr_dst->holes = 0; + ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src); + ichdr_dst->freemap[0].size = ichdr_dst->firstused - + ichdr_dst->freemap[0].base; + + + /* write the header back to initialise the underlying buffer */ + xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst); /* * Copy all entry's in the same (sorted) order, * but allocate name/value pairs packed and in sequence. */ - xfs_attr3_leaf_moveents(leaf_s, &ichdr_s, 0, leaf_d, ichdr_d, 0, - ichdr_s.count, mp); + xfs_attr3_leaf_moveents(leaf_src, &ichdr_src, 0, leaf_dst, ichdr_dst, 0, + ichdr_src.count, mp); /* * this logs the entire buffer, but the caller must write the header * back to the buffer when it is finished modifying it. -- cgit v1.2.3 From 7bc0dc271e494e12be3afd3c6431e5216347c624 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 21 May 2013 18:02:08 +1000 Subject: xfs: rework remote attr CRCs Note: this changes the on-disk remote attribute format. I assert that this is OK to do as CRCs are marked experimental and the first kernel it is included in has not yet reached release yet. Further, the userspace utilities are still evolving and so anyone using this stuff right now is a developer or tester using volatile filesystems for testing this feature. Hence changing the format right now to save longer term pain is the right thing to do. The fundamental change is to move from a header per extent in the attribute to a header per filesytem block in the attribute. This means there are more header blocks and the parsing of the attribute data is slightly more complex, but it has the advantage that we always know the size of the attribute on disk based on the length of the data it contains. This is where the header-per-extent method has problems. We don't know the size of the attribute on disk without first knowing how many extents are used to hold it. And we can't tell from a mapping lookup, either, because remote attributes can be allocated contiguously with other attribute blocks and so there is no obvious way of determining the actual size of the atribute on disk short of walking and mapping buffers. The problem with this approach is that if we map a buffer incorrectly (e.g. we make the last buffer for the attribute data too long), we then get buffer cache lookup failure when we map it correctly. i.e. we get a size mismatch on lookup. This is not necessarily fatal, but it's a cache coherency problem that can lead to returning the wrong data to userspace or writing the wrong data to disk. And debug kernels will assert fail if this occurs. I found lots of niggly little problems trying to fix this issue on a 4k block size filesystem, finally getting it to pass with lots of fixes. The thing is, 1024 byte filesystems still failed, and it was getting really complex handling all the corner cases that were showing up. And there were clearly more that I hadn't found yet. It is complex, fragile code, and if we don't fix it now, it will be complex, fragile code forever more. Hence the simple fix is to add a header to each filesystem block. This gives us the same relationship between the attribute data length and the number of blocks on disk as we have without CRCs - it's a linear mapping and doesn't require us to guess anything. It is simple to implement, too - the remote block count calculated at lookup time can be used by the remote attribute set/get/remove code without modification for both CRC and non-CRC filesystems. The world becomes sane again. Because the copy-in and copy-out now need to iterate over each filesystem block, I moved them into helper functions so we separate the block mapping and buffer manupulations from the attribute data and CRC header manipulations. The code becomes much clearer as a result, and it is a lot easier to understand and debug. It also appears to be much more robust - once it worked on 4k block size filesystems, it has worked without failure on 1k block size filesystems, too. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit ad1858d77771172e08016890f0eb2faedec3ecee) --- fs/xfs/xfs_attr_leaf.c | 13 +- fs/xfs/xfs_attr_remote.c | 381 ++++++++++++++++++++++++++++------------------- fs/xfs/xfs_attr_remote.h | 10 ++ fs/xfs/xfs_buf.c | 1 + 4 files changed, 247 insertions(+), 158 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 5b03d15b707b..d788302e506a 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -1412,7 +1412,7 @@ xfs_attr3_leaf_add_work( name_rmt->valuelen = 0; name_rmt->valueblk = 0; args->rmtblkno = 1; - args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen); + args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); } xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), @@ -2354,8 +2354,9 @@ xfs_attr3_leaf_lookup_int( args->index = probe; args->valuelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); - args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, - args->valuelen); + args->rmtblkcnt = xfs_attr3_rmt_blocks( + args->dp->i_mount, + args->valuelen); return XFS_ERROR(EEXIST); } } @@ -2406,7 +2407,8 @@ xfs_attr3_leaf_getvalue( ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); valuelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); - args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen); + args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, + valuelen); if (args->flags & ATTR_KERNOVAL) { args->valuelen = valuelen; return 0; @@ -2732,7 +2734,8 @@ xfs_attr3_leaf_list_int( args.valuelen = valuelen; args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); args.rmtblkno = be32_to_cpu(name_rmt->valueblk); - args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); + args.rmtblkcnt = xfs_attr3_rmt_blocks( + args.dp->i_mount, valuelen); retval = xfs_attr_rmtval_get(&args); if (retval) return retval; diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index d8bcb2d742d1..ef6b0c124528 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -47,7 +47,7 @@ * Each contiguous block has a header, so it is not just a simple attribute * length to FSB conversion. */ -static int +int xfs_attr3_rmt_blocks( struct xfs_mount *mp, int attrlen) @@ -59,12 +59,43 @@ xfs_attr3_rmt_blocks( return XFS_B_TO_FSB(mp, attrlen); } +/* + * Checking of the remote attribute header is split into two parts. The verifier + * does CRC, location and bounds checking, the unpacking function checks the + * attribute parameters and owner. + */ +static bool +xfs_attr3_rmt_hdr_ok( + struct xfs_mount *mp, + void *ptr, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + xfs_daddr_t bno) +{ + struct xfs_attr3_rmt_hdr *rmt = ptr; + + if (bno != be64_to_cpu(rmt->rm_blkno)) + return false; + if (offset != be32_to_cpu(rmt->rm_offset)) + return false; + if (size != be32_to_cpu(rmt->rm_bytes)) + return false; + if (ino != be64_to_cpu(rmt->rm_owner)) + return false; + + /* ok */ + return true; +} + static bool xfs_attr3_rmt_verify( - struct xfs_buf *bp) + struct xfs_mount *mp, + void *ptr, + int fsbsize, + xfs_daddr_t bno) { - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; + struct xfs_attr3_rmt_hdr *rmt = ptr; if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; @@ -72,7 +103,9 @@ xfs_attr3_rmt_verify( return false; if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid)) return false; - if (bp->b_bn != be64_to_cpu(rmt->rm_blkno)) + if (be64_to_cpu(rmt->rm_blkno) != bno) + return false; + if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) return false; if (be32_to_cpu(rmt->rm_offset) + be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX) @@ -88,17 +121,40 @@ xfs_attr3_rmt_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + char *ptr; + int len; + bool corrupt = false; + xfs_daddr_t bno; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) return; - if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_ATTR3_RMT_CRC_OFF) || - !xfs_attr3_rmt_verify(bp)) { + ptr = bp->b_addr; + bno = bp->b_bn; + len = BBTOB(bp->b_length); + ASSERT(len >= XFS_LBSIZE(mp)); + + while (len > 0) { + if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp), + XFS_ATTR3_RMT_CRC_OFF)) { + corrupt = true; + break; + } + if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { + corrupt = true; + break; + } + len -= XFS_LBSIZE(mp); + ptr += XFS_LBSIZE(mp); + bno += mp->m_bsize; + } + + if (corrupt) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); - } + } else + ASSERT(len == 0); } static void @@ -107,23 +163,39 @@ xfs_attr3_rmt_write_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_buf_log_item *bip = bp->b_fspriv; + char *ptr; + int len; + xfs_daddr_t bno; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) return; - if (!xfs_attr3_rmt_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); - xfs_buf_ioerror(bp, EFSCORRUPTED); - return; - } + ptr = bp->b_addr; + bno = bp->b_bn; + len = BBTOB(bp->b_length); + ASSERT(len >= XFS_LBSIZE(mp)); + + while (len > 0) { + if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { + XFS_CORRUPTION_ERROR(__func__, + XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + if (bip) { + struct xfs_attr3_rmt_hdr *rmt; - if (bip) { - struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; - rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); + rmt = (struct xfs_attr3_rmt_hdr *)ptr; + rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_update_cksum(ptr, XFS_LBSIZE(mp), XFS_ATTR3_RMT_CRC_OFF); + + len -= XFS_LBSIZE(mp); + ptr += XFS_LBSIZE(mp); + bno += mp->m_bsize; } - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_ATTR3_RMT_CRC_OFF); + ASSERT(len == 0); } const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { @@ -131,15 +203,16 @@ const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { .verify_write = xfs_attr3_rmt_write_verify, }; -static int +STATIC int xfs_attr3_rmt_hdr_set( struct xfs_mount *mp, + void *ptr, xfs_ino_t ino, uint32_t offset, uint32_t size, - struct xfs_buf *bp) + xfs_daddr_t bno) { - struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; + struct xfs_attr3_rmt_hdr *rmt = ptr; if (!xfs_sb_version_hascrc(&mp->m_sb)) return 0; @@ -149,36 +222,107 @@ xfs_attr3_rmt_hdr_set( rmt->rm_bytes = cpu_to_be32(size); uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid); rmt->rm_owner = cpu_to_be64(ino); - rmt->rm_blkno = cpu_to_be64(bp->b_bn); - bp->b_ops = &xfs_attr3_rmt_buf_ops; + rmt->rm_blkno = cpu_to_be64(bno); return sizeof(struct xfs_attr3_rmt_hdr); } /* - * Checking of the remote attribute header is split into two parts. the verifier - * does CRC, location and bounds checking, the unpacking function checks the - * attribute parameters and owner. + * Helper functions to copy attribute data in and out of the one disk extents */ -static bool -xfs_attr3_rmt_hdr_ok( - struct xfs_mount *mp, - xfs_ino_t ino, - uint32_t offset, - uint32_t size, - struct xfs_buf *bp) +STATIC int +xfs_attr_rmtval_copyout( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_ino_t ino, + int *offset, + int *valuelen, + char **dst) { - struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; + char *src = bp->b_addr; + xfs_daddr_t bno = bp->b_bn; + int len = BBTOB(bp->b_length); - if (offset != be32_to_cpu(rmt->rm_offset)) - return false; - if (size != be32_to_cpu(rmt->rm_bytes)) - return false; - if (ino != be64_to_cpu(rmt->rm_owner)) - return false; + ASSERT(len >= XFS_LBSIZE(mp)); - /* ok */ - return true; + while (len > 0 && *valuelen > 0) { + int hdr_size = 0; + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp)); + + byte_cnt = min_t(int, *valuelen, byte_cnt); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset, + byte_cnt, bno)) { + xfs_alert(mp, +"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", + bno, *offset, byte_cnt, ino); + return EFSCORRUPTED; + } + hdr_size = sizeof(struct xfs_attr3_rmt_hdr); + } + + memcpy(*dst, src + hdr_size, byte_cnt); + + /* roll buffer forwards */ + len -= XFS_LBSIZE(mp); + src += XFS_LBSIZE(mp); + bno += mp->m_bsize; + + /* roll attribute data forwards */ + *valuelen -= byte_cnt; + *dst += byte_cnt; + *offset += byte_cnt; + } + return 0; +} + +STATIC void +xfs_attr_rmtval_copyin( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_ino_t ino, + int *offset, + int *valuelen, + char **src) +{ + char *dst = bp->b_addr; + xfs_daddr_t bno = bp->b_bn; + int len = BBTOB(bp->b_length); + + ASSERT(len >= XFS_LBSIZE(mp)); + + while (len > 0 && *valuelen > 0) { + int hdr_size; + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp)); + + byte_cnt = min(*valuelen, byte_cnt); + hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset, + byte_cnt, bno); + + memcpy(dst + hdr_size, *src, byte_cnt); + + /* + * If this is the last block, zero the remainder of it. + * Check that we are actually the last block, too. + */ + if (byte_cnt + hdr_size < XFS_LBSIZE(mp)) { + ASSERT(*valuelen - byte_cnt == 0); + ASSERT(len == XFS_LBSIZE(mp)); + memset(dst + hdr_size + byte_cnt, 0, + XFS_LBSIZE(mp) - hdr_size - byte_cnt); + } + + /* roll buffer forwards */ + len -= XFS_LBSIZE(mp); + dst += XFS_LBSIZE(mp); + bno += mp->m_bsize; + + /* roll attribute data forwards */ + *valuelen -= byte_cnt; + *src += byte_cnt; + *offset += byte_cnt; + } } /* @@ -192,13 +336,12 @@ xfs_attr_rmtval_get( struct xfs_bmbt_irec map[ATTR_RMTVALUE_MAPSIZE]; struct xfs_mount *mp = args->dp->i_mount; struct xfs_buf *bp; - xfs_daddr_t dblkno; xfs_dablk_t lblkno = args->rmtblkno; - void *dst = args->value; + char *dst = args->value; int valuelen = args->valuelen; int nmap; int error; - int blkcnt; + int blkcnt = args->rmtblkcnt; int i; int offset = 0; @@ -208,7 +351,6 @@ xfs_attr_rmtval_get( while (valuelen > 0) { nmap = ATTR_RMTVALUE_MAPSIZE; - blkcnt = xfs_attr3_rmt_blocks(mp, valuelen); error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, blkcnt, map, &nmap, XFS_BMAPI_ATTRFORK); @@ -217,45 +359,29 @@ xfs_attr_rmtval_get( ASSERT(nmap >= 1); for (i = 0; (i < nmap) && (valuelen > 0); i++) { - int byte_cnt; - char *src; + xfs_daddr_t dblkno; + int dblkcnt; ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) && (map[i].br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); - blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); + dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, - dblkno, blkcnt, 0, &bp, + dblkno, dblkcnt, 0, &bp, &xfs_attr3_rmt_buf_ops); if (error) return error; - byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, BBTOB(bp->b_length)); - byte_cnt = min_t(int, valuelen, byte_cnt); - - src = bp->b_addr; - if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (!xfs_attr3_rmt_hdr_ok(mp, args->dp->i_ino, - offset, byte_cnt, bp)) { - xfs_alert(mp, -"remote attribute header does not match required off/len/owner (0x%x/Ox%x,0x%llx)", - offset, byte_cnt, args->dp->i_ino); - xfs_buf_relse(bp); - return EFSCORRUPTED; - - } - - src += sizeof(struct xfs_attr3_rmt_hdr); - } - - memcpy(dst, src, byte_cnt); + error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, + &offset, &valuelen, + &dst); xfs_buf_relse(bp); + if (error) + return error; - offset += byte_cnt; - dst += byte_cnt; - valuelen -= byte_cnt; - + /* roll attribute extent map forwards */ lblkno += map[i].br_blockcount; + blkcnt -= map[i].br_blockcount; } } ASSERT(valuelen == 0); @@ -273,17 +399,13 @@ xfs_attr_rmtval_set( struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_bmbt_irec map; - struct xfs_buf *bp; - xfs_daddr_t dblkno; xfs_dablk_t lblkno; xfs_fileoff_t lfileoff = 0; - void *src = args->value; + char *src = args->value; int blkcnt; int valuelen; int nmap; int error; - int hdrcnt = 0; - bool crcs = xfs_sb_version_hascrc(&mp->m_sb); int offset = 0; trace_xfs_attr_rmtval_set(args); @@ -292,21 +414,14 @@ xfs_attr_rmtval_set( * Find a "hole" in the attribute address space large enough for * us to drop the new attribute's value into. Because CRC enable * attributes have headers, we can't just do a straight byte to FSB - * conversion. We calculate the worst case block count in this case - * and we may not need that many, so we have to handle this when - * allocating the blocks below. + * conversion and have to take the header space into account. */ blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); - error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, XFS_ATTR_FORK); if (error) return error; - /* Start with the attribute data. We'll allocate the rest afterwards. */ - if (crcs) - blkcnt = XFS_B_TO_FSB(mp, args->valuelen); - args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; args->rmtblkcnt = blkcnt; @@ -349,31 +464,6 @@ xfs_attr_rmtval_set( (map.br_startblock != HOLESTARTBLOCK)); lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; - hdrcnt++; - - /* - * If we have enough blocks for the attribute data, calculate - * how many extra blocks we need for headers. We might run - * through this multiple times in the case that the additional - * headers in the blocks needed for the data fragments spills - * into requiring more blocks. e.g. for 512 byte blocks, we'll - * spill for another block every 9 headers we require in this - * loop. - * - * Note that this can result in contiguous allocation of blocks, - * so we don't use all the space we allocate for headers as we - * have one less header for each contiguous allocation that - * occurs in the map/write loop below. - */ - if (crcs && blkcnt == 0) { - int total_len; - - total_len = args->valuelen + - hdrcnt * sizeof(struct xfs_attr3_rmt_hdr); - blkcnt = XFS_B_TO_FSB(mp, total_len); - blkcnt -= args->rmtblkcnt; - args->rmtblkcnt += blkcnt; - } /* * Start the next trans in the chain. @@ -390,17 +480,15 @@ xfs_attr_rmtval_set( * the INCOMPLETE flag. */ lblkno = args->rmtblkno; - valuelen = args->valuelen; blkcnt = args->rmtblkcnt; + valuelen = args->valuelen; while (valuelen > 0) { - int byte_cnt; - int hdr_size; - int dblkcnt; - char *buf; + struct xfs_buf *bp; + xfs_daddr_t dblkno; + int dblkcnt; + + ASSERT(blkcnt > 0); - /* - * Try to remember where we decided to put the value. - */ xfs_bmap_init(args->flist, args->firstblock); nmap = 1; error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, @@ -419,29 +507,17 @@ xfs_attr_rmtval_set( if (!bp) return ENOMEM; bp->b_ops = &xfs_attr3_rmt_buf_ops; - buf = bp->b_addr; - - byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, BBTOB(bp->b_length)); - byte_cnt = min_t(int, valuelen, byte_cnt); - hdr_size = xfs_attr3_rmt_hdr_set(mp, dp->i_ino, offset, - byte_cnt, bp); - ASSERT(hdr_size + byte_cnt <= BBTOB(bp->b_length)); - memcpy(buf + hdr_size, src, byte_cnt); - - if (byte_cnt + hdr_size < BBTOB(bp->b_length)) - xfs_buf_zero(bp, byte_cnt + hdr_size, - BBTOB(bp->b_length) - byte_cnt - hdr_size); + xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, + &valuelen, &src); error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ xfs_buf_relse(bp); if (error) return error; - src += byte_cnt; - valuelen -= byte_cnt; - offset += byte_cnt; + /* roll attribute extent map forwards */ lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; } @@ -454,19 +530,17 @@ xfs_attr_rmtval_set( * out-of-line buffer that it is stored on. */ int -xfs_attr_rmtval_remove(xfs_da_args_t *args) +xfs_attr_rmtval_remove( + struct xfs_da_args *args) { - xfs_mount_t *mp; - xfs_bmbt_irec_t map; - xfs_buf_t *bp; - xfs_daddr_t dblkno; - xfs_dablk_t lblkno; - int valuelen, blkcnt, nmap, error, done, committed; + struct xfs_mount *mp = args->dp->i_mount; + xfs_dablk_t lblkno; + int blkcnt; + int error; + int done; trace_xfs_attr_rmtval_remove(args); - mp = args->dp->i_mount; - /* * Roll through the "value", invalidating the attribute value's blocks. * Note that args->rmtblkcnt is the minimum number of data blocks we'll @@ -476,10 +550,13 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) * lookups. */ lblkno = args->rmtblkno; - valuelen = args->valuelen; - blkcnt = xfs_attr3_rmt_blocks(mp, valuelen); - while (valuelen > 0) { - int dblkcnt; + blkcnt = args->rmtblkcnt; + while (blkcnt > 0) { + struct xfs_bmbt_irec map; + struct xfs_buf *bp; + xfs_daddr_t dblkno; + int dblkcnt; + int nmap; /* * Try to remember where we decided to put the value. @@ -506,21 +583,19 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) bp = NULL; } - valuelen -= XFS_ATTR3_RMT_BUF_SPACE(mp, - XFS_FSB_TO_B(mp, map.br_blockcount)); - lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; - blkcnt = max(blkcnt, xfs_attr3_rmt_blocks(mp, valuelen)); } /* * Keep de-allocating extents until the remote-value region is gone. */ - blkcnt = lblkno - args->rmtblkno; lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; done = 0; while (!done) { + int committed; + xfs_bmap_init(args->flist, args->firstblock); error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/xfs_attr_remote.h index c7cca60a062a..92a8fd7977cc 100644 --- a/fs/xfs/xfs_attr_remote.h +++ b/fs/xfs/xfs_attr_remote.h @@ -20,6 +20,14 @@ #define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */ +/* + * There is one of these headers per filesystem block in a remote attribute. + * This is done to ensure there is a 1:1 mapping between the attribute value + * length and the number of blocks needed to store the attribute. This makes the + * verification of a buffer a little more complex, but greatly simplifies the + * allocation, reading and writing of these attributes as we don't have to guess + * the number of blocks needed to store the attribute data. + */ struct xfs_attr3_rmt_hdr { __be32 rm_magic; __be32 rm_offset; @@ -39,6 +47,8 @@ struct xfs_attr3_rmt_hdr { extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; +int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); + int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_set(struct xfs_da_args *args); int xfs_attr_rmtval_remove(struct xfs_da_args *args); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 0d2554299688..1b2472a46e46 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -513,6 +513,7 @@ _xfs_buf_find( xfs_alert(btp->bt_mount, "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", __func__, blkno, eofs); + WARN_ON(1); return NULL; } -- cgit v1.2.3 From 5d477b6079619910dab882fa229cce1f14f86cf8 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 10 May 2013 14:04:11 +0200 Subject: vfs: Fix invalid ida_remove() call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the group id of a shared mount is not allocated, the umount still tries to call mnt_release_group_id(), which eventually hits a kernel warning at ida_remove() spewing a message like: ida_remove called for id=0 which is not allocated. This patch fixes the bug simply checking the group id in the caller. Reported-by: Cristian Rodríguez Signed-off-by: Takashi Iwai Signed-off-by: Al Viro --- fs/pnode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/pnode.c b/fs/pnode.c index 3d2a7141b87a..9af0df15256e 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -83,7 +83,8 @@ static int do_make_slave(struct mount *mnt) if (peer_mnt == mnt) peer_mnt = NULL; } - if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share)) + if (mnt->mnt_group_id && IS_MNT_SHARED(mnt) && + list_empty(&mnt->mnt_share)) mnt_release_group_id(mnt); list_del_init(&mnt->mnt_share); -- cgit v1.2.3 From 1d7095c72d35eee4ebc28e66563e636b9adafeb2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 May 2013 15:21:56 -0400 Subject: qnx6: qnx6_readdir() has a braino in pos calculation We want to mask lower 5 bits out, not leave only those and clear the rest... As it is, we end up always starting to read from the beginning of directory, no matter what the current position had been. Signed-off-by: Al Viro --- fs/qnx6/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index 8798d065e400..afa6be6fc397 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -120,7 +120,7 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) struct inode *inode = file_inode(filp); struct super_block *s = inode->i_sb; struct qnx6_sb_info *sbi = QNX6_SB(s); - loff_t pos = filp->f_pos & (QNX6_DIR_ENTRY_SIZE - 1); + loff_t pos = filp->f_pos & ~(QNX6_DIR_ENTRY_SIZE - 1); unsigned long npages = dir_pages(inode); unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE; -- cgit v1.2.3 From 31abdab9c11bb1694ecd1476a7edbe8e964d94ac Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 May 2013 02:38:52 -0400 Subject: hpfs: deadlock and race in directory lseek() For one thing, there's an ABBA deadlock on hpfs fs-wide lock and i_mutex in hpfs_dir_lseek() - there's a lot of methods that grab the former with the caller already holding the latter, so it must take i_mutex first. For another, locking the damn thing, carefully validating the offset, then dropping locks and assigning the offset is obviously racy. Moreover, we _must_ do hpfs_add_pos(), or the machinery in dnode.c won't modify the sucker on B-tree surgeries. Signed-off-by: Al Viro --- fs/hpfs/dir.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 546f6d39713a..834ac13c04b7 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -33,25 +33,27 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) if (whence == SEEK_DATA || whence == SEEK_HOLE) return -EINVAL; + mutex_lock(&i->i_mutex); hpfs_lock(s); /*printk("dir lseek\n");*/ if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok; - mutex_lock(&i->i_mutex); pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1; while (pos != new_off) { if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh); else goto fail; if (pos == 12) goto fail; } - mutex_unlock(&i->i_mutex); + hpfs_add_pos(i, &filp->f_pos); ok: + filp->f_pos = new_off; hpfs_unlock(s); - return filp->f_pos = new_off; -fail: mutex_unlock(&i->i_mutex); + return new_off; +fail: /*printk("illegal lseek: %016llx\n", new_off);*/ hpfs_unlock(s); + mutex_unlock(&i->i_mutex); return -ESPIPE; } -- cgit v1.2.3 From 448293aadb54ab38b9c053bf9f1eecafdc0ed214 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 22 May 2013 13:41:26 -0400 Subject: befs_readdir(): do not increment ->f_pos if filldir tells us to stop Signed-off-by: Al Viro --- fs/befs/linuxvfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 8615ee89ab55..f95dddced968 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -265,8 +265,8 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir) result = filldir(dirent, keybuf, keysize, filp->f_pos, (ino_t) value, d_type); } - - filp->f_pos++; + if (!result) + filp->f_pos++; befs_debug(sb, "<--- befs_readdir() filp->f_pos %Ld", filp->f_pos); -- cgit v1.2.3 From 0bdc7acba56a7ca4232f15f37b16f7ec079385ab Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 31 May 2013 15:07:52 -0400 Subject: reiserfs: fix spurious multiple-fill in reiserfs_readdir_dentry After sleeping for filldir(), we check to see if the file system has changed and research. The next_pos pointer is updated but its value isn't pushed into the key used for the search itself. As a result, the search returns the same item that the last cycle of the loop did and filldir() is called multiple times with the same data. The end result is that the buffer can contain the same name multiple times. This can be returned to userspace or used internally in the xattr code where it can manifest with the following warning: jdm-20004 reiserfs_delete_xattrs: Couldn't delete all xattrs (-2) reiserfs_for_each_xattr uses reiserfs_readdir_dentry to iterate over the xattr names and ends up trying to unlink the same name twice. The second attempt fails with -ENOENT and the error is returned. At some point I'll need to add support into reiserfsck to remove the orphaned directories left behind when this occurs. The fix is to push the value into the key before researching. Signed-off-by: Jeff Mahoney Signed-off-by: Jan Kara --- fs/reiserfs/dir.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 66c53b642a88..6c2d136561cb 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -204,6 +204,8 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, next_pos = deh_offset(deh) + 1; if (item_moved(&tmp_ih, &path_to_entry)) { + set_cpu_key_k_offset(&pos_key, + next_pos); goto research; } } /* for */ -- cgit v1.2.3 From 4a8570112b76a63ad21cfcbe2783f98f7fd5ba1b Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 31 May 2013 15:54:17 -0400 Subject: reiserfs: fix problems with chowning setuid file w/ xattrs reiserfs_chown_xattrs() takes the iattr struct passed into ->setattr and uses it to iterate over all the attrs associated with a file to change ownership of xattrs (and transfer quota associated with the xattr files). When the setuid bit is cleared during chown, ATTR_MODE and iattr->ia_mode are passed to all the xattrs as well. This means that the xattr directory will have S_IFREG added to its mode bits. This has been prevented in practice by a missing IS_PRIVATE check in reiserfs_acl_chmod, which caused a double-lock to occur while holding the write lock. Since the file system was completely locked up, the writeout of the corrupted mode never happened. This patch temporarily clears everything but ATTR_UID|ATTR_GID for the calls to reiserfs_setattr and adds the missing IS_PRIVATE check. Signed-off-by: Jeff Mahoney Signed-off-by: Jan Kara --- fs/reiserfs/xattr.c | 14 +++++++++++++- fs/reiserfs/xattr_acl.c | 3 +++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 4cce1d9552fb..821bcf70e467 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -318,7 +318,19 @@ static int delete_one_xattr(struct dentry *dentry, void *data) static int chown_one_xattr(struct dentry *dentry, void *data) { struct iattr *attrs = data; - return reiserfs_setattr(dentry, attrs); + int ia_valid = attrs->ia_valid; + int err; + + /* + * We only want the ownership bits. Otherwise, we'll do + * things like change a directory to a regular file if + * ATTR_MODE is set. + */ + attrs->ia_valid &= (ATTR_UID|ATTR_GID); + err = reiserfs_setattr(dentry, attrs); + attrs->ia_valid = ia_valid; + + return err; } /* No i_mutex, but the inode is unconnected. */ diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index d7c01ef64eda..6c8767fdfc6a 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -443,6 +443,9 @@ int reiserfs_acl_chmod(struct inode *inode) int depth; int error; + if (IS_PRIVATE(inode)) + return 0; + if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; -- cgit v1.2.3 From a1457c0ce976bad1356b9b0437f2a5c3ab8a9cfc Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 31 May 2013 15:51:17 -0400 Subject: reiserfs: fix deadlock with nfs racing on create/lookup Reiserfs is currently able to be deadlocked by having two NFS clients where one has removed and recreated a file and another is accessing the file with an open file handle. If one client deletes and recreates a file with timing such that the recreated file obtains the same [dirid, objectid] pair as the original file while another client accesses the file via file handle, the create and lookup can race and deadlock if the lookup manages to create the in-memory inode first. The create thread, in insert_inode_locked4, will hold the write lock while waiting on the other inode to be unlocked. The lookup thread, anywhere in the iget path, will release and reacquire the write lock while it schedules. If it needs to reacquire the lock while the create thread has it, it will never be able to make forward progress because it needs to reacquire the lock before ultimately unlocking the inode. This patch drops the write lock across the insert_inode_locked4 call so that the ordering of inode_wait -> write lock is retained. Since this would have been the case before the BKL push-down, this is safe. Signed-off-by: Jeff Mahoney Signed-off-by: Jan Kara --- fs/reiserfs/inode.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 77d6d47abc83..f844533792ee 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1811,11 +1811,16 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); - if (insert_inode_locked4(inode, args.objectid, - reiserfs_find_actor, &args) < 0) { + + reiserfs_write_unlock(inode->i_sb); + err = insert_inode_locked4(inode, args.objectid, + reiserfs_find_actor, &args); + reiserfs_write_lock(inode->i_sb); + if (err) { err = -EINVAL; goto out_bad_inode; } + if (old_format_only(sb)) /* not a perfect generation count, as object ids can be reused, but ** this is as good as reiserfs can do right now. -- cgit v1.2.3 From 28420dad233520811c0e0860e7fb4975ed863fc4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 3 Jun 2013 14:40:22 +0200 Subject: fuse: fix readdirplus Oops in fuse_dentry_revalidate Fix bug introduced by commit 4582a4ab2a "FUSE: Adapt readdirplus to application usage patterns". We need to check for a positive dentry; negative dentries are not added by readdirplus. Secondly we need to advise the use of readdirplus on the *parent*, otherwise the whole thing is useless. Thirdly all this is only relevant if "readdirplus_auto" mode is selected by the filesystem. We advise the use of readdirplus only if the dentry was still valid. If we had to redo the lookup then there was no use in doing the -plus version. Reported-by: Bernd Schubert Signed-off-by: Miklos Szeredi CC: Feng Shuo CC: stable@vger.kernel.org --- fs/fuse/dir.c | 12 +++++++++--- fs/fuse/inode.c | 7 ++++--- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 254df56b847b..f3f783dc4f75 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -180,6 +180,8 @@ u64 fuse_get_attr_version(struct fuse_conn *fc) static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) { struct inode *inode; + struct dentry *parent; + struct fuse_conn *fc; inode = ACCESS_ONCE(entry->d_inode); if (inode && is_bad_inode(inode)) @@ -187,10 +189,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) else if (fuse_dentry_time(entry) < get_jiffies_64()) { int err; struct fuse_entry_out outarg; - struct fuse_conn *fc; struct fuse_req *req; struct fuse_forget_link *forget; - struct dentry *parent; u64 attr_version; /* For negative dentries, always do a fresh lookup */ @@ -241,8 +241,14 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) entry_attr_timeout(&outarg), attr_version); fuse_change_entry_timeout(entry, &outarg); + } else if (inode) { + fc = get_fuse_conn(inode); + if (fc->readdirplus_auto) { + parent = dget_parent(entry); + fuse_advise_use_readdirplus(parent->d_inode); + dput(parent); + } } - fuse_advise_use_readdirplus(inode); return 1; } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 6201f81e4d3a..9a0cdde14a08 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -867,10 +867,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->dont_mask = 1; if (arg->flags & FUSE_AUTO_INVAL_DATA) fc->auto_inval_data = 1; - if (arg->flags & FUSE_DO_READDIRPLUS) + if (arg->flags & FUSE_DO_READDIRPLUS) { fc->do_readdirplus = 1; - if (arg->flags & FUSE_READDIRPLUS_AUTO) - fc->readdirplus_auto = 1; + if (arg->flags & FUSE_READDIRPLUS_AUTO) + fc->readdirplus_auto = 1; + } if (arg->flags & FUSE_ASYNC_DIO) fc->async_dio = 1; } else { -- cgit v1.2.3 From c9ecf989cc7626e9edf8abef79f64b909542129b Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 30 May 2013 15:35:50 -0400 Subject: fuse: return -EIOCBQUEUED from fuse_direct_IO() for all async requests If request submission fails for an async request (i.e., get_user_pages() returns -ERESTARTSYS), we currently skip the -EIOCBQUEUED return and drop into wait_for_sync_kiocb() forever. Avoid this by always returning -EIOCBQUEUED for async requests. If an error occurs, the error is passed into fuse_aio_complete(), returned via aio_complete() and thus propagated to userspace via io_getevents(). Signed-off-by: Brian Foster Reviewed-by: Maxim Patlasov Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d9f467907791..b3ad8d61a162 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2432,7 +2432,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, fuse_aio_complete(io, ret < 0 ? ret : 0, -1); /* we have a non-extending, async request, so return */ - if (ret > 0 && !is_sync_kiocb(iocb)) + if (!is_sync_kiocb(iocb)) return -EIOCBQUEUED; ret = wait_on_sync_kiocb(iocb); -- cgit v1.2.3 From e5c5f05dca0cf90f0f3bb1aea85dcf658baff185 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Thu, 30 May 2013 16:41:34 +0400 Subject: fuse: fix alignment in short read optimization for async_dio The bug was introduced with async_dio feature: trying to optimize short reads, we cut number-of-bytes-to-read to i_size boundary. Hence the following example: truncate --size=300 /mnt/file dd if=/mnt/file of=/dev/null iflag=direct led to FUSE_READ request of 300 bytes size. This turned out to be problem for userspace fuse implementations who rely on assumption that kernel fuse does not change alignment of request from client FS. The patch turns off the optimization if async_dio is disabled. And, if it's enabled, the patch fixes adjustment of number-of-bytes-to-read to preserve alignment. Note, that we cannot throw out short read optimization entirely because otherwise a direct read of a huge size issued on a tiny file would generate a huge amount of fuse requests and most of them would be ACKed by userspace with zero bytes read. Signed-off-by: Maxim Patlasov Reviewed-by: Brian Foster Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b3ad8d61a162..e570081f9f76 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2373,6 +2373,11 @@ static void fuse_do_truncate(struct file *file) fuse_do_setattr(inode, &attr, file); } +static inline loff_t fuse_round_up(loff_t off) +{ + return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); +} + static ssize_t fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) @@ -2380,6 +2385,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, ssize_t ret = 0; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; + bool async_dio = ff->fc->async_dio; loff_t pos = 0; struct inode *inode; loff_t i_size; @@ -2391,10 +2397,10 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, i_size = i_size_read(inode); /* optimization for short read */ - if (rw != WRITE && offset + count > i_size) { + if (async_dio && rw != WRITE && offset + count > i_size) { if (offset >= i_size) return 0; - count = i_size - offset; + count = min_t(loff_t, count, fuse_round_up(i_size - offset)); } io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); @@ -2412,7 +2418,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * By default, we want to optimize all I/Os with async request * submission to the client filesystem if supported. */ - io->async = ff->fc->async_dio; + io->async = async_dio; io->iocb = iocb; /* @@ -2420,7 +2426,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * to wait on real async I/O requests, so we must submit this request * synchronously. */ - if (!is_sync_kiocb(iocb) && (offset + count > i_size)) + if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE) io->async = false; if (rw == WRITE) -- cgit v1.2.3 From 4a586812055dbd2588b0836ab758f6b9670c3949 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 24 May 2013 15:02:49 -0400 Subject: GFS2: Set log descriptor type for jdata blocks This patch sets the log descriptor type according to whether the journal commit is for (journaled) data or metadata. This was recently broken when the functions to process data and metadata log ops were combined. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/lops.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 68b4c8f1fce8..6c33d7b6e0c4 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -419,7 +419,9 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit, if (total > limit) num = limit; gfs2_log_unlock(sdp); - page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_METADATA, num + 1, num); + page = gfs2_get_log_desc(sdp, + is_databuf ? GFS2_LOG_DESC_JDATA : + GFS2_LOG_DESC_METADATA, num + 1, num); ld = page_address(page); gfs2_log_lock(sdp); ptr = (__be64 *)(ld + 1); -- cgit v1.2.3 From 2b3dcf35810ff02ad0e785527a25c1b13bf82b19 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 28 May 2013 10:04:44 -0400 Subject: GFS2: Increase i_writecount during gfs2_setattr_size This patch calls get_write_access in a few functions. This merely increases inode->i_writecount for the duration of the function. That will ensure that any file closes won't delete the inode's multi-block reservation while the function is running. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 17 +++++++++++++---- fs/gfs2/file.c | 19 +++++++++++++------ fs/gfs2/rgrp.c | 4 +++- 3 files changed, 29 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1dc9a13ce6bb..93b5809c20bb 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1286,17 +1286,26 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) if (ret) return ret; + ret = get_write_access(inode); + if (ret) + return ret; + inode_dio_wait(inode); ret = gfs2_rs_alloc(GFS2_I(inode)); if (ret) - return ret; + goto out; oldsize = inode->i_size; - if (newsize >= oldsize) - return do_grow(inode, newsize); + if (newsize >= oldsize) { + ret = do_grow(inode, newsize); + goto out; + } - return do_shrink(inode, oldsize, newsize); + ret = do_shrink(inode, oldsize, newsize); +out: + put_write_access(inode); + return ret; } int gfs2_truncatei_resume(struct gfs2_inode *ip) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index acd16764b133..ad0dc38d87ab 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -402,16 +402,20 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) /* Update file times before taking page lock */ file_update_time(vma->vm_file); + ret = get_write_access(inode); + if (ret) + goto out; + ret = gfs2_rs_alloc(ip); if (ret) - return ret; + goto out_write_access; gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret) - goto out; + goto out_uninit; set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); @@ -480,12 +484,15 @@ out_quota_unlock: gfs2_quota_unlock(ip); out_unlock: gfs2_glock_dq(&gh); -out: +out_uninit: gfs2_holder_uninit(&gh); if (ret == 0) { set_page_dirty(page); wait_for_stable_page(page); } +out_write_access: + put_write_access(inode); +out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); } @@ -594,10 +601,10 @@ static int gfs2_release(struct inode *inode, struct file *file) kfree(file->private_data); file->private_data = NULL; - if ((file->f_mode & FMODE_WRITE) && - (atomic_read(&inode->i_writecount) == 1)) - gfs2_rs_delete(ip); + if (!(file->f_mode & FMODE_WRITE)) + return 0; + gfs2_rs_delete(ip); return 0; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 5232525934ae..9809156e3d04 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -638,8 +638,10 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) */ void gfs2_rs_delete(struct gfs2_inode *ip) { + struct inode *inode = &ip->i_inode; + down_write(&ip->i_rw_mutex); - if (ip->i_res) { + if (ip->i_res && atomic_read(&inode->i_writecount) <= 1) { gfs2_rs_deltree(ip->i_res); BUG_ON(ip->i_res->rs_free); kmem_cache_free(gfs2_rsrv_cachep, ip->i_res); -- cgit v1.2.3 From e8830d8856e3ad61067dd46c05438b0d75a0441a Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 30 May 2013 09:48:56 -0400 Subject: GFS2: Fall back to vmalloc if kmalloc fails for dir hash tables This version has one more correction: the vmalloc calls are replaced by __vmalloc calls to preserve the GFP_NOFS flag. When GFS2's directory management code allocates buffers for a directory hash table, if it can't get the memory it needs, it currently gives a bad return code. Rather than giving an error, this patch allows it to use virtual memory rather than kernel memory for the hash table. This should make it possible for directories to function properly, even when kernel memory becomes very fragmented. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/dir.c | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index c3e82bd23179..b631c9043460 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -354,22 +354,31 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) return ERR_PTR(-EIO); } - hc = kmalloc(hsize, GFP_NOFS); - ret = -ENOMEM; + hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN); + if (hc == NULL) + hc = __vmalloc(hsize, GFP_NOFS, PAGE_KERNEL); + if (hc == NULL) return ERR_PTR(-ENOMEM); ret = gfs2_dir_read_data(ip, hc, hsize); if (ret < 0) { - kfree(hc); + if (is_vmalloc_addr(hc)) + vfree(hc); + else + kfree(hc); return ERR_PTR(ret); } spin_lock(&inode->i_lock); - if (ip->i_hash_cache) - kfree(hc); - else + if (ip->i_hash_cache) { + if (is_vmalloc_addr(hc)) + vfree(hc); + else + kfree(hc); + } else { ip->i_hash_cache = hc; + } spin_unlock(&inode->i_lock); return ip->i_hash_cache; @@ -385,7 +394,10 @@ void gfs2_dir_hash_inval(struct gfs2_inode *ip) { __be64 *hc = ip->i_hash_cache; ip->i_hash_cache = NULL; - kfree(hc); + if (is_vmalloc_addr(hc)) + vfree(hc); + else + kfree(hc); } static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent) @@ -1113,7 +1125,10 @@ static int dir_double_exhash(struct gfs2_inode *dip) if (IS_ERR(hc)) return PTR_ERR(hc); - h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS); + h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN); + if (hc2 == NULL) + hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL); + if (!hc2) return -ENOMEM; @@ -1145,7 +1160,10 @@ fail: gfs2_dinode_out(dip, dibh->b_data); brelse(dibh); out_kfree: - kfree(hc2); + if (is_vmalloc_addr(hc2)) + vfree(hc2); + else + kfree(hc2); return error; } @@ -1846,6 +1864,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); ht = kzalloc(size, GFP_NOFS); + if (ht == NULL) + ht = vzalloc(size); if (!ht) return -ENOMEM; @@ -1933,7 +1953,10 @@ out_rlist: gfs2_rlist_free(&rlist); gfs2_quota_unhold(dip); out: - kfree(ht); + if (is_vmalloc_addr(ht)) + vfree(ht); + else + kfree(ht); return error; } -- cgit v1.2.3 From a6a4d98b0124b5d3befe8b3a99f51f1b4fcc6dcf Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 29 May 2013 11:51:52 -0400 Subject: GFS2: Don't cache iopen glocks This patch makes GFS2 immediately reclaim/delete all iopen glocks as soon as they're dequeued. This allows deleters to get an EXclusive lock on iopen so files are deleted properly instead of being set as unlinked. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 1 + fs/gfs2/super.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 8833a4f264e3..62b484e4a9e4 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -189,6 +189,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, return inode; fail_refresh: + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; ip->i_iopen_gh.gh_gl->gl_object = NULL; gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_iopen: diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 917c8e1eb4ae..e5639dec66c4 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1444,6 +1444,7 @@ static void gfs2_evict_inode(struct inode *inode) /* Must not read inode block until block type has been verified */ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (unlikely(error)) { + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); goto out; } @@ -1514,8 +1515,10 @@ out_unlock: if (gfs2_rs_active(ip->i_res)) gfs2_rs_deltree(ip->i_res); - if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) + if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq(&ip->i_iopen_gh); + } gfs2_holder_uninit(&ip->i_iopen_gh); gfs2_glock_dq_uninit(&gh); if (error && error != GLR_TRYFAILED && error != -EROFS) @@ -1534,6 +1537,7 @@ out: ip->i_gl = NULL; if (ip->i_iopen_gh.gh_gl) { ip->i_iopen_gh.gh_gl->gl_object = NULL; + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); } } -- cgit v1.2.3