From 4b70ac5fd9b58bfaa5f25b4ea48f528aefbf3308 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 30 Apr 2014 19:02:48 +0200 Subject: aio: change exit_aio() to load mm->ioctx_table once and avoid rcu_read_lock() On 04/30, Benjamin LaHaise wrote: > > > - ctx->mmap_size = 0; > > - > > - kill_ioctx(mm, ctx, NULL); > > + if (ctx) { > > + ctx->mmap_size = 0; > > + kill_ioctx(mm, ctx, NULL); > > + } > > Rather than indenting and moving the two lines changing mmap_size and the > kill_ioctx() call, why not just do "if (!ctx) ... continue;"? That reduces > the number of lines changed and avoid excessive indentation. OK. To me the code looks better/simpler with "if (ctx)", but this is subjective of course, I won't argue. The patch still removes the empty line between mmap_size = 0 and kill_ioctx(), we reset mmap_size only for kill_ioctx(). But feel free to remove this change. ------------------------------------------------------------------------------- Subject: [PATCH v3 1/2] aio: change exit_aio() to load mm->ioctx_table once and avoid rcu_read_lock() 1. We can read ->ioctx_table only once and we do not read rcu_read_lock() or even rcu_dereference(). This mm has no users, nobody else can play with ->ioctx_table. Otherwise the code is buggy anyway, if we need rcu_read_lock() in a loop because ->ioctx_table can be updated then kfree(table) is obviously wrong. 2. Update the comment. "exit_mmap(mm) is coming" is the good reason to avoid munmap(), but another reason is that we simply can't do vm_munmap() unless current->mm == mm and this is not true in general, the caller is mmput(). 3. We do not really need to nullify mm->ioctx_table before return, probably the current code does this to catch the potential problems. But in this case RCU_INIT_POINTER(NULL) looks better. Signed-off-by: Oleg Nesterov Signed-off-by: Benjamin LaHaise --- fs/aio.c | 42 ++++++++++++++++-------------------------- 1 file changed, 16 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 955947ef3e02..b6696462e345 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -791,40 +791,30 @@ EXPORT_SYMBOL(wait_on_sync_kiocb); */ void exit_aio(struct mm_struct *mm) { - struct kioctx_table *table; - struct kioctx *ctx; - unsigned i = 0; - - while (1) { - rcu_read_lock(); - table = rcu_dereference(mm->ioctx_table); - - do { - if (!table || i >= table->nr) { - rcu_read_unlock(); - rcu_assign_pointer(mm->ioctx_table, NULL); - if (table) - kfree(table); - return; - } + struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table); + int i; - ctx = table->table[i++]; - } while (!ctx); + if (!table) + return; - rcu_read_unlock(); + for (i = 0; i < table->nr; ++i) { + struct kioctx *ctx = table->table[i]; + if (!ctx) + continue; /* - * We don't need to bother with munmap() here - - * exit_mmap(mm) is coming and it'll unmap everything. - * Since aio_free_ring() uses non-zero ->mmap_size - * as indicator that it needs to unmap the area, - * just set it to 0; aio_free_ring() is the only - * place that uses ->mmap_size, so it's safe. + * We don't need to bother with munmap() here - exit_mmap(mm) + * is coming and it'll unmap everything. And we simply can't, + * this is not necessarily our ->mm. + * Since kill_ioctx() uses non-zero ->mmap_size as indicator + * that it needs to unmap the area, just set it to 0. */ ctx->mmap_size = 0; - kill_ioctx(mm, ctx, NULL); } + + RCU_INIT_POINTER(mm->ioctx_table, NULL); + kfree(table); } static void put_reqs_available(struct kioctx *ctx, unsigned nr) -- cgit v1.2.3 From 855ef0dec7271ff7be7381feaaf3f4aed80bd503 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 30 Apr 2014 16:16:36 +0200 Subject: aio: kill the misleading rcu read locks in ioctx_add_table() and kill_ioctx() ioctx_add_table() is the writer, it does not need rcu_read_lock() to protect ->ioctx_table. It relies on mm->ioctx_lock and rcu locks just add the confusion. And it doesn't need rcu_dereference() by the same reason, it must see any updates previously done under the same ->ioctx_lock. We could use rcu_dereference_protected() but the patch uses rcu_dereference_raw(), the function is simple enough. The same for kill_ioctx(), although it does not update the pointer. Signed-off-by: Oleg Nesterov Signed-off-by: Benjamin LaHaise --- fs/aio.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index b6696462e345..c1d8c480c138 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -554,8 +554,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) struct aio_ring *ring; spin_lock(&mm->ioctx_lock); - rcu_read_lock(); - table = rcu_dereference(mm->ioctx_table); + table = rcu_dereference_raw(mm->ioctx_table); while (1) { if (table) @@ -563,7 +562,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) if (!table->table[i]) { ctx->id = i; table->table[i] = ctx; - rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); /* While kioctx setup is in progress, @@ -577,8 +575,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) } new_nr = (table ? table->nr : 1) * 4; - - rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * @@ -589,8 +585,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) table->nr = new_nr; spin_lock(&mm->ioctx_lock); - rcu_read_lock(); - old = rcu_dereference(mm->ioctx_table); + old = rcu_dereference_raw(mm->ioctx_table); if (!old) { rcu_assign_pointer(mm->ioctx_table, table); @@ -737,12 +732,9 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, spin_lock(&mm->ioctx_lock); - rcu_read_lock(); - table = rcu_dereference(mm->ioctx_table); - + table = rcu_dereference_raw(mm->ioctx_table); WARN_ON(ctx != table->table[ctx->id]); table->table[ctx->id] = NULL; - rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); /* percpu_ref_kill() will do the necessary call_rcu() */ -- cgit v1.2.3 From be6fb451a24582732c66e28cb0beb3f19c4289fd Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Tue, 22 Jul 2014 09:56:56 -0400 Subject: aio: remove no longer needed preempt_disable() Based on feedback from Jens Axboe on 263782c1c95bbddbb022dc092fd89a36bb8d5577, clean up get/put_reqs_available() to remove the no longer needed preempt_disable() and preempt_enable() pair. Signed-off-by: Benjamin LaHaise Cc: Jens Axboe --- fs/aio.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 8216aa0c7539..9ce9e8ea773f 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -814,10 +814,8 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr) struct kioctx_cpu *kcpu; unsigned long flags; - preempt_disable(); - kcpu = this_cpu_ptr(ctx->cpu); - local_irq_save(flags); + kcpu = this_cpu_ptr(ctx->cpu); kcpu->reqs_available += nr; while (kcpu->reqs_available >= ctx->req_batch * 2) { @@ -826,7 +824,6 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr) } local_irq_restore(flags); - preempt_enable(); } static bool get_reqs_available(struct kioctx *ctx) @@ -835,10 +832,8 @@ static bool get_reqs_available(struct kioctx *ctx) bool ret = false; unsigned long flags; - preempt_disable(); - kcpu = this_cpu_ptr(ctx->cpu); - local_irq_save(flags); + kcpu = this_cpu_ptr(ctx->cpu); if (!kcpu->reqs_available) { int old, avail = atomic_read(&ctx->reqs_available); @@ -858,7 +853,6 @@ static bool get_reqs_available(struct kioctx *ctx) kcpu->reqs_available--; out: local_irq_restore(flags); - preempt_enable(); return ret; } -- cgit v1.2.3 From b53f1f82fbde8dcf34ab7d731c2a9ae6f0d8d2e2 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 23 Jul 2014 18:03:51 +0800 Subject: aio: remove the needless registration of ring file's private_data Remove the registration of ring file's private_data, we do not use it. Reviewed-by: Jeff Moyer Signed-off-by: Gu Zheng Signed-off-by: Benjamin LaHaise --- fs/aio.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 9ce9e8ea773f..55e03a858e8b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -192,7 +192,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) } file->f_flags = O_RDWR; - file->private_data = ctx; return file; } -- cgit v1.2.3 From 8dc4379e17cddad7b2088a3f300ded50d2a6d493 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 23 Jul 2014 18:03:52 +0800 Subject: aio: use the macro rather than the inline magic number Replace the inline magic number with the ready-made macro(AIO_RING_MAGIC), just clean up. Reviewed-by: Jeff Moyer Signed-off-by: Gu Zheng Signed-off-by: Benjamin LaHaise --- fs/aio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 55e03a858e8b..6fc6b9857a58 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -201,7 +201,7 @@ static struct dentry *aio_mount(struct file_system_type *fs_type, static const struct dentry_operations ops = { .d_dname = simple_dname, }; - return mount_pseudo(fs_type, "aio:", NULL, &ops, 0xa10a10a1); + return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC); } /* aio_setup -- cgit v1.2.3 From 2be4e7deec2d4398a0eb2165cc04086ebfc831d2 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 23 Jul 2014 18:03:53 +0800 Subject: aio: fix some comments The function comments of aio_run_iocb and aio_read_events are out of date, so fix them here. Reviewed-by: Jeff Moyer Signed-off-by: Gu Zheng Signed-off-by: Benjamin LaHaise --- fs/aio.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 6fc6b9857a58..d6d9520f6866 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1020,7 +1020,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) } EXPORT_SYMBOL(aio_complete); -/* aio_read_events +/* aio_read_events_ring * Pull an event off of the ioctx's event ring. Returns the number of * events fetched */ @@ -1272,9 +1272,8 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb, } /* - * aio_setup_iocb: - * Performs the initial checks and aio retry method - * setup for the kiocb at the time of io submission. + * aio_run_iocb: + * Performs the initial checks and io submission. */ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, char __user *buf, bool compat) -- cgit v1.2.3 From 00fefb9cf2b5493a86912de55ba912bdfae4a207 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 23 Jul 2014 18:03:54 +0800 Subject: aio: use iovec array rather than the single one Previously, we only offer a single iovec to handle all the read/write cases, so the PREADV/PWRITEV request always need to alloc more iovec buffer when copying user vectors. If we use a tmp iovec array rather than the single one, some small PREADV/PWRITEV workloads(vector size small than the tmp buffer) will not need to alloc more iovec buffer when copying user vectors. Reviewed-by: Jeff Moyer Signed-off-by: Gu Zheng Signed-off-by: Benjamin LaHaise --- fs/aio.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index d6d9520f6866..0fd9181d8c0c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1243,12 +1243,12 @@ static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, if (compat) ret = compat_rw_copy_check_uvector(rw, (struct compat_iovec __user *)buf, - *nr_segs, 1, *iovec, iovec); + *nr_segs, UIO_FASTIOV, *iovec, iovec); else #endif ret = rw_copy_check_uvector(rw, (struct iovec __user *)buf, - *nr_segs, 1, *iovec, iovec); + *nr_segs, UIO_FASTIOV, *iovec, iovec); if (ret < 0) return ret; @@ -1285,7 +1285,7 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, fmode_t mode; aio_rw_op *rw_op; rw_iter_op *iter_op; - struct iovec inline_vec, *iovec = &inline_vec; + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; switch (opcode) { @@ -1320,7 +1320,7 @@ rw_common: if (!ret) ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); if (ret < 0) { - if (iovec != &inline_vec) + if (iovec != inline_vecs) kfree(iovec); return ret; } @@ -1367,7 +1367,7 @@ rw_common: return -EINVAL; } - if (iovec != &inline_vec) + if (iovec != inline_vecs) kfree(iovec); if (ret != -EIOCBQUEUED) { -- cgit v1.2.3 From e91259f3c72648976511b1600f983d202dda1af1 Mon Sep 17 00:00:00 2001 From: Vincent Stehlé Date: Mon, 21 Jul 2014 23:47:36 +0200 Subject: cifs: remove unused function cifs_oplock_break_wait MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 743162013d40 ("sched: Remove proliferation of wait_on_bit() action functions") has removed the call to cifs_oplock_break_wait, making this function unused; remove it. This fixes the following compilation warning: fs/cifs/misc.c:578:1: warning: ‘cifs_oplock_break_wait’ defined but not used [-Wunused-function] Signed-off-by: Vincent Stehlé Cc: Steve French Signed-off-by: Steve French --- fs/cifs/misc.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 81340c6253eb..b7415d596dbd 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -574,13 +574,6 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) cinode->oplock = 0; } -static int -cifs_oplock_break_wait(void *unused) -{ - schedule(); - return signal_pending(current) ? -ERESTARTSYS : 0; -} - /* * We wait for oplock breaks to be processed before we attempt to perform * writes. -- cgit v1.2.3 From 8144f1f69943f447fd1bcb2d26ca011002d5df63 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Aug 2014 13:36:54 -0400 Subject: locks: show delegations as "DELEG" in /proc/locks Now that they are a distinct lease type, show them as such. Cc: J. Bruce Fields Signed-off-by: Jeff Layton --- fs/locks.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index a6f54802d277..356667a434c1 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2452,7 +2452,11 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, seq_puts(f, "FLOCK ADVISORY "); } } else if (IS_LEASE(fl)) { - seq_puts(f, "LEASE "); + if (fl->fl_flags & FL_DELEG) + seq_puts(f, "DELEG "); + else + seq_puts(f, "LEASE "); + if (lease_breaking(fl)) seq_puts(f, "BREAKING "); else if (fl->fl_file) -- cgit v1.2.3 From 566709bd627caf933ab8edffaf598203a0c5c8b2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Aug 2014 14:09:35 -0400 Subject: locks: don't call locks_release_private from locks_copy_lock All callers of locks_copy_lock pass in a brand new file_lock struct, so there's no need to call locks_release_private on it. Replace that with a warning that fires in the event that we receive a target lock that doesn't look like it's properly initialized. Acked-by: J. Bruce Fields Signed-off-by: Jeff Layton --- fs/locks.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 356667a434c1..2c2d4f5022a7 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -285,7 +285,8 @@ EXPORT_SYMBOL(__locks_copy_lock); void locks_copy_lock(struct file_lock *new, struct file_lock *fl) { - locks_release_private(new); + /* "new" must be a freshly-initialized lock */ + WARN_ON_ONCE(new->fl_ops); __locks_copy_lock(new, fl); new->fl_file = fl->fl_file; -- cgit v1.2.3 From 8ae31240ccc8ff339101e2d023b82cb7c93dd002 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 11 Aug 2014 19:29:30 -0500 Subject: Add missing definitions for CIFS File System Attributes Signed-off-by: Steve French Acked-by: Shirish Pargaonkar --- fs/cifs/cifspdu.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 33df36ef9d52..5f9822ac0245 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -2253,6 +2253,29 @@ typedef struct { /* minimum includes first three fields, and empty FS Name */ #define MIN_FS_ATTR_INFO_SIZE 12 + +/* List of FileSystemAttributes - see 2.5.1 of MS-FSCC */ +#define FILE_SUPPORT_INTEGRITY_STREAMS 0x04000000 +#define FILE_SUPPORTS_USN_JOURNAL 0x02000000 +#define FILE_SUPPORTS_OPEN_BY_FILE_ID 0x01000000 +#define FILE_SUPPORTS_EXTENDED_ATTRIBUTES 0x00800000 +#define FILE_SUPPORTS_HARD_LINKS 0x00400000 +#define FILE_SUPPORTS_TRANSACTIONS 0x00200000 +#define FILE_SEQUENTIAL_WRITE_ONCE 0x00100000 +#define FILE_READ_ONLY_VOLUME 0x00080000 +#define FILE_NAMED_STREAMS 0x00040000 +#define FILE_SUPPORTS_ENCRYPTION 0x00020000 +#define FILE_SUPPORTS_OBJECT_IDS 0x00010000 +#define FILE_VOLUME_IS_COMPRESSED 0x00008000 +#define FILE_SUPPORTS_REMOTE_STORAGE 0x00000100 +#define FILE_SUPPORTS_REPARSE_POINTS 0x00000080 +#define FILE_SUPPORTS_SPARSE_FILES 0x00000040 +#define FILE_VOLUME_QUOTAS 0x00000020 +#define FILE_FILE_COMPRESSION 0x00000010 +#define FILE_PERSISTENT_ACLS 0x00000008 +#define FILE_UNICODE_ON_DISK 0x00000004 +#define FILE_CASE_PRESERVED_NAMES 0x00000002 +#define FILE_CASE_SENSITIVE_SEARCH 0x00000001 typedef struct { __le32 Attributes; __le32 MaxPathNameComponentLength; -- cgit v1.2.3 From 3d1a3745d8ca7ccdf00905b01fd5ab42ff523a94 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 11 Aug 2014 21:05:25 -0500 Subject: Add sparse file support to SMB2/SMB3 mounts Many Linux filesystes make a file "sparse" when extending a file with ftruncate. This does work for CIFS to Samba (only) but not for SMB2/SMB3 (to Samba or Windows) since there is a "set sparse" fsctl which is supposed to be sent to mark a file as sparse. This patch marks a file as sparse by sending this simple set sparse fsctl if it is extended more than 2 pages. It has been tested to Windows 8.1, Samba and various SMB2/SMB3 servers which do support setting sparse (and MacOS which does not appear to support the fsctl yet). If a server share does not support setting a file as sparse, then we do not retry setting sparse on that share. The disk space savings for sparse files can be quite large (even more significant on Windows servers than Samba). Signed-off-by: Steve French Reviewed-by: Shirish Pargaonkar --- fs/cifs/cifsglob.h | 1 + fs/cifs/smb2ops.c | 43 +++++++++++++++++++++++++++++++++++++++++++ fs/cifs/smb2pdu.c | 4 +++- 3 files changed, 47 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 0012e1e291d4..bc20a6ea6754 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -883,6 +883,7 @@ struct cifs_tcon { for this mount even if server would support */ bool local_lease:1; /* check leases (only) on local system not remote */ bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */ + bool broken_sparse_sup; /* if server or share does not support sparse */ bool need_reconnect:1; /* connection reset, tid now invalid */ #ifdef CONFIG_CIFS_SMB2 bool print:1; /* set if connection to printer share */ diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 77f8aeb9c2fc..74634369c21e 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -736,6 +736,49 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *cfile, __u64 size, bool set_alloc) { __le64 eof = cpu_to_le64(size); + struct inode *inode; + + /* + * If extending file more than one page make sparse. Many Linux fs + * make files sparse by default when extending via ftruncate + */ + inode = cfile->dentry->d_inode; + + if (!set_alloc && (size > inode->i_size + 8192)) { + struct cifsInodeInfo *cifsi; + __u8 set_sparse = 1; + int rc; + + cifsi = CIFS_I(inode); + + /* if file already sparse or no server support don't bother */ + if (cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) + goto smb2_set_eof; + + /* + * Can't check for sparse support on share the usual way via the + * FS attribute info (FILE_SUPPORTS_SPARSE_FILES) on the share + * since Samba server doesn't set the flag on the share, yet + * supports the set sparse FSCTL and returns sparse correctly + * in the file attributes. If we fail setting sparse though we + * mark that server does not support sparse files for this share + * to avoid repeatedly sending the unsupported fsctl to server + * if the file is repeatedly extended. + */ + if (tcon->broken_sparse_sup) + goto smb2_set_eof; + + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, FSCTL_SET_SPARSE, + true /* is_fctl */, &set_sparse, 1, NULL, NULL); + if (rc) { + tcon->broken_sparse_sup = true; + cifs_dbg(FYI, "set sparse rc = %d\n", rc); + } else + cifsi->cifsAttrs |= FILE_ATTRIBUTE_SPARSE_FILE; + } + +smb2_set_eof: return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, cfile->pid, &eof, false); } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 42ebc1a8be6c..74440af59f35 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1224,7 +1224,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, cifs_dbg(FYI, "SMB2 IOCTL\n"); - *out_data = NULL; + if (out_data != NULL) + *out_data = NULL; + /* zero out returned data len, in case of error */ if (plen) *plen = 0; -- cgit v1.2.3 From b84d49f9440b2b039828f3eb114e4bd4ebeb0c54 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 12 Aug 2014 08:03:49 -0400 Subject: locks: don't reuse file_lock in __posix_lock_file Currently in the case where a new file lock completely replaces the old one, we end up overwriting the existing lock with the new info. This means that we have to call fl_release_private inside i_lock. Change the code to instead copy the info to new_fl, insert that lock into the correct spot and then delete the old lock. In a later patch, we'll defer the freeing of the old lock until after the i_lock has been dropped. Acked-by: J. Bruce Fields Signed-off-by: Jeff Layton --- fs/locks.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 2c2d4f5022a7..7dd4defb4d8d 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1022,18 +1022,21 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str locks_delete_lock(before); continue; } - /* Replace the old lock with the new one. - * Wake up anybody waiting for the old one, - * as the change in lock type might satisfy - * their needs. + /* + * Replace the old lock with new_fl, and + * remove the old one. It's safe to do the + * insert here since we know that we won't be + * using new_fl later, and that the lock is + * just replacing an existing lock. */ - locks_wake_up_blocks(fl); - fl->fl_start = request->fl_start; - fl->fl_end = request->fl_end; - fl->fl_type = request->fl_type; - locks_release_private(fl); - locks_copy_private(fl, request); - request = fl; + error = -ENOLCK; + if (!new_fl) + goto out; + locks_copy_lock(new_fl, request); + request = new_fl; + new_fl = NULL; + locks_delete_lock(before); + locks_insert_lock(before, request); added = true; } } -- cgit v1.2.3 From ed9814d85810c27670987b40c77e8a07105838fe Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Aug 2014 14:20:31 -0400 Subject: locks: defer freeing locks in locks_delete_lock until after i_lock has been dropped In commit 72f98e72551fa (locks: turn lock_flocks into a spinlock), we moved from using the BKL to a global spinlock. With this change, we lost the ability to block in the fl_release_private operation. This is problematic for NFS (and probably some other filesystems as well). Add a new list_head argument to locks_delete_lock. If that argument is non-NULL, then queue any locks that we want to free to the list instead of freeing them. Then, add a new locks_dispose_list function that will walk such a list and call locks_free_lock on them after the i_lock has been dropped. Finally, change all of the callers of locks_delete_lock to pass in a list_head, except for lease_modify. That function can be called long after the i_lock has been acquired. Deferring the freeing of a lease after unlocking it in that function is non-trivial until we overhaul some of the spinlocking in the lease code. Currently though, no filesystem that sets fl_release_private supports leases, so this is not currently a problem. We'll eventually want to make the same change in the lease code, but it needs a lot more work before we can reasonably do so. Acked-by: J. Bruce Fields Signed-off-by: Jeff Layton --- fs/locks.c | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 7dd4defb4d8d..4ce087cca501 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -247,6 +247,18 @@ void locks_free_lock(struct file_lock *fl) } EXPORT_SYMBOL(locks_free_lock); +static void +locks_dispose_list(struct list_head *dispose) +{ + struct file_lock *fl; + + while (!list_empty(dispose)) { + fl = list_first_entry(dispose, struct file_lock, fl_block); + list_del_init(&fl->fl_block); + locks_free_lock(fl); + } +} + void locks_init_lock(struct file_lock *fl) { memset(fl, 0, sizeof(struct file_lock)); @@ -651,12 +663,16 @@ static void locks_unlink_lock(struct file_lock **thisfl_p) * * Must be called with i_lock held! */ -static void locks_delete_lock(struct file_lock **thisfl_p) +static void locks_delete_lock(struct file_lock **thisfl_p, + struct list_head *dispose) { struct file_lock *fl = *thisfl_p; locks_unlink_lock(thisfl_p); - locks_free_lock(fl); + if (dispose) + list_add(&fl->fl_block, dispose); + else + locks_free_lock(fl); } /* Determine if lock sys_fl blocks lock caller_fl. Common functionality @@ -812,6 +828,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) struct inode * inode = file_inode(filp); int error = 0; int found = 0; + LIST_HEAD(dispose); if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) { new_fl = locks_alloc_lock(); @@ -834,7 +851,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) if (request->fl_type == fl->fl_type) goto out; found = 1; - locks_delete_lock(before); + locks_delete_lock(before, &dispose); break; } @@ -881,6 +898,7 @@ out: spin_unlock(&inode->i_lock); if (new_fl) locks_free_lock(new_fl); + locks_dispose_list(&dispose); return error; } @@ -894,6 +912,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str struct file_lock **before; int error; bool added = false; + LIST_HEAD(dispose); /* * We may need two file_lock structures for this operation, @@ -989,7 +1008,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str else request->fl_end = fl->fl_end; if (added) { - locks_delete_lock(before); + locks_delete_lock(before, &dispose); continue; } request = fl; @@ -1019,7 +1038,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str * one (This may happen several times). */ if (added) { - locks_delete_lock(before); + locks_delete_lock(before, &dispose); continue; } /* @@ -1035,7 +1054,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str locks_copy_lock(new_fl, request); request = new_fl; new_fl = NULL; - locks_delete_lock(before); + locks_delete_lock(before, &dispose); locks_insert_lock(before, request); added = true; } @@ -1097,6 +1116,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str locks_free_lock(new_fl); if (new_fl2) locks_free_lock(new_fl2); + locks_dispose_list(&dispose); return error; } @@ -1272,7 +1292,7 @@ int lease_modify(struct file_lock **before, int arg) printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); fl->fl_fasync = NULL; } - locks_delete_lock(before); + locks_delete_lock(before, NULL); } return 0; } @@ -2324,6 +2344,7 @@ void locks_remove_file(struct file *filp) struct inode * inode = file_inode(filp); struct file_lock *fl; struct file_lock **before; + LIST_HEAD(dispose); if (!inode->i_flock) return; @@ -2369,12 +2390,13 @@ void locks_remove_file(struct file *filp) fl->fl_type, fl->fl_flags, fl->fl_start, fl->fl_end); - locks_delete_lock(before); + locks_delete_lock(before, &dispose); continue; } before = &fl->fl_next; } spin_unlock(&inode->i_lock); + locks_dispose_list(&dispose); } /** -- cgit v1.2.3 From 2dfb928f7e5977a3faac943c134bbda5ae492629 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Aug 2014 18:14:12 -0400 Subject: locks: move locks_free_lock calls in do_fcntl_add_lease outside spinlock There's no need to call locks_free_lock here while still holding the i_lock. Defer that until the lock has been dropped. Acked-by: J. Bruce Fields Signed-off-by: Jeff Layton --- fs/locks.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 4ce087cca501..cb66fb05ad4a 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1761,13 +1761,10 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) ret = fl; spin_lock(&inode->i_lock); error = __vfs_setlease(filp, arg, &ret); - if (error) { - spin_unlock(&inode->i_lock); - locks_free_lock(fl); - goto out_free_fasync; - } - if (ret != fl) - locks_free_lock(fl); + if (error) + goto out_unlock; + if (ret == fl) + fl = NULL; /* * fasync_insert_entry() returns the old entry if any. @@ -1779,9 +1776,10 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) new = NULL; error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); +out_unlock: spin_unlock(&inode->i_lock); - -out_free_fasync: + if (fl) + locks_free_lock(fl); if (new) fasync_free(new); return error; -- cgit v1.2.3 From ba7b6e62f420f5a8832bc161ab0c7ba767f65b3d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 1 Jul 2014 16:21:33 +0200 Subject: btrfs: adjust statfs calculations according to raid profiles This has been discussed in thread: http://thread.gmane.org/gmane.comp.file-systems.btrfs/32528 and this patch implements this proposal: http://thread.gmane.org/gmane.comp.file-systems.btrfs/32536 Works fine for "clean" raid profiles where the raid factor correction does the right job. Otherwise it's pessimistic and may show low space although there's still some left. The df nubmers are lightly wrong in case of mixed block groups, but this is not a major usecase and can be addressed later. The RAID56 numbers are wrong almost the same way as before and will be addressed separately. CC: Hugo Mills CC: cwillu CC: Josef Bacik Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/super.c | 51 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8e16bca69c56..18cdcd1dbe11 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1672,6 +1672,21 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) return 0; } +/* + * Calculate numbers for 'df', pessimistic in case of mixed raid profiles. + * + * If there's a redundant raid level at DATA block groups, use the respective + * multiplier to scale the sizes. + * + * Unused device space usage is based on simulating the chunk allocator + * algorithm that respects the device sizes, order of allocations and the + * 'alloc_start' value, this is a close approximation of the actual use but + * there are other factors that may change the result (like a new metadata + * chunk). + * + * FIXME: not accurate for mixed block groups, total and free/used are ok, + * available appears slightly larger. + */ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); @@ -1682,6 +1697,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 total_free_data = 0; int bits = dentry->d_sb->s_blocksize_bits; __be32 *fsid = (__be32 *)fs_info->fsid; + unsigned factor = 1; + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; /* holding chunk_muext to avoid allocating new chunks */ @@ -1689,30 +1706,52 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { + int i; + total_free_data += found->disk_total - found->disk_used; total_free_data -= btrfs_account_ro_block_groups_free_space(found); + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + if (!list_empty(&found->block_groups[i])) { + switch (i) { + case BTRFS_RAID_DUP: + case BTRFS_RAID_RAID1: + case BTRFS_RAID_RAID10: + factor = 2; + } + } + } } total_used += found->disk_used; } + rcu_read_unlock(); - buf->f_namelen = BTRFS_NAME_LEN; - buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; - buf->f_bfree = buf->f_blocks - (total_used >> bits); - buf->f_bsize = dentry->d_sb->s_blocksize; - buf->f_type = BTRFS_SUPER_MAGIC; + buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor); + buf->f_blocks >>= bits; + buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits); + + /* Account global block reserve as used, it's in logical size already */ + spin_lock(&block_rsv->lock); + buf->f_bfree -= block_rsv->size >> bits; + spin_unlock(&block_rsv->lock); + buf->f_bavail = total_free_data; ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); if (ret) { mutex_unlock(&fs_info->chunk_mutex); return ret; } - buf->f_bavail += total_free_data; + buf->f_bavail += div_u64(total_free_data, factor); buf->f_bavail = buf->f_bavail >> bits; mutex_unlock(&fs_info->chunk_mutex); + buf->f_type = BTRFS_SUPER_MAGIC; + buf->f_bsize = dentry->d_sb->s_blocksize; + buf->f_namelen = BTRFS_NAME_LEN; + /* We treat it as constant endianness (it doesn't matter _which_) because we want the fsid to come out the same whether mounted on a big-endian or little-endian host */ -- cgit v1.2.3 From e339a6b097c515a31ce230d498c44ff2e89f1cf4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 2 Jul 2014 10:54:25 -0700 Subject: Btrfs: __btrfs_mod_ref should always use no_quota Before I extended the no_quota arg to btrfs_dec/inc_ref because I didn't understand how snapshot delete was using it and assumed that we needed the quota operations there. With Mark's work this has turned out to be not the case, we _always_ need to use no_quota for btrfs_dec/inc_ref, so just drop the argument and make __btrfs_mod_ref call it's process function with no_quota set always. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 20 ++++++++++---------- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/extent-tree.c | 24 +++++++++++------------- 3 files changed, 23 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index aeab453b8e24..44ee5d2e52a4 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -280,9 +280,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, WARN_ON(btrfs_header_generation(buf) > trans->transid); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); + ret = btrfs_inc_ref(trans, root, cow, 0); if (ret) return ret; @@ -1035,14 +1035,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if ((owner == root->root_key.objectid || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { - ret = btrfs_inc_ref(trans, root, buf, 1, 1); + ret = btrfs_inc_ref(trans, root, buf, 1); BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_dec_ref(trans, root, buf, 0, 1); + ret = btrfs_dec_ref(trans, root, buf, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); BUG_ON(ret); /* -ENOMEM */ } new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; @@ -1050,9 +1050,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); + ret = btrfs_inc_ref(trans, root, cow, 0); BUG_ON(ret); /* -ENOMEM */ } if (new_flags != 0) { @@ -1069,11 +1069,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_inc_ref(trans, root, cow, 1, 1); + ret = btrfs_inc_ref(trans, root, cow, 1); else - ret = btrfs_inc_ref(trans, root, cow, 0, 1); + ret = btrfs_inc_ref(trans, root, cow, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_dec_ref(trans, root, buf, 1, 1); + ret = btrfs_dec_ref(trans, root, buf, 1); BUG_ON(ret); /* -ENOMEM */ } clean_tree_block(trans, root, buf); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index be91397f4e92..8e29b614fe93 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3326,9 +3326,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int no_quota); + struct extent_buffer *buf, int full_backref); int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int no_quota); + struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 flags, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 813537f362f9..591893fe58cb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3057,7 +3057,7 @@ out: static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - int full_backref, int inc, int no_quota) + int full_backref, int inc) { u64 bytenr; u64 num_bytes; @@ -3111,7 +3111,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset, no_quota); + key.offset, 1); if (ret) goto fail; } else { @@ -3119,7 +3119,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = btrfs_level_size(root, level - 1); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, level - 1, 0, - no_quota); + 1); if (ret) goto fail; } @@ -3130,15 +3130,15 @@ fail: } int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int no_quota) + struct extent_buffer *buf, int full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota); + return __btrfs_mod_ref(trans, root, buf, full_backref, 1); } int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref, int no_quota) + struct extent_buffer *buf, int full_backref) { - return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota); + return __btrfs_mod_ref(trans, root, buf, full_backref, 0); } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -7532,9 +7532,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, /* wc->stage == UPDATE_BACKREF */ if (!(wc->flags[level] & flag)) { BUG_ON(!path->locks[level]); - ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc); + ret = btrfs_inc_ref(trans, root, eb, 1); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc); + ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ ret = btrfs_set_disk_extent_flags(trans, root, eb->start, eb->len, flag, @@ -7769,11 +7769,9 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (wc->refs[level] == 1) { if (level == 0) { if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) - ret = btrfs_dec_ref(trans, root, eb, 1, - wc->for_reloc); + ret = btrfs_dec_ref(trans, root, eb, 1); else - ret = btrfs_dec_ref(trans, root, eb, 0, - wc->for_reloc); + ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ } /* make block locked assertion in clean_tree_block happy */ -- cgit v1.2.3 From 6f7ff6d7832c6be13e8c95598884dbc40ad69fb7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 2 Jul 2014 20:07:54 +0100 Subject: Btrfs: read lock extent buffer while walking backrefs Before processing the extent buffer, acquire a read lock on it, so that we're safe against concurrent updates on the extent buffer. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index e25564bfcb46..a1efd39ca28a 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1001,8 +1001,11 @@ again: ret = -EIO; goto out; } + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); ret = find_extent_in_eb(eb, bytenr, *extent_item_pos, &eie); + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); if (ret < 0) goto out; -- cgit v1.2.3 From 1152651a081720ef6a8c76bb7da676e8c900ac30 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 17 Jul 2014 12:39:01 -0700 Subject: btrfs: qgroup: account shared subtrees during snapshot delete During its tree walk, btrfs_drop_snapshot() will skip any shared subtrees it encounters. This is incorrect when we have qgroups turned on as those subtrees need to have their contents accounted. In particular, the case we're concerned with is when removing our snapshot root leaves the subtree with only one root reference. In those cases we need to find the last remaining root and add each extent in the subtree to the corresponding qgroup exclusive counts. This patch implements the shared subtree walk and a new qgroup operation, BTRFS_QGROUP_OPER_SUB_SUBTREE. When an operation of this type is encountered during qgroup accounting, we search for any root references to that extent and in the case that we find only one reference left, we go ahead and do the math on it's exclusive counts. Signed-off-by: Mark Fasheh Reviewed-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 261 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/qgroup.c | 164 +++++++++++++++++++++++++++++++ fs/btrfs/qgroup.h | 1 + 3 files changed, 426 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 591893fe58cb..102ed3143976 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7478,6 +7478,220 @@ reada: wc->reada_slot = slot; } +static int account_leaf_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *eb) +{ + int nr = btrfs_header_nritems(eb); + int i, extent_type, ret; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + u64 bytenr, num_bytes; + + for (i = 0; i < nr; i++) { + btrfs_item_key_to_cpu(eb, &key, i); + + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + + fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); + /* filter out non qgroup-accountable extents */ + extent_type = btrfs_file_extent_type(eb, fi); + + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + continue; + + bytenr = btrfs_file_extent_disk_bytenr(eb, fi); + if (!bytenr) + continue; + + num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); + + ret = btrfs_qgroup_record_ref(trans, root->fs_info, + root->objectid, + bytenr, num_bytes, + BTRFS_QGROUP_OPER_SUB_SUBTREE, 0); + if (ret) + return ret; + } + return 0; +} + +/* + * Walk up the tree from the bottom, freeing leaves and any interior + * nodes which have had all slots visited. If a node (leaf or + * interior) is freed, the node above it will have it's slot + * incremented. The root node will never be freed. + * + * At the end of this function, we should have a path which has all + * slots incremented to the next position for a search. If we need to + * read a new node it will be NULL and the node above it will have the + * correct slot selected for a later read. + * + * If we increment the root nodes slot counter past the number of + * elements, 1 is returned to signal completion of the search. + */ +static int adjust_slots_upwards(struct btrfs_root *root, + struct btrfs_path *path, int root_level) +{ + int level = 0; + int nr, slot; + struct extent_buffer *eb; + + if (root_level == 0) + return 1; + + while (level <= root_level) { + eb = path->nodes[level]; + nr = btrfs_header_nritems(eb); + path->slots[level]++; + slot = path->slots[level]; + if (slot >= nr || level == 0) { + /* + * Don't free the root - we will detect this + * condition after our loop and return a + * positive value for caller to stop walking the tree. + */ + if (level != root_level) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; + + free_extent_buffer(eb); + path->nodes[level] = NULL; + path->slots[level] = 0; + } + } else { + /* + * We have a valid slot to walk back down + * from. Stop here so caller can process these + * new nodes. + */ + break; + } + + level++; + } + + eb = path->nodes[root_level]; + if (path->slots[root_level] >= btrfs_header_nritems(eb)) + return 1; + + return 0; +} + +/* + * root_eb is the subtree root and is locked before this function is called. + */ +static int account_shared_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *root_eb, + u64 root_gen, + int root_level) +{ + int ret = 0; + int level; + struct extent_buffer *eb = root_eb; + struct btrfs_path *path = NULL; + + BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL); + BUG_ON(root_eb == NULL); + + if (!root->fs_info->quota_enabled) + return 0; + + if (!extent_buffer_uptodate(root_eb)) { + ret = btrfs_read_buffer(root_eb, root_gen); + if (ret) + goto out; + } + + if (root_level == 0) { + ret = account_leaf_items(trans, root, root_eb); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * Walk down the tree. Missing extent blocks are filled in as + * we go. Metadata is accounted every time we read a new + * extent block. + * + * When we reach a leaf, we account for file extent items in it, + * walk back up the tree (adjusting slot pointers as we go) + * and restart the search process. + */ + extent_buffer_get(root_eb); /* For path */ + path->nodes[root_level] = root_eb; + path->slots[root_level] = 0; + path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ +walk_down: + level = root_level; + while (level >= 0) { + if (path->nodes[level] == NULL) { + int child_bsize = root->nodesize; + int parent_slot; + u64 child_gen; + u64 child_bytenr; + + /* We need to get child blockptr/gen from + * parent before we can read it. */ + eb = path->nodes[level + 1]; + parent_slot = path->slots[level + 1]; + child_bytenr = btrfs_node_blockptr(eb, parent_slot); + child_gen = btrfs_node_ptr_generation(eb, parent_slot); + + eb = read_tree_block(root, child_bytenr, child_bsize, + child_gen); + if (!eb || !extent_buffer_uptodate(eb)) { + ret = -EIO; + goto out; + } + + path->nodes[level] = eb; + path->slots[level] = 0; + + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + path->locks[level] = BTRFS_READ_LOCK_BLOCKING; + + ret = btrfs_qgroup_record_ref(trans, root->fs_info, + root->objectid, + child_bytenr, + child_bsize, + BTRFS_QGROUP_OPER_SUB_SUBTREE, + 0); + if (ret) + goto out; + + } + + if (level == 0) { + ret = account_leaf_items(trans, root, path->nodes[level]); + if (ret) + goto out; + + /* Nonzero return here means we completed our search */ + ret = adjust_slots_upwards(root, path, root_level); + if (ret) + break; + + /* Restart search with new slots */ + goto walk_down; + } + + level--; + } + + ret = 0; +out: + btrfs_free_path(path); + + return ret; +} + /* * helper to process tree block while walking down the tree. * @@ -7581,6 +7795,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, int level = wc->level; int reada = 0; int ret = 0; + bool need_account = false; generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); @@ -7626,6 +7841,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (wc->stage == DROP_REFERENCE) { if (wc->refs[level - 1] > 1) { + need_account = true; if (level == 1 && (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) goto skip; @@ -7689,6 +7905,16 @@ skip: parent = 0; } + if (need_account) { + ret = account_shared_subtree(trans, root, next, + generation, level - 1); + if (ret) { + printk_ratelimited(KERN_ERR "BTRFS: %s Error " + "%d accounting shared subtree. Quota " + "is out of sync, rescan required.\n", + root->fs_info->sb->s_id, ret); + } + } ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, root->root_key.objectid, level - 1, 0, 0); BUG_ON(ret); /* -ENOMEM */ @@ -7773,6 +7999,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, else ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ + ret = account_leaf_items(trans, root, eb); + if (ret) { + printk_ratelimited(KERN_ERR "BTRFS: %s Error " + "%d accounting leaf items. Quota " + "is out of sync, rescan required.\n", + root->fs_info->sb->s_id, ret); + } } /* make block locked assertion in clean_tree_block happy */ if (!path->locks[level] && @@ -7898,6 +8131,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int level; bool root_dropped = false; + btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid); + path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; @@ -8023,6 +8258,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out_end_trans; } + /* + * Qgroup update accounting is run from + * delayed ref handling. This usually works + * out because delayed refs are normally the + * only way qgroup updates are added. However, + * we may have added updates during our tree + * walk so run qgroups here to make sure we + * don't lose any updates. + */ + ret = btrfs_delayed_qgroup_accounting(trans, + root->fs_info); + if (ret) + printk_ratelimited(KERN_ERR "BTRFS: Failure %d " + "running qgroup updates " + "during snapshot delete. " + "Quota is out of sync, " + "rescan required.\n", ret); + btrfs_end_transaction_throttle(trans, tree_root); if (!for_reloc && btrfs_need_cleaner_sleep(root)) { pr_debug("BTRFS: drop snapshot early exit\n"); @@ -8076,6 +8329,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } root_dropped = true; out_end_trans: + ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info); + if (ret) + printk_ratelimited(KERN_ERR "BTRFS: Failure %d " + "running qgroup updates " + "during snapshot delete. " + "Quota is out of sync, " + "rescan required.\n", ret); + btrfs_end_transaction_throttle(trans, tree_root); out_free: kfree(wc); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 98cb6b2630f9..65b62c467e28 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1201,6 +1201,50 @@ out: mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; } + +static int comp_oper_exist(struct btrfs_qgroup_operation *oper1, + struct btrfs_qgroup_operation *oper2) +{ + /* + * Ignore seq and type here, we're looking for any operation + * at all related to this extent on that root. + */ + if (oper1->bytenr < oper2->bytenr) + return -1; + if (oper1->bytenr > oper2->bytenr) + return 1; + if (oper1->ref_root < oper2->ref_root) + return -1; + if (oper1->ref_root > oper2->ref_root) + return 1; + return 0; +} + +static int qgroup_oper_exists(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct rb_node *n; + struct btrfs_qgroup_operation *cur; + int cmp; + + spin_lock(&fs_info->qgroup_op_lock); + n = fs_info->qgroup_op_tree.rb_node; + while (n) { + cur = rb_entry(n, struct btrfs_qgroup_operation, n); + cmp = comp_oper_exist(cur, oper); + if (cmp < 0) { + n = n->rb_right; + } else if (cmp) { + n = n->rb_left; + } else { + spin_unlock(&fs_info->qgroup_op_lock); + return -EEXIST; + } + } + spin_unlock(&fs_info->qgroup_op_lock); + return 0; +} + static int comp_oper(struct btrfs_qgroup_operation *oper1, struct btrfs_qgroup_operation *oper2) { @@ -1290,6 +1334,23 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); INIT_LIST_HEAD(&oper->elem.list); oper->elem.seq = 0; + + if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) { + /* + * If any operation for this bytenr/ref_root combo + * exists, then we know it's not exclusively owned and + * shouldn't be queued up. + * + * This also catches the case where we have a cloned + * extent that gets queued up multiple times during + * drop snapshot. + */ + if (qgroup_oper_exists(fs_info, oper)) { + kfree(oper); + return 0; + } + } + ret = insert_qgroup_oper(fs_info, oper); if (ret) { /* Shouldn't happen so have an assert for developers */ @@ -1883,6 +1944,106 @@ out: return ret; } +/* + * Process a reference to a shared subtree. This type of operation is + * queued during snapshot removal when we encounter extents which are + * shared between more than one root. + */ +static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_operation *oper) +{ + struct ulist *roots = NULL; + struct ulist_node *unode; + struct ulist_iterator uiter; + struct btrfs_qgroup_list *glist; + struct ulist *parents; + int ret = 0; + struct btrfs_qgroup *qg; + u64 root_obj = 0; + struct seq_list elem = {}; + + parents = ulist_alloc(GFP_NOFS); + if (!parents) + return -ENOMEM; + + btrfs_get_tree_mod_seq(fs_info, &elem); + ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, + elem.seq, &roots); + btrfs_put_tree_mod_seq(fs_info, &elem); + if (ret < 0) + return ret; + + if (roots->nnodes != 1) + goto out; + + ULIST_ITER_INIT(&uiter); + unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */ + /* + * If we find our ref root then that means all refs + * this extent has to the root have not yet been + * deleted. In that case, we do nothing and let the + * last ref for this bytenr drive our update. + * + * This can happen for example if an extent is + * referenced multiple times in a snapshot (clone, + * etc). If we are in the middle of snapshot removal, + * queued updates for such an extent will find the + * root if we have not yet finished removing the + * snapshot. + */ + if (unode->val == oper->ref_root) + goto out; + + root_obj = unode->val; + BUG_ON(!root_obj); + + spin_lock(&fs_info->qgroup_lock); + qg = find_qgroup_rb(fs_info, root_obj); + if (!qg) + goto out_unlock; + + qg->excl += oper->num_bytes; + qg->excl_cmpr += oper->num_bytes; + qgroup_dirty(fs_info, qg); + + /* + * Adjust counts for parent groups. First we find all + * parents, then in the 2nd loop we do the adjustment + * while adding parents of the parents to our ulist. + */ + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(parents, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out_unlock; + } + + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(parents, &uiter))) { + qg = u64_to_ptr(unode->aux); + qg->excl += oper->num_bytes; + qg->excl_cmpr += oper->num_bytes; + qgroup_dirty(fs_info, qg); + + /* Add any parents of the parents */ + list_for_each_entry(glist, &qg->groups, next_group) { + ret = ulist_add(parents, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out_unlock; + } + } + +out_unlock: + spin_unlock(&fs_info->qgroup_lock); + +out: + ulist_free(roots); + ulist_free(parents); + return ret; +} + /* * btrfs_qgroup_account_ref is called for every ref that is added to or deleted * from the fs. First, all roots referencing the extent are searched, and @@ -1920,6 +2081,9 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, case BTRFS_QGROUP_OPER_SUB_SHARED: ret = qgroup_shared_accounting(trans, fs_info, oper); break; + case BTRFS_QGROUP_OPER_SUB_SUBTREE: + ret = qgroup_subtree_accounting(trans, fs_info, oper); + break; default: ASSERT(0); } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 5952ff1fbd7a..18cc68ca3090 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -44,6 +44,7 @@ enum btrfs_qgroup_operation_type { BTRFS_QGROUP_OPER_ADD_SHARED, BTRFS_QGROUP_OPER_SUB_EXCL, BTRFS_QGROUP_OPER_SUB_SHARED, + BTRFS_QGROUP_OPER_SUB_SUBTREE, }; struct btrfs_qgroup_operation { -- cgit v1.2.3 From f90e579c2b391979630b3343de0be65ab1b478ce Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 17 Jul 2014 12:39:04 -0700 Subject: btrfs: correctly handle return from ulist_add ulist_add() can return '1' on sucess, which qgroup_subtree_accounting() doesn't take into account. As a result, that value can be bubbled up to callers, causing an error to be printed. Fix this by only returning the value of ulist_add() when it indicates an error. Signed-off-by: Mark Fasheh Signed-off-by: Chris Mason --- fs/btrfs/qgroup.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 65b62c467e28..b497498484be 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1959,6 +1959,7 @@ static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, struct btrfs_qgroup_list *glist; struct ulist *parents; int ret = 0; + int err; struct btrfs_qgroup *qg; u64 root_obj = 0; struct seq_list elem = {}; @@ -2013,10 +2014,12 @@ static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, * while adding parents of the parents to our ulist. */ list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(parents, glist->group->qgroupid, + err = ulist_add(parents, glist->group->qgroupid, ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) + if (err < 0) { + ret = err; goto out_unlock; + } } ULIST_ITER_INIT(&uiter); @@ -2028,10 +2031,12 @@ static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, /* Add any parents of the parents */ list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(parents, glist->group->qgroupid, + err = ulist_add(parents, glist->group->qgroupid, ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) + if (err < 0) { + ret = err; goto out_unlock; + } } } -- cgit v1.2.3 From ce62003f690dff38d3164a632ec69efa15c32cbf Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 24 Jul 2014 22:48:05 +0800 Subject: Btrfs: fix compressed write corruption on enospc When failing to allocate space for the whole compressed extent, we'll fallback to uncompressed IO, but we've forgotten to redirty the pages which belong to this compressed extent, and these 'clean' pages will simply skip 'submit' part and go to endio directly, at last we got data corruption as we write nothing. Signed-off-by: Liu Bo Tested-By: Martin Steigerwald Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3668048e16f8..8ea7610fbaf3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -709,6 +709,18 @@ retry: unlock_extent(io_tree, async_extent->start, async_extent->start + async_extent->ram_size - 1); + + /* + * we need to redirty the pages if we decide to + * fallback to uncompressed IO, otherwise we + * will not submit these pages down to lower + * layers. + */ + extent_range_redirty_for_io(inode, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1); + goto retry; } goto out_free; -- cgit v1.2.3 From 4eb1f66dce6c4dc28dd90a7ffbe6b2b1cb08aa4e Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 28 Jul 2014 10:57:04 +0200 Subject: Btrfs: Fix memory corruption by ulist_add_merge() on 32bit arch We've got bug reports that btrfs crashes when quota is enabled on 32bit kernel, typically with the Oops like below: BUG: unable to handle kernel NULL pointer dereference at 00000004 IP: [] find_parent_nodes+0x360/0x1380 [btrfs] *pde = 00000000 Oops: 0000 [#1] SMP CPU: 0 PID: 151 Comm: kworker/u8:2 Tainted: G S W 3.15.2-1.gd43d97e-default #1 Workqueue: btrfs-qgroup-rescan normal_work_helper [btrfs] task: f1478130 ti: f147c000 task.ti: f147c000 EIP: 0060:[] EFLAGS: 00010213 CPU: 0 EIP is at find_parent_nodes+0x360/0x1380 [btrfs] EAX: f147dda8 EBX: f147ddb0 ECX: 00000011 EDX: 00000000 ESI: 00000000 EDI: f147dda4 EBP: f147ddf8 ESP: f147dd38 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 CR0: 8005003b CR2: 00000004 CR3: 00bf3000 CR4: 00000690 Stack: 00000000 00000000 f147dda4 00000050 00000001 00000000 00000001 00000050 00000001 00000000 d3059000 00000001 00000022 000000a8 00000000 00000000 00000000 000000a1 00000000 00000000 00000001 00000000 00000000 11800000 Call Trace: [] __btrfs_find_all_roots+0x9d/0xf0 [btrfs] [] btrfs_qgroup_rescan_worker+0x401/0x760 [btrfs] [] normal_work_helper+0xc8/0x270 [btrfs] [] process_one_work+0x11b/0x390 [] worker_thread+0x101/0x340 [] kthread+0x9b/0xb0 [] ret_from_kernel_thread+0x21/0x30 [] kthread_create_on_node+0x110/0x110 This indicates a NULL corruption in prefs_delayed list. The further investigation and bisection pointed that the call of ulist_add_merge() results in the corruption. ulist_add_merge() takes u64 as aux and writes a 64bit value into old_aux. The callers of this function in backref.c, however, pass a pointer of a pointer to old_aux. That is, the function overwrites 64bit value on 32bit pointer. This caused a NULL in the adjacent variable, in this case, prefs_delayed. Here is a quick attempt to band-aid over this: a new function, ulist_add_merge_ptr() is introduced to pass/store properly a pointer value instead of u64. There are still ugly void ** cast remaining in the callers because void ** cannot be taken implicitly. But, it's safer than explicit cast to u64, anyway. Bugzilla: https://bugzilla.novell.com/show_bug.cgi?id=887046 Cc: [v3.11+] Signed-off-by: Takashi Iwai Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 11 +++++------ fs/btrfs/ulist.h | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index a1efd39ca28a..54a201dac7f9 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -276,9 +276,8 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, } if (ret > 0) goto next; - ret = ulist_add_merge(parents, eb->start, - (uintptr_t)eie, - (u64 *)&old, GFP_NOFS); + ret = ulist_add_merge_ptr(parents, eb->start, + eie, (void **)&old, GFP_NOFS); if (ret < 0) break; if (!ret && extent_item_pos) { @@ -1011,9 +1010,9 @@ again: goto out; ref->inode_list = eie; } - ret = ulist_add_merge(refs, ref->parent, - (uintptr_t)ref->inode_list, - (u64 *)&eie, GFP_NOFS); + ret = ulist_add_merge_ptr(refs, ref->parent, + ref->inode_list, + (void **)&eie, GFP_NOFS); if (ret < 0) goto out; if (!ret && extent_item_pos) { diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 7f78cbf5cf41..4c29db604bbe 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -57,6 +57,21 @@ void ulist_free(struct ulist *ulist); int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, u64 *old_aux, gfp_t gfp_mask); + +/* just like ulist_add_merge() but take a pointer for the aux data */ +static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux, + void **old_aux, gfp_t gfp_mask) +{ +#if BITS_PER_LONG == 32 + u64 old64 = (uintptr_t)*old_aux; + int ret = ulist_add_merge(ulist, val, (uintptr_t)aux, &old64, gfp_mask); + *old_aux = (void *)((uintptr_t)old64); + return ret; +#else + return ulist_add_merge(ulist, val, (u64)aux, (u64 *)old_aux, gfp_mask); +#endif +} + struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter); -- cgit v1.2.3 From 27b9a8122ff71a8cadfbffb9c4f0694300464f3b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 9 Aug 2014 21:22:27 +0100 Subject: Btrfs: fix csum tree corruption, duplicate and outdated checksums Under rare circumstances we can end up leaving 2 versions of a checksum for the same file extent range. The reason for this is that after calling btrfs_next_leaf we process slot 0 of the leaf it returns, instead of processing the slot set in path->slots[0]. Most of the time (by far) path->slots[0] is 0, but after btrfs_next_leaf() releases the path and before it searches for the next leaf, another task might cause a split of the next leaf, which migrates some of its keys to the leaf we were processing before calling btrfs_next_leaf(). In this case btrfs_next_leaf() returns again the same leaf but with path->slots[0] having a slot number corresponding to the first new key it got, that is, a slot number that didn't exist before calling btrfs_next_leaf(), as the leaf now has more keys than it had before. So we must really process the returned leaf starting at path->slots[0] always, as it isn't always 0, and the key at slot 0 can have an offset much lower than our search offset/bytenr. For example, consider the following scenario, where we have: sums->bytenr: 40157184, sums->len: 16384, sums end: 40173568 four 4kb file data blocks with offsets 40157184, 40161280, 40165376, 40169472 Leaf N: slot = 0 slot = btrfs_header_nritems() - 1 |-------------------------------------------------------------------| | [(CSUM CSUM 39239680), size 8] ... [(CSUM CSUM 40116224), size 4] | |-------------------------------------------------------------------| Leaf N + 1: slot = 0 slot = btrfs_header_nritems() - 1 |--------------------------------------------------------------------| | [(CSUM CSUM 40161280), size 32] ... [((CSUM CSUM 40615936), size 8 | |--------------------------------------------------------------------| Because we are at the last slot of leaf N, we call btrfs_next_leaf() to find the next highest key, which releases the current path and then searches for that next key. However after releasing the path and before finding that next key, the item at slot 0 of leaf N + 1 gets moved to leaf N, due to a call to ctree.c:push_leaf_left() (via ctree.c:split_leaf()), and therefore btrfs_next_leaf() will returns us a path again with leaf N but with the slot pointing to its new last key (CSUM CSUM 40161280). This new version of leaf N is then: slot = 0 slot = btrfs_header_nritems() - 2 slot = btrfs_header_nritems() - 1 |----------------------------------------------------------------------------------------------------| | [(CSUM CSUM 39239680), size 8] ... [(CSUM CSUM 40116224), size 4] [(CSUM CSUM 40161280), size 32] | |----------------------------------------------------------------------------------------------------| And incorrecly using slot 0, makes us set next_offset to 39239680 and we jump into the "insert:" label, which will set tmp to: tmp = min((sums->len - total_bytes) >> blocksize_bits, (next_offset - file_key.offset) >> blocksize_bits) = min((16384 - 0) >> 12, (39239680 - 40157184) >> 12) = min(4, (u64)-917504 = 18446744073708634112 >> 12) = 4 and ins_size = csum_size * tmp = 4 * 4 = 16 bytes. In other words, we insert a new csum item in the tree with key (CSUM_OBJECTID CSUM_KEY 40157184 = sums->bytenr) that contains the checksums for all the data (4 blocks of 4096 bytes each = sums->len). Which is wrong, because the item with key (CSUM CSUM 40161280) (the one that was moved from leaf N + 1 to the end of leaf N) contains the old checksums of the last 12288 bytes of our data and won't get those old checksums removed. So this leaves us 2 different checksums for 3 4kb blocks of data in the tree, and breaks the logical rule: Key_N+1.offset >= Key_N.offset + length_of_data_its_checksums_cover An obvious bad effect of this is that a subsequent csum tree lookup to get the checksum of any of the blocks with logical offset of 40161280, 40165376 or 40169472 (the last 3 4kb blocks of file data), will get the old checksums. Cc: stable@vger.kernel.org Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/file-item.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index f46cfe45d686..54c84daec9b5 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -756,7 +756,7 @@ again: found_next = 1; if (ret != 0) goto insert; - slot = 0; + slot = path->slots[0]; } btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || -- cgit v1.2.3 From 8d875f95da43c6a8f18f77869f2ef26e9594fecc Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 12 Aug 2014 10:47:42 -0700 Subject: btrfs: disable strict file flushes for renames and truncates Truncates and renames are often used to replace old versions of a file with new versions. Applications often expect this to be an atomic replacement, even if they haven't done anything to make sure the new version is fully on disk. Btrfs has strict flushing in place to make sure that renaming over an old file with a new file will fully flush out the new file before allowing the transaction commit with the rename to complete. This ordering means the commit code needs to be able to lock file pages, and there are a few paths in the filesystem where we will try to end a transaction with the page lock held. It's rare, but these things can deadlock. This patch removes the ordered flushes and switches to a best effort filemap_flush like ext4 uses. It's not perfect, but it should fix the deadlocks. Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 6 --- fs/btrfs/disk-io.c | 32 ------------- fs/btrfs/file.c | 26 +--------- fs/btrfs/inode.c | 47 ++---------------- fs/btrfs/ordered-data.c | 123 ------------------------------------------------ fs/btrfs/ordered-data.h | 5 -- fs/btrfs/transaction.c | 33 +------------ fs/btrfs/transaction.h | 1 - 8 files changed, 6 insertions(+), 267 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 4794923c410c..43527fd78825 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -84,12 +84,6 @@ struct btrfs_inode { */ struct list_head delalloc_inodes; - /* - * list for tracking inodes that must be sent to disk before a - * rename or truncate commit - */ - struct list_head ordered_operations; - /* node for the red-black tree that links inodes in subvolume root */ struct rb_node rb_node; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 08e65e9cf2aa..d0ed9e664f7d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -60,8 +60,6 @@ static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only); -static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, - struct btrfs_root *root); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_root *root); @@ -3829,34 +3827,6 @@ static void btrfs_error_commit_super(struct btrfs_root *root) btrfs_cleanup_transaction(root); } -static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, - struct btrfs_root *root) -{ - struct btrfs_inode *btrfs_inode; - struct list_head splice; - - INIT_LIST_HEAD(&splice); - - mutex_lock(&root->fs_info->ordered_operations_mutex); - spin_lock(&root->fs_info->ordered_root_lock); - - list_splice_init(&t->ordered_operations, &splice); - while (!list_empty(&splice)) { - btrfs_inode = list_entry(splice.next, struct btrfs_inode, - ordered_operations); - - list_del_init(&btrfs_inode->ordered_operations); - spin_unlock(&root->fs_info->ordered_root_lock); - - btrfs_invalidate_inodes(btrfs_inode->root); - - spin_lock(&root->fs_info->ordered_root_lock); - } - - spin_unlock(&root->fs_info->ordered_root_lock); - mutex_unlock(&root->fs_info->ordered_operations_mutex); -} - static void btrfs_destroy_ordered_extents(struct btrfs_root *root) { struct btrfs_ordered_extent *ordered; @@ -4093,8 +4063,6 @@ again: void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_root *root) { - btrfs_destroy_ordered_operations(cur_trans, root); - btrfs_destroy_delayed_refs(cur_trans, root); cur_trans->state = TRANS_STATE_COMMIT_START; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1f2b99cb55ea..d3afac292d67 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1838,33 +1838,9 @@ out: int btrfs_release_file(struct inode *inode, struct file *filp) { - /* - * ordered_data_close is set by settattr when we are about to truncate - * a file from a non-zero size to a zero size. This tries to - * flush down new bytes that may have been written if the - * application were using truncate to replace a file in place. - */ - if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, - &BTRFS_I(inode)->runtime_flags)) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - - /* - * We need to block on a committing transaction to keep us from - * throwing a ordered operation on to the list and causing - * something like sync to deadlock trying to flush out this - * inode. - */ - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode); - btrfs_end_transaction(trans, root); - if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) - filemap_flush(inode->i_mapping); - } if (filp->private_data) btrfs_ioctl_trans_end(filp); + filemap_flush(inode->i_mapping); return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8ea7610fbaf3..73098328d040 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7950,27 +7950,6 @@ static int btrfs_truncate(struct inode *inode) min_size); BUG_ON(ret); - /* - * setattr is responsible for setting the ordered_data_close flag, - * but that is only tested during the last file release. That - * could happen well after the next commit, leaving a great big - * window where new writes may get lost if someone chooses to write - * to this file after truncating to zero - * - * The inode doesn't have any dirty data here, and so if we commit - * this is a noop. If someone immediately starts writing to the inode - * it is very likely we'll catch some of their writes in this - * transaction, and the commit will find this file on the ordered - * data list with good things to send down. - * - * This is a best effort solution, there is still a window where - * using truncate to replace the contents of the file will - * end up with a zero length file after a crash. - */ - if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, - &BTRFS_I(inode)->runtime_flags)) - btrfs_add_ordered_operation(trans, root, inode); - /* * So if we truncate and then write and fsync we normally would just * write the extents that changed, which is a problem if we need to @@ -8118,7 +8097,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->delalloc_inodes); - INIT_LIST_HEAD(&ei->ordered_operations); RB_CLEAR_NODE(&ei->rb_node); return inode; @@ -8158,17 +8136,6 @@ void btrfs_destroy_inode(struct inode *inode) if (!root) goto free; - /* - * Make sure we're properly removed from the ordered operation - * lists. - */ - smp_mb(); - if (!list_empty(&BTRFS_I(inode)->ordered_operations)) { - spin_lock(&root->fs_info->ordered_root_lock); - list_del_init(&BTRFS_I(inode)->ordered_operations); - spin_unlock(&root->fs_info->ordered_root_lock); - } - if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags)) { btrfs_info(root->fs_info, "inode %llu still on the orphan list", @@ -8350,12 +8317,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, ret = 0; /* - * we're using rename to replace one file with another. - * and the replacement file is large. Start IO on it now so - * we don't add too much work to the end of the transaction + * we're using rename to replace one file with another. Start IO on it + * now so we don't add too much work to the end of the transaction */ - if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && - old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) + if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size) filemap_flush(old_inode->i_mapping); /* close the racy window with snapshot create/destroy ioctl */ @@ -8403,12 +8368,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, */ btrfs_pin_log_trans(root); } - /* - * make sure the inode gets flushed if it is replacing - * something. - */ - if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode)) - btrfs_add_ordered_operation(trans, root, old_inode); inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 7187b14faa6c..963895c1f801 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -571,18 +571,6 @@ void btrfs_remove_ordered_extent(struct inode *inode, trace_btrfs_ordered_extent_remove(inode, entry); - /* - * we have no more ordered extents for this inode and - * no dirty pages. We can safely remove it from the - * list of ordered extents - */ - if (RB_EMPTY_ROOT(&tree->tree) && - !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { - spin_lock(&root->fs_info->ordered_root_lock); - list_del_init(&BTRFS_I(inode)->ordered_operations); - spin_unlock(&root->fs_info->ordered_root_lock); - } - if (!root->nr_ordered_extents) { spin_lock(&root->fs_info->ordered_root_lock); BUG_ON(list_empty(&root->ordered_root)); @@ -686,81 +674,6 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) mutex_unlock(&fs_info->ordered_operations_mutex); } -/* - * this is used during transaction commit to write all the inodes - * added to the ordered operation list. These files must be fully on - * disk before the transaction commits. - * - * we have two modes here, one is to just start the IO via filemap_flush - * and the other is to wait for all the io. When we wait, we have an - * extra check to make sure the ordered operation list really is empty - * before we return - */ -int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int wait) -{ - struct btrfs_inode *btrfs_inode; - struct inode *inode; - struct btrfs_transaction *cur_trans = trans->transaction; - struct list_head splice; - struct list_head works; - struct btrfs_delalloc_work *work, *next; - int ret = 0; - - INIT_LIST_HEAD(&splice); - INIT_LIST_HEAD(&works); - - mutex_lock(&root->fs_info->ordered_extent_flush_mutex); - spin_lock(&root->fs_info->ordered_root_lock); - list_splice_init(&cur_trans->ordered_operations, &splice); - while (!list_empty(&splice)) { - btrfs_inode = list_entry(splice.next, struct btrfs_inode, - ordered_operations); - inode = &btrfs_inode->vfs_inode; - - list_del_init(&btrfs_inode->ordered_operations); - - /* - * the inode may be getting freed (in sys_unlink path). - */ - inode = igrab(inode); - if (!inode) - continue; - - if (!wait) - list_add_tail(&BTRFS_I(inode)->ordered_operations, - &cur_trans->ordered_operations); - spin_unlock(&root->fs_info->ordered_root_lock); - - work = btrfs_alloc_delalloc_work(inode, wait, 1); - if (!work) { - spin_lock(&root->fs_info->ordered_root_lock); - if (list_empty(&BTRFS_I(inode)->ordered_operations)) - list_add_tail(&btrfs_inode->ordered_operations, - &splice); - list_splice_tail(&splice, - &cur_trans->ordered_operations); - spin_unlock(&root->fs_info->ordered_root_lock); - ret = -ENOMEM; - goto out; - } - list_add_tail(&work->list, &works); - btrfs_queue_work(root->fs_info->flush_workers, - &work->work); - - cond_resched(); - spin_lock(&root->fs_info->ordered_root_lock); - } - spin_unlock(&root->fs_info->ordered_root_lock); -out: - list_for_each_entry_safe(work, next, &works, list) { - list_del_init(&work->list); - btrfs_wait_and_free_delalloc_work(work); - } - mutex_unlock(&root->fs_info->ordered_extent_flush_mutex); - return ret; -} - /* * Used to start IO or wait for a given ordered extent to finish. * @@ -1120,42 +1033,6 @@ out: return index; } - -/* - * add a given inode to the list of inodes that must be fully on - * disk before a transaction commit finishes. - * - * This basically gives us the ext3 style data=ordered mode, and it is mostly - * used to make sure renamed files are fully on disk. - * - * It is a noop if the inode is already fully on disk. - * - * If trans is not null, we'll do a friendly check for a transaction that - * is already flushing things and force the IO down ourselves. - */ -void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) -{ - struct btrfs_transaction *cur_trans = trans->transaction; - u64 last_mod; - - last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); - - /* - * if this file hasn't been changed since the last transaction - * commit, we can safely return without doing anything - */ - if (last_mod <= root->fs_info->last_trans_committed) - return; - - spin_lock(&root->fs_info->ordered_root_lock); - if (list_empty(&BTRFS_I(inode)->ordered_operations)) { - list_add_tail(&BTRFS_I(inode)->ordered_operations, - &cur_trans->ordered_operations); - } - spin_unlock(&root->fs_info->ordered_root_lock); -} - int __init ordered_data_init(void) { btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 246897058efb..d81a274d621e 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -190,11 +190,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum, int len); -int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int wait); -void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode); int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); void btrfs_get_logged_extents(struct inode *inode, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 5f379affdf23..d89c6d3542ca 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -218,7 +218,6 @@ loop: spin_lock_init(&cur_trans->delayed_refs.lock); INIT_LIST_HEAD(&cur_trans->pending_snapshots); - INIT_LIST_HEAD(&cur_trans->ordered_operations); INIT_LIST_HEAD(&cur_trans->pending_chunks); INIT_LIST_HEAD(&cur_trans->switch_commits); list_add_tail(&cur_trans->list, &fs_info->trans_list); @@ -1612,27 +1611,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); } -static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - int ret; - - ret = btrfs_run_delayed_items(trans, root); - if (ret) - return ret; - - /* - * rename don't use btrfs_join_transaction, so, once we - * set the transaction to blocked above, we aren't going - * to get any new ordered operations. We can safely run - * it here and no for sure that nothing new will be added - * to the list - */ - ret = btrfs_run_ordered_operations(trans, root, 1); - - return ret; -} - static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) @@ -1653,13 +1631,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_transaction *prev_trans = NULL; int ret; - ret = btrfs_run_ordered_operations(trans, root, 0); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - btrfs_end_transaction(trans, root); - return ret; - } - /* Stop the commit early if ->aborted is set */ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; @@ -1740,7 +1711,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, if (ret) goto cleanup_transaction; - ret = btrfs_flush_all_pending_stuffs(trans, root); + ret = btrfs_run_delayed_items(trans, root); if (ret) goto cleanup_transaction; @@ -1748,7 +1719,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, extwriter_counter_read(cur_trans) == 0); /* some pending stuffs might be added after the previous flush. */ - ret = btrfs_flush_all_pending_stuffs(trans, root); + ret = btrfs_run_delayed_items(trans, root); if (ret) goto cleanup_transaction; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 7dd558ed0716..579be51b27e5 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -55,7 +55,6 @@ struct btrfs_transaction { wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; struct list_head pending_snapshots; - struct list_head ordered_operations; struct list_head pending_chunks; struct list_head switch_commits; struct btrfs_delayed_ref_root delayed_refs; -- cgit v1.2.3 From d43cc79343dfabf9f168531d3f5cff313205c8fb Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 13 Aug 2014 17:16:29 -0500 Subject: Cleanup sparse file support by creating worker function for it Simply move code to new function (for clarity). Function sets or clears the sparse file attribute flag. Signed-off-by: Steve French Reviewed-by: David Disseldorp --- fs/cifs/smb2ops.c | 80 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 74634369c21e..85be34ad8d76 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -731,6 +731,52 @@ smb2_sync_write(const unsigned int xid, struct cifsFileInfo *cfile, return SMB2_write(xid, parms, written, iov, nr_segs); } +/* Set or clear the SPARSE_FILE attribute based on value passed in setsparse */ +static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile, struct inode *inode, __u8 setsparse) +{ + struct cifsInodeInfo *cifsi; + int rc; + + cifsi = CIFS_I(inode); + + /* if file already sparse don't bother setting sparse again */ + if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) && setsparse) + return true; /* already sparse */ + + if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) && !setsparse) + return true; /* already not sparse */ + + /* + * Can't check for sparse support on share the usual way via the + * FS attribute info (FILE_SUPPORTS_SPARSE_FILES) on the share + * since Samba server doesn't set the flag on the share, yet + * supports the set sparse FSCTL and returns sparse correctly + * in the file attributes. If we fail setting sparse though we + * mark that server does not support sparse files for this share + * to avoid repeatedly sending the unsupported fsctl to server + * if the file is repeatedly extended. + */ + if (tcon->broken_sparse_sup) + return false; + + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, FSCTL_SET_SPARSE, + true /* is_fctl */, &setsparse, 1, NULL, NULL); + if (rc) { + tcon->broken_sparse_sup = true; + cifs_dbg(FYI, "set sparse rc = %d\n", rc); + return false; + } + + if (setsparse) + cifsi->cifsAttrs |= FILE_ATTRIBUTE_SPARSE_FILE; + else + cifsi->cifsAttrs &= (~FILE_ATTRIBUTE_SPARSE_FILE); + + return true; +} + static int smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *cfile, __u64 size, bool set_alloc) @@ -745,40 +791,12 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, inode = cfile->dentry->d_inode; if (!set_alloc && (size > inode->i_size + 8192)) { - struct cifsInodeInfo *cifsi; __u8 set_sparse = 1; - int rc; - - cifsi = CIFS_I(inode); - - /* if file already sparse or no server support don't bother */ - if (cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) - goto smb2_set_eof; - - /* - * Can't check for sparse support on share the usual way via the - * FS attribute info (FILE_SUPPORTS_SPARSE_FILES) on the share - * since Samba server doesn't set the flag on the share, yet - * supports the set sparse FSCTL and returns sparse correctly - * in the file attributes. If we fail setting sparse though we - * mark that server does not support sparse files for this share - * to avoid repeatedly sending the unsupported fsctl to server - * if the file is repeatedly extended. - */ - if (tcon->broken_sparse_sup) - goto smb2_set_eof; - - rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, FSCTL_SET_SPARSE, - true /* is_fctl */, &set_sparse, 1, NULL, NULL); - if (rc) { - tcon->broken_sparse_sup = true; - cifs_dbg(FYI, "set sparse rc = %d\n", rc); - } else - cifsi->cifsAttrs |= FILE_ATTRIBUTE_SPARSE_FILE; + + /* whether set sparse succeeds or not, extend the file */ + smb2_set_sparse(xid, tcon, cfile, inode, set_sparse); } -smb2_set_eof: return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, cfile->pid, &eof, false); } -- cgit v1.2.3 From 024408062b21af7316221c420ff16bdaac478fa8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 9 Aug 2014 10:16:44 -0400 Subject: cifs: handle lease F_UNLCK requests properly Currently any F_UNLCK request for a lease just gets back -EAGAIN. Allow them to go immediately to generic_setlease instead. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 888398067420..0a4a4d7d407e 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -812,8 +812,9 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) if (!(S_ISREG(inode->i_mode))) return -EINVAL; - /* check if file is oplocked */ - if (((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) || + /* Check if file is oplocked if this is request for new lease */ + if (arg == F_UNLCK || + ((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) || ((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode)))) return generic_setlease(file, arg, lease); else if (tlink_tcon(cfile->tlink)->local_lease && -- cgit v1.2.3 From 754789a1c046106cfdb067102642f73e0fd35fb3 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 15 Aug 2014 23:49:01 -0500 Subject: [CIFS] Workaround MacOS server problem with SMB2.1 write response Writes fail to Mac servers with SMB2.1 mounts (works with cifs though) due to them sending an incorrect RFC1001 length for the SMB2.1 Write response. Workaround this problem. MacOS server sends a write response with 3 bytes of pad beyond the end of the SMB itself. The RFC1001 length is 3 bytes more than the sum of the SMB2.1 header length + the write reponse. Incorporate feedback from Jeff and JRA to allow servers to send a tcp frame that is even more than three bytes too long (ie much longer than the SMB2/SMB3 request that it contains) but we do log it once now. In the earlier version of the patch I had limited how far off the length field could be before we fail the request. Signed-off-by: Steve French --- fs/cifs/smb2misc.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index f2e6ac29a8d6..4aa7a0f07d6e 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -178,9 +178,24 @@ smb2_check_message(char *buf, unsigned int length) /* Windows 7 server returns 24 bytes more */ if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE) return 0; - /* server can return one byte more */ + /* server can return one byte more due to implied bcc[0] */ if (clc_len == 4 + len + 1) return 0; + + /* + * MacOS server pads after SMB2.1 write response with 3 bytes + * of junk. Other servers match RFC1001 len to actual + * SMB2/SMB3 frame length (header + smb2 response specific data) + * Log the server error (once), but allow it and continue + * since the frame is parseable. + */ + if (clc_len < 4 /* RFC1001 header size */ + len) { + printk_once(KERN_WARNING + "SMB2 server sent bad RFC1001 len %d not %d\n", + len, clc_len - 4); + return 0; + } + return 1; } return 0; -- cgit v1.2.3 From 18f39e7be0121317550d03e267e3ebd4dbfbb3ce Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 17 Aug 2014 00:22:24 -0500 Subject: [CIFS] Possible null ptr deref in SMB2_tcon As Raphael Geissert pointed out, tcon_error_exit can dereference tcon and there is one path in which tcon can be null. Signed-off-by: Steve French CC: Stable # v3.7+ Reported-by: Raphael Geissert --- fs/cifs/smb2pdu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 74440af59f35..240c627bc0c6 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -907,7 +907,8 @@ tcon_exit: tcon_error_exit: if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) { cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); - tcon->bad_network_name = true; + if (tcon) + tcon->bad_network_name = true; } goto tcon_exit; } -- cgit v1.2.3 From 52755808d4525f4d5b86d112d36ffc7a46f3fb48 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Mon, 18 Aug 2014 20:49:57 +0400 Subject: CIFS: Fix SMB2 readdir error handling SMB2 servers indicates the end of a directory search with STATUS_NO_MORE_FILE error code that is not processed now. This causes generic/257 xfstest to fail. Fix this by triggering the end of search by this error code in SMB2_query_directory. Also when negotiating CIFS protocol we tell the server to close the search automatically at the end and there is no need to do it itself. In the case of SMB2 protocol, we need to close it explicitly - separate close directory checks for different protocols. Cc: Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 2 ++ fs/cifs/file.c | 2 +- fs/cifs/readdir.c | 2 +- fs/cifs/smb1ops.c | 7 +++++++ fs/cifs/smb2maperror.c | 2 +- fs/cifs/smb2ops.c | 9 +++++++++ fs/cifs/smb2pdu.c | 9 ++++----- 7 files changed, 25 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index bc20a6ea6754..ce24c1fc2123 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -409,6 +409,8 @@ struct smb_version_operations { /* get mtu credits */ int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int, unsigned int *, unsigned int *); + /* check if we need to issue closedir */ + bool (*dir_needs_close)(struct cifsFileInfo *); }; struct smb_version_values { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 4ab2f79ffa7a..d5fec92e0360 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -762,7 +762,7 @@ int cifs_closedir(struct inode *inode, struct file *file) cifs_dbg(FYI, "Freeing private data in close dir\n"); spin_lock(&cifs_file_list_lock); - if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { + if (server->ops->dir_needs_close(cfile)) { cfile->invalidHandle = true; spin_unlock(&cifs_file_list_lock); if (server->ops->close_dir) diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index b15862e0f68c..798c80a41c88 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -593,7 +593,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, /* close and restart search */ cifs_dbg(FYI, "search backing up - close and restart search\n"); spin_lock(&cifs_file_list_lock); - if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { + if (server->ops->dir_needs_close(cfile)) { cfile->invalidHandle = true; spin_unlock(&cifs_file_list_lock); if (server->ops->close) diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 5e8c22d6c7b9..1a6df4b03f67 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -1015,6 +1015,12 @@ cifs_wp_retry_size(struct inode *inode) return CIFS_SB(inode->i_sb)->wsize; } +static bool +cifs_dir_needs_close(struct cifsFileInfo *cfile) +{ + return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle; +} + struct smb_version_operations smb1_operations = { .send_cancel = send_nt_cancel, .compare_fids = cifs_compare_fids, @@ -1086,6 +1092,7 @@ struct smb_version_operations smb1_operations = { .create_mf_symlink = cifs_create_mf_symlink, .is_read_op = cifs_is_read_op, .wp_retry_size = cifs_wp_retry_size, + .dir_needs_close = cifs_dir_needs_close, #ifdef CONFIG_CIFS_XATTR .query_all_EAs = CIFSSMBQAllEAs, .set_EA = CIFSSMBSetEA, diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index e31a9dfdcd39..a689514e260f 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -214,7 +214,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"}, {STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"}, {STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"}, - {STATUS_NO_MORE_FILES, -EIO, "STATUS_NO_MORE_FILES"}, + {STATUS_NO_MORE_FILES, -ENODATA, "STATUS_NO_MORE_FILES"}, {STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"}, {STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"}, {STATUS_NO_INHERITANCE, -EIO, "STATUS_NO_INHERITANCE"}, diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 85be34ad8d76..3fcd410cee31 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1222,6 +1222,12 @@ smb2_wp_retry_size(struct inode *inode) SMB2_MAX_BUFFER_SIZE); } +static bool +smb2_dir_needs_close(struct cifsFileInfo *cfile) +{ + return !cfile->invalidHandle; +} + struct smb_version_operations smb20_operations = { .compare_fids = smb2_compare_fids, .setup_request = smb2_setup_request, @@ -1297,6 +1303,7 @@ struct smb_version_operations smb20_operations = { .parse_lease_buf = smb2_parse_lease_buf, .clone_range = smb2_clone_range, .wp_retry_size = smb2_wp_retry_size, + .dir_needs_close = smb2_dir_needs_close, }; struct smb_version_operations smb21_operations = { @@ -1374,6 +1381,7 @@ struct smb_version_operations smb21_operations = { .parse_lease_buf = smb2_parse_lease_buf, .clone_range = smb2_clone_range, .wp_retry_size = smb2_wp_retry_size, + .dir_needs_close = smb2_dir_needs_close, }; struct smb_version_operations smb30_operations = { @@ -1454,6 +1462,7 @@ struct smb_version_operations smb30_operations = { .clone_range = smb2_clone_range, .validate_negotiate = smb3_validate_negotiate, .wp_retry_size = smb2_wp_retry_size, + .dir_needs_close = smb2_dir_needs_close, }; struct smb_version_values smb20_values = { diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 240c627bc0c6..fa0dd044213b 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2180,6 +2180,10 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base; if (rc) { + if (rc == -ENODATA && rsp->hdr.Status == STATUS_NO_MORE_FILES) { + srch_inf->endOfSearch = true; + rc = 0; + } cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); goto qdir_exit; } @@ -2217,11 +2221,6 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, else cifs_dbg(VFS, "illegal search buffer type\n"); - if (rsp->hdr.Status == STATUS_NO_MORE_FILES) - srch_inf->endOfSearch = 1; - else - srch_inf->endOfSearch = 0; - return rc; qdir_exit: -- cgit v1.2.3 From b46799a8f28c43c5264ac8d8ffa28b311b557e03 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Mon, 18 Aug 2014 20:49:58 +0400 Subject: CIFS: Fix wrong directory attributes after rename When we requests rename we also need to update attributes of both source and target parent directories. Not doing it causes generic/309 xfstest to fail on SMB2 mounts. Fix this by marking these directories for force revalidating. Cc: Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 41de3935caa0..753e7a3486de 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1717,6 +1717,12 @@ unlink_target: target_dentry, to_name); } + /* force revalidate to go get info when needed */ + CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0; + + source_dir->i_ctime = source_dir->i_mtime = target_dir->i_ctime = + target_dir->i_mtime = current_fs_time(source_dir->i_sb); + cifs_rename_exit: kfree(info_buf_source); kfree(from_name); -- cgit v1.2.3 From ad3829cf1db5cf6a5dfafd54f9291b44f5fb1da8 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 17 Aug 2014 06:43:58 -0500 Subject: Incorrect error returned on setting file compressed on SMB2 When the server (for an SMB2 or SMB3 mount) doesn't support an ioctl (such as setting the compressed flag on a file) we were incorrectly returning EIO instead of EOPNOTSUPP, this is confusing e.g. doing chattr +c to a file on a non-btrfs Samba partition, now the error returned is more intuitive to the user. Also fixes error mapping on setting hardlink to servers which don't support that. Signed-off-by: Steve French Reviewed-by: David Disseldorp --- fs/cifs/smb2maperror.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index a689514e260f..af59d03db492 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -298,7 +298,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = { {STATUS_INVALID_PARAMETER, -EINVAL, "STATUS_INVALID_PARAMETER"}, {STATUS_NO_SUCH_DEVICE, -ENODEV, "STATUS_NO_SUCH_DEVICE"}, {STATUS_NO_SUCH_FILE, -ENOENT, "STATUS_NO_SUCH_FILE"}, - {STATUS_INVALID_DEVICE_REQUEST, -EIO, "STATUS_INVALID_DEVICE_REQUEST"}, + {STATUS_INVALID_DEVICE_REQUEST, -EOPNOTSUPP, "STATUS_INVALID_DEVICE_REQUEST"}, {STATUS_END_OF_FILE, -ENODATA, "STATUS_END_OF_FILE"}, {STATUS_WRONG_VOLUME, -EIO, "STATUS_WRONG_VOLUME"}, {STATUS_NO_MEDIA_IN_DEVICE, -EIO, "STATUS_NO_MEDIA_IN_DEVICE"}, -- cgit v1.2.3 From 31742c5a331766bc7df6b0d525df00c6cd20d5a6 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 17 Aug 2014 08:38:47 -0500 Subject: enable fallocate punch hole ("fallocate -p") for SMB3 Implement FALLOC_FL_PUNCH_HOLE (which does not change the file size fortunately so this matches the behavior of the equivalent SMB3 fsctl call) for SMB3 mounts. This allows "fallocate -p" to work. It requires that the server support setting files as sparse (which Windows allows). Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 19 +++++++++++++++++++ fs/cifs/cifsglob.h | 2 ++ fs/cifs/smb2ops.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ fs/cifs/smb2pdu.h | 6 ++++++ fs/cifs/smbfsctl.h | 2 +- 5 files changed, 73 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 0a4a4d7d407e..155347e190f9 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -207,6 +207,19 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len) +{ + struct super_block *sb = file->f_path.dentry->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct TCP_Server_Info *server = tcon->ses->server; + + if (server->ops->fallocate) + return server->ops->fallocate(file, tcon, mode, off, len); + + return -EOPNOTSUPP; +} + static int cifs_permission(struct inode *inode, int mask) { struct cifs_sb_info *cifs_sb; @@ -909,6 +922,7 @@ const struct file_operations cifs_file_ops = { .unlocked_ioctl = cifs_ioctl, #endif /* CONFIG_CIFS_POSIX */ .setlease = cifs_setlease, + .fallocate = cifs_fallocate, }; const struct file_operations cifs_file_strict_ops = { @@ -928,6 +942,7 @@ const struct file_operations cifs_file_strict_ops = { .unlocked_ioctl = cifs_ioctl, #endif /* CONFIG_CIFS_POSIX */ .setlease = cifs_setlease, + .fallocate = cifs_fallocate, }; const struct file_operations cifs_file_direct_ops = { @@ -948,6 +963,7 @@ const struct file_operations cifs_file_direct_ops = { #endif /* CONFIG_CIFS_POSIX */ .llseek = cifs_llseek, .setlease = cifs_setlease, + .fallocate = cifs_fallocate, }; const struct file_operations cifs_file_nobrl_ops = { @@ -966,6 +982,7 @@ const struct file_operations cifs_file_nobrl_ops = { .unlocked_ioctl = cifs_ioctl, #endif /* CONFIG_CIFS_POSIX */ .setlease = cifs_setlease, + .fallocate = cifs_fallocate, }; const struct file_operations cifs_file_strict_nobrl_ops = { @@ -984,6 +1001,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = { .unlocked_ioctl = cifs_ioctl, #endif /* CONFIG_CIFS_POSIX */ .setlease = cifs_setlease, + .fallocate = cifs_fallocate, }; const struct file_operations cifs_file_direct_nobrl_ops = { @@ -1003,6 +1021,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { #endif /* CONFIG_CIFS_POSIX */ .llseek = cifs_llseek, .setlease = cifs_setlease, + .fallocate = cifs_fallocate, }; const struct file_operations cifs_dir_ops = { diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index ce24c1fc2123..dfc731b02aa9 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -411,6 +411,8 @@ struct smb_version_operations { unsigned int *, unsigned int *); /* check if we need to issue closedir */ bool (*dir_needs_close)(struct cifsFileInfo *); + long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t, + loff_t); }; struct smb_version_values { diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 3fcd410cee31..101670c2c374 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1015,6 +1015,50 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, return rc; } +static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, + loff_t offset, loff_t len) +{ + struct inode *inode; + struct cifsInodeInfo *cifsi; + struct cifsFileInfo *cfile = file->private_data; + struct file_zero_data_information fsctl_buf; + long rc; + unsigned int xid; + __u8 set_sparse = 1; + + xid = get_xid(); + + inode = cfile->dentry->d_inode; + cifsi = CIFS_I(inode); + + /* Need to make file sparse, if not already, before freeing range. */ + /* Consider adding equivalent for compressed since it could also work */ + if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse)) + return -EOPNOTSUPP; + + cifs_dbg(FYI, "offset %lld len %lld", offset, len); + + fsctl_buf.FileOffset = cpu_to_le64(offset); + fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len); + + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, + true /* is_fctl */, (char *)&fsctl_buf, + sizeof(struct file_zero_data_information), NULL, NULL); + free_xid(xid); + return rc; +} + +static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode, + loff_t off, loff_t len) +{ + /* KEEP_SIZE already checked for by do_fallocate */ + if (mode & FALLOC_FL_PUNCH_HOLE) + return smb3_punch_hole(file, tcon, off, len); + + return -EOPNOTSUPP; +} + static void smb2_downgrade_oplock(struct TCP_Server_Info *server, struct cifsInodeInfo *cinode, bool set_level2) @@ -1463,6 +1507,7 @@ struct smb_version_operations smb30_operations = { .validate_negotiate = smb3_validate_negotiate, .wp_retry_size = smb2_wp_retry_size, .dir_needs_close = smb2_dir_needs_close, + .fallocate = smb3_fallocate, }; struct smb_version_values smb20_values = { diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 69f3595d3952..fbe486c285a9 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -573,6 +573,12 @@ struct copychunk_ioctl { __u32 Reserved2; } __packed; +/* this goes in the ioctl buffer when doing FSCTL_SET_ZERO_DATA */ +struct file_zero_data_information { + __le64 FileOffset; + __le64 BeyondFinalZero; +} __packed; + struct copychunk_ioctl_rsp { __le32 ChunksWritten; __le32 ChunkBytesWritten; diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index 0e538b5c9622..83efa59535be 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -63,7 +63,7 @@ #define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */ #define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */ #define FSCTL_SET_SPARSE 0x000900C4 /* BB add struct */ -#define FSCTL_SET_ZERO_DATA 0x000900C8 /* BB add struct */ +#define FSCTL_SET_ZERO_DATA 0x000980C8 #define FSCTL_SET_ENCRYPTION 0x000900D7 /* BB add struct */ #define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB /* BB add struct */ #define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF /* BB add struct */ -- cgit v1.2.3 From 30175628bf7f521e9ee31ac98fa6d6fe7441a556 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 17 Aug 2014 18:16:40 -0500 Subject: [SMB3] Enable fallocate -z support for SMB3 mounts fallocate -z (FALLOC_FL_ZERO_RANGE) can map to SMB3 FSCTL_SET_ZERO_DATA SMB3 FSCTL but FALLOC_FL_ZERO_RANGE when called without the FALLOC_FL_KEEPSIZE flag set could want the file size changed so we can not support that subcase unless the file is cached (and thus we know the file size). Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/smb2ops.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 101670c2c374..5a48aa290dfe 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1015,6 +1015,56 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, return rc; } +static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, + loff_t offset, loff_t len, bool keep_size) +{ + struct inode *inode; + struct cifsInodeInfo *cifsi; + struct cifsFileInfo *cfile = file->private_data; + struct file_zero_data_information fsctl_buf; + long rc; + unsigned int xid; + + xid = get_xid(); + + inode = cfile->dentry->d_inode; + cifsi = CIFS_I(inode); + + /* if file not oplocked can't be sure whether asking to extend size */ + if (!CIFS_CACHE_READ(cifsi)) + if (keep_size == false) + return -EOPNOTSUPP; + + /* + * Must check if file sparse since fallocate -z (zero range) assumes + * non-sparse allocation + */ + if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) + return -EOPNOTSUPP; + + /* + * need to make sure we are not asked to extend the file since the SMB3 + * fsctl does not change the file size. In the future we could change + * this to zero the first part of the range then set the file size + * which for a non sparse file would zero the newly extended range + */ + if (keep_size == false) + if (i_size_read(inode) < offset + len) + return -EOPNOTSUPP; + + cifs_dbg(FYI, "offset %lld len %lld", offset, len); + + fsctl_buf.FileOffset = cpu_to_le64(offset); + fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len); + + rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, + true /* is_fctl */, (char *)&fsctl_buf, + sizeof(struct file_zero_data_information), NULL, NULL); + free_xid(xid); + return rc; +} + static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, loff_t offset, loff_t len) { @@ -1055,6 +1105,11 @@ static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode, /* KEEP_SIZE already checked for by do_fallocate */ if (mode & FALLOC_FL_PUNCH_HOLE) return smb3_punch_hole(file, tcon, off, len); + else if (mode & FALLOC_FL_ZERO_RANGE) { + if (mode & FALLOC_FL_KEEP_SIZE) + return smb3_zero_range(file, tcon, off, len, true); + return smb3_zero_range(file, tcon, off, len, false); + } return -EOPNOTSUPP; } -- cgit v1.2.3 From 7a5c3c9be1059feed0e470c6dc0994dcaed4f12c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 17 Jun 2014 18:58:59 +0800 Subject: Btrfs: fix put dio bio twice when we submit dio bio fail The caller of btrfs_submit_direct_hook() will put the original dio bio when btrfs_submit_direct_hook() return a error number, so we needn't put the original bio in btrfs_submit_direct_hook(). Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 73098328d040..33c05188cbf0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7306,10 +7306,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, &map_length, NULL, 0); - if (ret) { - bio_put(orig_bio); + if (ret) return -EIO; - } if (map_length >= orig_bio->bi_iter.bi_size) { bio = orig_bio; @@ -7326,6 +7324,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); if (!bio) return -ENOMEM; + bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; atomic_inc(&dip->pending_bios); -- cgit v1.2.3 From 1707e26d6ab05c477a91d260e31fda7c6c38588e Mon Sep 17 00:00:00 2001 From: chandan Date: Tue, 1 Jul 2014 12:04:28 +0530 Subject: Btrfs: fill_holes: Fix slot number passed to hole_mergeable() call. For a non-existent key, btrfs_search_slot() sets path->slots[0] to the slot where the key could have been present, which in this case would be the slot containing the extent item which would be the next neighbor of the file range being punched. The current code passes an incremented path->slots[0] and we skip to the wrong file extent item. This would mean that we would fail to merge the "yet to be created" hole with the next neighboring hole (if one exists). Fix this. Signed-off-by: Chandan Rajendra Reviewed-by: Wang Shilong Signed-off-by: Chris Mason --- fs/btrfs/file.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d3afac292d67..77e33534c7d6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2088,10 +2088,9 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, goto out; } - if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) { + if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) { u64 num_bytes; - path->slots[0]++; key.offset = offset; btrfs_set_item_key_safe(root, path, &key); fi = btrfs_item_ptr(leaf, path->slots[0], -- cgit v1.2.3 From b96de000bc8bc9688b3a2abea4332bd57648a49f Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 3 Jul 2014 18:22:05 +0800 Subject: Btrfs: device_list_add() should not update list when mounted device_list_add() is called when user runs btrfs dev scan, which would add any btrfs device into the btrfs_fs_devices list. Now think of a mounted btrfs. And a new device which contains the a SB from the mounted btrfs devices. In this situation when user runs btrfs dev scan, the current code would just replace existing device with the new device. Which is to note that old device is neither closed nor gracefully removed from the btrfs. The FS is still operational with the old bdev however the device name is the btrfs_device is new which is provided by the btrfs dev scan. reproducer: devmgt[1] detach /dev/sdc replace the missing disk /dev/sdc btrfs rep start -f 1 /dev/sde /btrfs Label: none uuid: 5dc0aaf4-4683-4050-b2d6-5ebe5f5cd120 Total devices 2 FS bytes used 32.00KiB devid 1 size 958.94MiB used 115.88MiB path /dev/sde devid 2 size 958.94MiB used 103.88MiB path /dev/sdd make /dev/sdc to reappear devmgt attach host2 btrfs dev scan btrfs fi show -m Label: none uuid: 5dc0aaf4-4683-4050-b2d6-5ebe5f5cd120^M Total devices 2 FS bytes used 32.00KiB^M devid 1 size 958.94MiB used 115.88MiB path /dev/sdc <- Wrong. devid 2 size 958.94MiB used 103.88MiB path /dev/sdd since /dev/sdc has been replaced with /dev/sde, the /dev/sdc shouldn't be part of the btrfs-fsid when it reappears. If user want it to be part of it then sys admin should be using btrfs device add instead. [1] github.com/anajain/devmgt.git Signed-off-by: Anand Jain Signed-off-by: Wang Shilong Signed-off-by: Miao Xie Reviewed-by: Satoru Takeuchi Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6cb82f62cb7c..7c538f65214b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -508,6 +508,33 @@ static noinline int device_list_add(const char *path, ret = 1; device->fs_devices = fs_devices; } else if (!device->name || strcmp(device->name->str, path)) { + /* + * When FS is already mounted. + * 1. If you are here and if the device->name is NULL that + * means this device was missing at time of FS mount. + * 2. If you are here and if the device->name is different + * from 'path' that means either + * a. The same device disappeared and reappeared with + * different name. or + * b. The missing-disk-which-was-replaced, has + * reappeared now. + * + * We must allow 1 and 2a above. But 2b would be a spurious + * and unintentional. + * + * Further in case of 1 and 2a above, the disk at 'path' + * would have missed some transaction when it was away and + * in case of 2a the stale bdev has to be updated as well. + * 2b must not be allowed at all time. + */ + + /* + * As of now don't allow update to btrfs_fs_device through + * the btrfs dev scan cli, after FS has been mounted. + */ + if (fs_devices->opened) + return -EBUSY; + name = rcu_string_strdup(path, GFP_NOFS); if (!name) return -ENOMEM; -- cgit v1.2.3 From 77bdae4d136e167bab028cbec58b988f91cf73c0 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 3 Jul 2014 18:22:06 +0800 Subject: btrfs: check generation as replace duplicates devid+uuid When FS in unmounted we need to check generation number as well since devid+uuid combination could match with the missing replaced disk when it reappears, and without this patch it might pair with the replaced disk again. device_list_add() function is called in the following threads, mount device option mount argument ioctl BTRFS_IOC_SCAN_DEV (btrfs dev scan) ioctl BTRFS_IOC_DEVICES_READY (btrfs dev ready ) they have been unit tested to work fine with this patch. If the user knows what he is doing and really want to pair with replaced disk (which is not a standard operation), then he should first clear the kernel btrfs device list in the memory by doing the module unload/load and followed with the mount -o device option. Signed-off-by: Anand Jain Signed-off-by: Wang Shilong Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7c538f65214b..5700ab03e84b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -532,8 +532,19 @@ static noinline int device_list_add(const char *path, * As of now don't allow update to btrfs_fs_device through * the btrfs dev scan cli, after FS has been mounted. */ - if (fs_devices->opened) + if (fs_devices->opened) { return -EBUSY; + } else { + /* + * That is if the FS is _not_ mounted and if you + * are here, that means there is more than one + * disk with same uuid and devid.We keep the one + * with larger generation number or the last-in if + * generation are equal. + */ + if (found_transid < device->generation) + return -EEXIST; + } name = rcu_string_strdup(path, GFP_NOFS); if (!name) @@ -546,6 +557,15 @@ static noinline int device_list_add(const char *path, } } + /* + * Unmount does not free the btrfs_device struct but would zero + * generation along with most of the other members. So just update + * it back. We need it to pick the disk with largest generation + * (as above). + */ + if (!fs_devices->opened) + device->generation = found_transid; + if (found_transid > fs_devices->latest_trans) { fs_devices->latest_devid = devid; fs_devices->latest_trans = found_transid; -- cgit v1.2.3 From 69611ac810af8e88eeb5ebd851589a0e8cc90860 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 3 Jul 2014 18:22:12 +0800 Subject: Btrfs: fix unzeroed members in fs_devices when creating a fs from seed fs We forgot to zero some members in fs_devices when we create new fs_devices from the one of the seed fs. It would cause the problem that we got wrong chunk profile when allocating chunks. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5700ab03e84b..50edbbc2e76c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1988,6 +1988,9 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) fs_devices->seeding = 0; fs_devices->num_devices = 0; fs_devices->open_devices = 0; + fs_devices->missing_devices = 0; + fs_devices->num_can_discard = 0; + fs_devices->rotating = 0; fs_devices->seed = seed_devices; generate_random_uuid(fs_devices->fsid); -- cgit v1.2.3 From 3a7d55c84c76a3088ddfb798c187182143683a16 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 16 Jul 2014 18:38:01 +0800 Subject: Btrfs: fix wrong missing device counter decrease The missing devices are accounted by its own fs device, for example the missing devices in seed filesystem will be accounted by the fs device of the seed filesystem, not by the new filesystem which is based on the seed filesystem, so when we remove the missing device in the seed filesystem, we should decrease the counter of its own fs device. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 50edbbc2e76c..da0e632a21fc 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1718,7 +1718,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) device->fs_devices->total_devices--; if (device->missing) - root->fs_info->fs_devices->missing_devices--; + device->fs_devices->missing_devices--; next_device = list_entry(root->fs_info->fs_devices->devices.next, struct btrfs_device, dev_list); -- cgit v1.2.3 From 9a025a0860ccc0f02af153c966bc1f83e5d9fc62 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Thu, 17 Jul 2014 11:44:13 +0800 Subject: Btrfs: fix wrong write range for filemap_fdatawrite_range() filemap_fdatawrite_range() expect the third arg to be @end not @len, fix it. Signed-off-by: Wang Shilong Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 33c05188cbf0..73fadc7ead0e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7533,7 +7533,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, count = iov_iter_count(iter); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags)) - filemap_fdatawrite_range(inode->i_mapping, offset, count); + filemap_fdatawrite_range(inode->i_mapping, offset, + offset + count - 1); if (rw & WRITE) { /* -- cgit v1.2.3 From e2eca69dc6c09d968d69312b9899968a9b03a4a9 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Thu, 17 Jul 2014 11:44:14 +0800 Subject: Btrfs: fix wrong extent mapping for DirectIO btrfs_next_leaf() will use current leaf's last key to search and then return a bigger one. So it may still return a file extent item that is smaller than expected value and we will get an overflow here for @em->len. This is easy to reproduce for Btrfs Direct writting, it did not cause any problem, because writting will re-insert right mapping later. However, by hacking code to make DIO support compression, wrong extent mapping is kept and it encounter merging failure(EEXIST) quickly. Fix this problem by looping to find next file extent item that is bigger than @start or we could not find anything more. Signed-off-by: Wang Shilong Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 73fadc7ead0e..a3c6e76f5a4e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6275,6 +6275,8 @@ next: goto not_found; if (start + len <= found_key.offset) goto not_found; + if (start > found_key.offset) + goto next; em->start = start; em->orig_start = start; em->len = found_key.offset - start; -- cgit v1.2.3 From 2c91943b5066314a8bb9f0a65584e5e4cd92ea63 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 18 Jul 2014 09:55:43 +0800 Subject: btrfs: Return right extent when fiemap gives unaligned offset and len. When page aligned start and len passed to extent_fiemap(), the result is good, but when start and len is not aligned, e.g. start = 1 and len = 4095 is passed to extent_fiemap(), it returns no extent. The problem is that start and len is all rounded down which causes the problem. This patch will round down start and round up (start + len) to return right extent. Reported-by: Chandan Rajendra Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a389820d158b..1c70cff4a9e1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4213,8 +4213,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; - start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); - len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); + start = round_down(start, BTRFS_I(inode)->root->sectorsize); + len = round_up(max, BTRFS_I(inode)->root->sectorsize) - start; /* * lookup the last file extent. We're not using i_size here -- cgit v1.2.3 From ff61d17c6324d1b483fbbc5144f09668c24ff60c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 24 Jul 2014 11:37:06 +0800 Subject: Btrfs: Fix the problem that the replace destroys the seed filesystem The seed filesystem was destroyed by the device replace, the reproduce method is: # mkfs.btrfs -f # btrfstune -S 1 # mount # btrfs device add # umount # mount # btrfs replace start -f # umount # mount It is because we erase the super block on the seed device. It is wrong, we should not change anything on the seed device. Signed-off-by: Miao Xie Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index da0e632a21fc..00c8efdcd1e5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1848,8 +1848,12 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, if (srcdev->bdev) { fs_info->fs_devices->open_devices--; - /* zero out the old super */ - btrfs_scratch_superblock(srcdev); + /* + * zero out the old super if it is not writable + * (e.g. seed device) + */ + if (srcdev->writeable) + btrfs_scratch_superblock(srcdev); } call_rcu(&srcdev->rcu, free_device); -- cgit v1.2.3 From 5d68da3b8ee6eb2257aa4b8d885581782278ae93 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 24 Jul 2014 11:37:07 +0800 Subject: Btrfs: don't write any data into a readonly device when scrub We should not write data into a readonly device especially seed device when doing scrub, skip those devices. Signed-off-by: Miao Xie Reviewed-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b6d198f5181e..23d3f6e6a482 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2904,6 +2904,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, struct scrub_ctx *sctx; int ret; struct btrfs_device *dev; + struct rcu_string *name; if (btrfs_fs_closing(fs_info)) return -EINVAL; @@ -2965,6 +2966,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return -ENODEV; } + if (!is_dev_replace && !readonly && !dev->writeable) { + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + rcu_read_lock(); + name = rcu_dereference(dev->name); + btrfs_err(fs_info, "scrub: device %s is not writable", + name->str); + rcu_read_unlock(); + return -EROFS; + } + mutex_lock(&fs_info->scrub_lock); if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { mutex_unlock(&fs_info->scrub_lock); -- cgit v1.2.3 From 7df69d3e94d6de537fd1afb574c760d8dc83ab60 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 24 Jul 2014 11:37:13 +0800 Subject: Btrfs: Fix wrong device size when we are resizing the device total_bytes of device is just a in-memory variant which is used to record the size of the device, and it might be changed before we resize a device, if the resize operation fails, it will be fallbacked. But some code used it to update on-disk metadata of the device, it would cause the problem that on-disk metadata of the devices was not consistent. We should use the other variant named disk_total_bytes to update the on-disk metadata of device, because that variant is updated only when the resize operation is successful. Fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 3 ++- fs/btrfs/volumes.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d0ed9e664f7d..c99a414813c1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3450,7 +3450,8 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) btrfs_set_stack_device_generation(dev_item, 0); btrfs_set_stack_device_type(dev_item, dev->type); btrfs_set_stack_device_id(dev_item, dev->devid); - btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes); + btrfs_set_stack_device_total_bytes(dev_item, + dev->disk_total_bytes); btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); btrfs_set_stack_device_io_align(dev_item, dev->io_align); btrfs_set_stack_device_io_width(dev_item, dev->io_width); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 00c8efdcd1e5..9d4ce53d7569 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1483,7 +1483,7 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans, btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); - btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); btrfs_set_device_group(leaf, dev_item, 0); btrfs_set_device_seek_speed(leaf, dev_item, 0); -- cgit v1.2.3 From 95669976bd7d30ae265db938ecb46a6b7f8cb893 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 24 Jul 2014 11:37:14 +0800 Subject: Btrfs: don't consider the missing device when allocating new chunks The original code allocated new chunks by the number of the writable devices and missing devices to make sure that any RAID levels on a degraded FS continue to be honored, but it introduced a problem that it stopped us to allocating new chunks, the steps to reproduce is following: # mkfs.btrfs -m raid1 -d raid1 -f # mkfs.btrfs -f //Removing from the original fs # mount -o degraded # dd if=/dev/null of=/tmpfile bs=1M It is because we allocate new chunks only on the writable devices, if we take the number of missing devices into account, and want to allocate new chunks with higher RAID level, we will fail becaue we don't have enough writable device. Fix it by ignoring the number of missing devices when allocating new chunks. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 102ed3143976..5524434da059 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3586,13 +3586,7 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) */ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { - /* - * we add in the count of missing devices because we want - * to make sure that any RAID levels on a degraded FS - * continue to be honored. - */ - u64 num_devices = root->fs_info->fs_devices->rw_devices + - root->fs_info->fs_devices->missing_devices; + u64 num_devices = root->fs_info->fs_devices->rw_devices; u64 target; u64 tmp; @@ -8440,13 +8434,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) if (stripped) return extended_to_chunk(stripped); - /* - * we add in the count of missing devices because we want - * to make sure that any RAID levels on a degraded FS - * continue to be honored. - */ - num_devices = root->fs_info->fs_devices->rw_devices + - root->fs_info->fs_devices->missing_devices; + num_devices = root->fs_info->fs_devices->rw_devices; stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | -- cgit v1.2.3 From 85cd083b498572fb9fa575cce3ed910c8ee84294 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 9 Aug 2014 09:49:31 +0800 Subject: udf: avoid unneeded up_write when fail to add entry in ->symlink We have released the ->i_data_sem before invoking udf_add_entry(), so in following error path, we should not release this lock again. Signed-off-by: Chao Yu Signed-off-by: Jan Kara --- fs/udf/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 9737cba1357d..83a06001742b 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1014,7 +1014,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) - goto out_no_entry; + goto out_fail; cfi.icb.extLength = cpu_to_le32(sb->s_blocksize); cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); if (UDF_SB(inode->i_sb)->s_lvid_bh) { @@ -1036,6 +1036,7 @@ out: out_no_entry: up_write(&iinfo->i_data_sem); +out_fail: inode_dec_link_count(inode); iput(inode); goto out; -- cgit v1.2.3 From 410dd3cf4c9b36f27ed4542ee18b1af5e68645a4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 17 Aug 2014 11:49:57 +0200 Subject: isofs: Fix unbounded recursion when processing relocated directories We did not check relocated directory in any way when processing Rock Ridge 'CL' tag. Thus a corrupted isofs image can possibly have a CL entry pointing to another CL entry leading to possibly unbounded recursion in kernel code and thus stack overflow or deadlocks (if there is a loop created from CL entries). Fix the problem by not allowing CL entry to point to a directory entry with CL entry (such use makes no good sense anyway) and by checking whether CL entry doesn't point to itself. CC: stable@vger.kernel.org Reported-by: Chris Evans Signed-off-by: Jan Kara --- fs/isofs/inode.c | 15 ++++++++------- fs/isofs/isofs.h | 23 +++++++++++++++++++---- fs/isofs/rock.c | 39 ++++++++++++++++++++++++++++----------- 3 files changed, 55 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 4556ce1af5b0..5ddaf8625d3b 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -61,7 +61,7 @@ static void isofs_put_super(struct super_block *sb) return; } -static int isofs_read_inode(struct inode *); +static int isofs_read_inode(struct inode *, int relocated); static int isofs_statfs (struct dentry *, struct kstatfs *); static struct kmem_cache *isofs_inode_cachep; @@ -1259,7 +1259,7 @@ out_toomany: goto out; } -static int isofs_read_inode(struct inode *inode) +static int isofs_read_inode(struct inode *inode, int relocated) { struct super_block *sb = inode->i_sb; struct isofs_sb_info *sbi = ISOFS_SB(sb); @@ -1404,7 +1404,7 @@ static int isofs_read_inode(struct inode *inode) */ if (!high_sierra) { - parse_rock_ridge_inode(de, inode); + parse_rock_ridge_inode(de, inode, relocated); /* if we want uid/gid set, override the rock ridge setting */ if (sbi->s_uid_set) inode->i_uid = sbi->s_uid; @@ -1483,9 +1483,10 @@ static int isofs_iget5_set(struct inode *ino, void *data) * offset that point to the underlying meta-data for the inode. The * code below is otherwise similar to the iget() code in * include/linux/fs.h */ -struct inode *isofs_iget(struct super_block *sb, - unsigned long block, - unsigned long offset) +struct inode *__isofs_iget(struct super_block *sb, + unsigned long block, + unsigned long offset, + int relocated) { unsigned long hashval; struct inode *inode; @@ -1507,7 +1508,7 @@ struct inode *isofs_iget(struct super_block *sb, return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { - ret = isofs_read_inode(inode); + ret = isofs_read_inode(inode, relocated); if (ret < 0) { iget_failed(inode); inode = ERR_PTR(ret); diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index 99167238518d..0ac4c1f73fbd 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -107,7 +107,7 @@ extern int iso_date(char *, int); struct inode; /* To make gcc happy */ -extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *); +extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *, int relocated); extern int get_rock_ridge_filename(struct iso_directory_record *, char *, struct inode *); extern int isofs_name_translate(struct iso_directory_record *, char *, struct inode *); @@ -118,9 +118,24 @@ extern struct dentry *isofs_lookup(struct inode *, struct dentry *, unsigned int extern struct buffer_head *isofs_bread(struct inode *, sector_t); extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **, unsigned long); -extern struct inode *isofs_iget(struct super_block *sb, - unsigned long block, - unsigned long offset); +struct inode *__isofs_iget(struct super_block *sb, + unsigned long block, + unsigned long offset, + int relocated); + +static inline struct inode *isofs_iget(struct super_block *sb, + unsigned long block, + unsigned long offset) +{ + return __isofs_iget(sb, block, offset, 0); +} + +static inline struct inode *isofs_iget_reloc(struct super_block *sb, + unsigned long block, + unsigned long offset) +{ + return __isofs_iget(sb, block, offset, 1); +} /* Because the inode number is no longer relevant to finding the * underlying meta-data for an inode, we are free to choose a more diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index c0bf42472e40..f488bbae541a 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -288,12 +288,16 @@ eio: goto out; } +#define RR_REGARD_XA 1 +#define RR_RELOC_DE 2 + static int parse_rock_ridge_inode_internal(struct iso_directory_record *de, - struct inode *inode, int regard_xa) + struct inode *inode, int flags) { int symlink_len = 0; int cnt, sig; + unsigned int reloc_block; struct inode *reloc; struct rock_ridge *rr; int rootflag; @@ -305,7 +309,7 @@ parse_rock_ridge_inode_internal(struct iso_directory_record *de, init_rock_state(&rs, inode); setup_rock_ridge(de, inode, &rs); - if (regard_xa) { + if (flags & RR_REGARD_XA) { rs.chr += 14; rs.len -= 14; if (rs.len < 0) @@ -485,12 +489,22 @@ repeat: "relocated directory\n"); goto out; case SIG('C', 'L'): - ISOFS_I(inode)->i_first_extent = - isonum_733(rr->u.CL.location); - reloc = - isofs_iget(inode->i_sb, - ISOFS_I(inode)->i_first_extent, - 0); + if (flags & RR_RELOC_DE) { + printk(KERN_ERR + "ISOFS: Recursive directory relocation " + "is not supported\n"); + goto eio; + } + reloc_block = isonum_733(rr->u.CL.location); + if (reloc_block == ISOFS_I(inode)->i_iget5_block && + ISOFS_I(inode)->i_iget5_offset == 0) { + printk(KERN_ERR + "ISOFS: Directory relocation points to " + "itself\n"); + goto eio; + } + ISOFS_I(inode)->i_first_extent = reloc_block; + reloc = isofs_iget_reloc(inode->i_sb, reloc_block, 0); if (IS_ERR(reloc)) { ret = PTR_ERR(reloc); goto out; @@ -637,9 +651,11 @@ static char *get_symlink_chunk(char *rpnt, struct rock_ridge *rr, char *plimit) return rpnt; } -int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode) +int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode, + int relocated) { - int result = parse_rock_ridge_inode_internal(de, inode, 0); + int flags = relocated ? RR_RELOC_DE : 0; + int result = parse_rock_ridge_inode_internal(de, inode, flags); /* * if rockridge flag was reset and we didn't look for attributes @@ -647,7 +663,8 @@ int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode) */ if ((ISOFS_SB(inode->i_sb)->s_rock_offset == -1) && (ISOFS_SB(inode->i_sb)->s_rock == 2)) { - result = parse_rock_ridge_inode_internal(de, inode, 14); + result = parse_rock_ridge_inode_internal(de, inode, + flags | RR_REGARD_XA); } return result; } -- cgit v1.2.3 From e1c42045203071c4634b89e696037357810d3083 Mon Sep 17 00:00:00 2001 From: arter97 Date: Wed, 6 Aug 2014 23:22:50 +0900 Subject: f2fs: fix typo Fix typo and some grammatical errors. The words "filesystem" and "readahead" are being used without the space treewide. Signed-off-by: Park Ju Hyung Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 4 ++-- fs/f2fs/checkpoint.c | 4 ++-- fs/f2fs/data.c | 4 ++-- fs/f2fs/debug.c | 4 ++-- fs/f2fs/dir.c | 4 ++-- fs/f2fs/f2fs.h | 6 +++--- fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 4 ++-- fs/f2fs/gc.h | 2 +- fs/f2fs/namei.c | 2 +- fs/f2fs/node.c | 14 +++++++------- fs/f2fs/segment.c | 4 ++-- fs/f2fs/segment.h | 2 +- fs/f2fs/super.c | 2 +- fs/f2fs/xattr.c | 2 +- 15 files changed, 30 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 214fe1054fce..736a348509f7 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -23,7 +23,7 @@ config F2FS_STAT_FS mounted as f2fs. Each file shows the whole f2fs information. /sys/kernel/debug/f2fs/status includes: - - major file system information managed by f2fs currently + - major filesystem information managed by f2fs currently - average SIT information about whole segments - current memory footprint consumed by f2fs. @@ -68,6 +68,6 @@ config F2FS_CHECK_FS bool "F2FS consistency checking feature" depends on F2FS_FS help - Enables BUG_ONs which check the file system consistency in runtime. + Enables BUG_ONs which check the filesystem consistency in runtime. If you want to improve the performance, say N. diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6aeed5bada52..7e1c13bfb99f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -758,7 +758,7 @@ retry_flush_dents: } /* - * POR: we should ensure that there is no dirty node pages + * POR: we should ensure that there are no dirty node pages * until finishing nat/sit flush. */ retry_flush_nodes: @@ -942,7 +942,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) } /* - * We guarantee that this checkpoint procedure should not fail. + * We guarantee that this checkpoint procedure will not fail. */ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 03313099c51c..7aef28d1fffa 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -691,7 +691,7 @@ get_next: allocated = true; blkaddr = dn.data_blkaddr; } - /* Give more consecutive addresses for the read ahead */ + /* Give more consecutive addresses for the readahead */ if (blkaddr == (bh_result->b_blocknr + ofs)) { ofs++; dn.ofs_in_node++; @@ -739,7 +739,7 @@ static int f2fs_read_data_page(struct file *file, struct page *page) trace_f2fs_readpage(page, DATA); - /* If the file has inline data, try to read it directlly */ + /* If the file has inline data, try to read it directly */ if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); else diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index a441ba33be11..fecebdbfd781 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -32,7 +32,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) struct f2fs_stat_info *si = F2FS_STAT(sbi); int i; - /* valid check of the segment numbers */ + /* validation check of the segment numbers */ si->hit_ext = sbi->read_hit_ext; si->total_ext = sbi->total_hit_ext; si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); @@ -152,7 +152,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi)); si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); - /* buld nm */ + /* build nm */ si->base_mem += sizeof(struct f2fs_nm_info); si->base_mem += __bitmap_size(sbi, NAT_BITMAP); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index bcf893c3d903..a69bbfa90c99 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -124,7 +124,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, /* * For the most part, it should be a bug when name_len is zero. - * We stop here for figuring out where the bugs are occurred. + * We stop here for figuring out where the bugs has occurred. */ f2fs_bug_on(!de->name_len); @@ -563,7 +563,7 @@ fail: } /* - * It only removes the dentry from the dentry page,corresponding name + * It only removes the dentry from the dentry page, corresponding name * entry in name page does not need to be touched during deletion. */ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4dab5338a97a..790a0738d314 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -395,7 +395,7 @@ enum count_type { }; /* - * The below are the page types of bios used in submti_bio(). + * The below are the page types of bios used in submit_bio(). * The available types are: * DATA User data pages. It operates as async mode. * NODE Node pages. It operates as async mode. @@ -470,7 +470,7 @@ struct f2fs_sb_info { struct list_head dir_inode_list; /* dir inode list */ spinlock_t dir_inode_lock; /* for dir inode list lock */ - /* basic file system units */ + /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ unsigned int log_blocksize; /* log2 block size */ unsigned int blocksize; /* block size */ @@ -799,7 +799,7 @@ static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) /* * odd numbered checkpoint should at cp segment 0 - * and even segent must be at cp segment 1 + * and even segment must be at cp segment 1 */ if (!(ckpt_version & 1)) start_addr += sbi->blocks_per_seg; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 208f1a9bd569..87cdac4efeec 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -288,7 +288,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) if (err && err != -ENOENT) { goto fail; } else if (err == -ENOENT) { - /* direct node is not exist */ + /* direct node does not exists */ if (whence == SEEK_DATA) { pgofs = PGOFS_OF_NEXT_DNODE(pgofs, F2FS_I(inode)); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d7947d90ccc3..87c50c507461 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -58,7 +58,7 @@ static int gc_thread_func(void *data) * 3. IO subsystem is idle by checking the # of requests in * bdev's request list. * - * Note) We have to avoid triggering GCs too much frequently. + * Note) We have to avoid triggering GCs frequently. * Because it is possible that some segments can be * invalidated soon after by user update or deletion. * So, I'd like to wait some time to collect dirty segments. @@ -222,7 +222,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) u = (vblocks * 100) >> sbi->log_blocks_per_seg; - /* Handle if the system time is changed by user */ + /* Handle if the system time has changed by the user */ if (mtime < sit_i->min_mtime) sit_i->min_mtime = mtime; if (mtime > sit_i->max_mtime) diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 5d5eb6047bf4..16f0b2b22999 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -91,7 +91,7 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) block_t invalid_user_blocks = sbi->user_block_count - written_block_count(sbi); /* - * Background GC is triggered with the following condition. + * Background GC is triggered with the following conditions. * 1. There are a number of invalid blocks. * 2. There is not enough free space. */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 27b03776ffd2..eb407d207289 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -229,7 +229,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) f2fs_delete_entry(de, page, inode); f2fs_unlock_op(sbi); - /* In order to evict this inode, we set it dirty */ + /* In order to evict this inode, we set it dirty */ mark_inode_dirty(inode); fail: trace_f2fs_unlink_exit(inode, err); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d3d90d284631..1f3329907483 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -237,7 +237,7 @@ retry: nat_get_blkaddr(e) != NULL_ADDR && new_blkaddr == NEW_ADDR); - /* increament version no as node is removed */ + /* increment version no as node is removed */ if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { unsigned char version = nat_get_version(e); nat_set_version(e, inc_node_version(version)); @@ -274,7 +274,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) } /* - * This function returns always success + * This function always returns success */ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) { @@ -650,7 +650,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, /* get indirect nodes in the path */ for (i = 0; i < idx + 1; i++) { - /* refernece count'll be increased */ + /* reference count'll be increased */ pages[i] = get_node_page(sbi, nid[i]); if (IS_ERR(pages[i])) { err = PTR_ERR(pages[i]); @@ -836,7 +836,7 @@ void remove_inode_page(struct inode *inode) f2fs_put_page(page, 1); return; } - /* 0 is possible, after f2fs_new_inode() is failed */ + /* 0 is possible, after f2fs_new_inode() has failed */ f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); set_new_dnode(&dn, inode, page, page, ino); truncate_node(&dn); @@ -1637,7 +1637,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) if (!ipage) return -ENOMEM; - /* Should not use this inode from free nid list */ + /* Should not use this inode from free nid list */ remove_free_nid(NM_I(sbi), ino); SetPageUptodate(ipage); @@ -1665,7 +1665,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) /* * ra_sum_pages() merge contiguous pages into one bio and submit. - * these pre-readed pages are alloced in bd_inode's mapping tree. + * these pre-read pages are allocated in bd_inode's mapping tree. */ static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages, int start, int nrpages) @@ -1709,7 +1709,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) { nrpages = min(last_offset - i, bio_blocks); - /* read ahead node pages */ + /* readahead node pages */ nrpages = ra_sum_pages(sbi, pages, addr, nrpages); if (!nrpages) return -ENOMEM; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0dfeebae2a50..31b630efe4db 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -62,7 +62,7 @@ static inline unsigned long __reverse_ffs(unsigned long word) } /* - * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c becasue + * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because * f2fs_set_bit makes MSB and LSB reversed in a byte. * Example: * LSB <--> MSB @@ -808,7 +808,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, } /* - * This function always allocates a used segment (from dirty seglist) by SSR + * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 55973f7b0330..ff483257283b 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -549,7 +549,7 @@ static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) } /* - * Summary block is always treated as invalid block + * Summary block is always treated as an invalid block */ static inline void check_block_count(struct f2fs_sb_info *sbi, int segno, struct f2fs_sit_entry *raw_sit) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 657582fc7601..633315acfef8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -663,7 +663,7 @@ restore_gc: if (need_restart_gc) { if (start_gc_thread(sbi)) f2fs_msg(sbi->sb, KERN_WARNING, - "background gc thread is stop"); + "background gc thread has stopped"); } else if (need_stop_gc) { stop_gc_thread(sbi); } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 8bea941ee309..728a5dc3dc16 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -528,7 +528,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, int free; /* * If value is NULL, it is remove operation. - * In case of update operation, we caculate free. + * In case of update operation, we calculate free. */ free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr); if (found) -- cgit v1.2.3 From b067ba1f1b3fa7ec798d35e12aed6cdba9cea905 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 7 Aug 2014 16:32:25 -0700 Subject: f2fs: should convert inline_data during the mkwrite If mkwrite is called to an inode having inline_data, it can overwrite the data index space as NEW_ADDR. (e.g., the first 4 bytes are coincidently zero) Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 11 ++++++++--- fs/f2fs/inline.c | 20 ++++++++++++-------- 4 files changed, 22 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7aef28d1fffa..ac3ccc25386b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -946,7 +946,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, f2fs_balance_fs(sbi); repeat: - err = f2fs_convert_inline_data(inode, pos + len); + err = f2fs_convert_inline_data(inode, pos + len, NULL); if (err) goto fail; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 790a0738d314..c8288c9a4e1e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1439,7 +1439,7 @@ extern const struct inode_operations f2fs_special_inode_operations; */ bool f2fs_may_inline(struct inode *); int f2fs_read_inline_data(struct inode *, struct page *); -int f2fs_convert_inline_data(struct inode *, pgoff_t); +int f2fs_convert_inline_data(struct inode *, pgoff_t, struct page *); int f2fs_write_inline_data(struct inode *, struct page *, unsigned int); void truncate_inline_data(struct inode *, u64); int recover_inline_data(struct inode *, struct page *); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 87cdac4efeec..ecbdf6a6381c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -41,6 +41,11 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, sb_start_pagefault(inode->i_sb); + /* force to convert with normal data indices */ + err = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, page); + if (err) + goto out; + /* block allocation */ f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -533,7 +538,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { - err = f2fs_convert_inline_data(inode, attr->ia_size); + err = f2fs_convert_inline_data(inode, attr->ia_size, NULL); if (err) return err; @@ -622,7 +627,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) loff_t off_start, off_end; int ret = 0; - ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1); + ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL); if (ret) return ret; @@ -678,7 +683,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (ret) return ret; - ret = f2fs_convert_inline_data(inode, offset + len); + ret = f2fs_convert_inline_data(inode, offset + len, NULL); if (ret) return ret; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 5beeccef9ae1..1ec512d0bed3 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -124,9 +124,10 @@ out: return err; } -int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size) +int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size, + struct page *page) { - struct page *page; + struct page *new_page = page; int err; if (!f2fs_has_inline_data(inode)) @@ -134,17 +135,20 @@ int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size) else if (to_size <= MAX_INLINE_DATA) return 0; - page = grab_cache_page(inode->i_mapping, 0); - if (!page) - return -ENOMEM; + if (!page || page->index != 0) { + new_page = grab_cache_page(inode->i_mapping, 0); + if (!new_page) + return -ENOMEM; + } - err = __f2fs_convert_inline_data(inode, page); - f2fs_put_page(page, 1); + err = __f2fs_convert_inline_data(inode, new_page); + if (!page || page->index != 0) + f2fs_put_page(new_page, 1); return err; } int f2fs_write_inline_data(struct inode *inode, - struct page *page, unsigned size) + struct page *page, unsigned size) { void *src_addr, *dst_addr; struct page *ipage; -- cgit v1.2.3 From 0342fd301a9a940210f59fd094f766e7a0671987 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 7 Aug 2014 16:57:17 -0700 Subject: f2fs: make clear on test condition and return types This patch adds a parentheses to make clear for condition check. And also it changes the return type for better meanings. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/inline.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c8288c9a4e1e..2e4aa3a51b0d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1442,5 +1442,5 @@ int f2fs_read_inline_data(struct inode *, struct page *); int f2fs_convert_inline_data(struct inode *, pgoff_t, struct page *); int f2fs_write_inline_data(struct inode *, struct page *, unsigned int); void truncate_inline_data(struct inode *, u64); -int recover_inline_data(struct inode *, struct page *); +bool recover_inline_data(struct inode *, struct page *); #endif diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 1ec512d0bed3..520758b91999 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -203,7 +203,7 @@ void truncate_inline_data(struct inode *inode, u64 from) f2fs_put_page(ipage, 1); } -int recover_inline_data(struct inode *inode, struct page *npage) +bool recover_inline_data(struct inode *inode, struct page *npage) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct f2fs_inode *ri = NULL; @@ -222,7 +222,7 @@ int recover_inline_data(struct inode *inode, struct page *npage) ri = F2FS_INODE(npage); if (f2fs_has_inline_data(inode) && - ri && ri->i_inline & F2FS_INLINE_DATA) { + ri && (ri->i_inline & F2FS_INLINE_DATA)) { process_inline: ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(IS_ERR(ipage)); @@ -234,7 +234,7 @@ process_inline: memcpy(dst_addr, src_addr, MAX_INLINE_DATA); update_inode(inode, ipage); f2fs_put_page(ipage, 1); - return -1; + return true; } if (f2fs_has_inline_data(inode)) { @@ -246,10 +246,10 @@ process_inline: clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); update_inode(inode, ipage); f2fs_put_page(ipage, 1); - } else if (ri && ri->i_inline & F2FS_INLINE_DATA) { + } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { truncate_blocks(inode, 0); set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); goto process_inline; } - return 0; + return false; } -- cgit v1.2.3 From 617deb8c053aeec06c7aa16ac7225f046fab95e8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 7 Aug 2014 17:04:24 -0700 Subject: f2fs: fix the initial inode page for recovery If a new inode page is needed for recover_dentry, we should assing i_inline as zero. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1f3329907483..093d7991b7ed 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1651,6 +1651,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) dst->i_blocks = cpu_to_le64(1); dst->i_links = cpu_to_le32(1); dst->i_xattr_nid = 0; + dst->i_inline = src->i_inline & F2FS_INLINE_XATTR; new_ni = old_ni; new_ni.ino = ino; @@ -1659,6 +1660,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) WARN_ON(1); set_node_addr(sbi, &new_ni, NEW_ADDR, false); inc_valid_inode_count(sbi); + set_page_dirty(ipage); f2fs_put_page(ipage, 1); return 0; } -- cgit v1.2.3 From 695facc05a3eaaf434911dc5dbbc3e6c6a4e1862 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 7 Aug 2014 17:06:18 -0700 Subject: f2fs: clear FI_INC_LINK during the recovery If an inode are fsynced multiple times with fsync & dent marks, this inode will set FI_INC_LINK at find_fsync_dnodes during the recovery. But, in recover_inode, recover_dentry doesn't clear that flag when multiple hits were occurred. So this patch removes the flag for the further consistency. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index fe1c6d921ba2..cfb2aa9bfc20 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -62,8 +62,10 @@ static int recover_dentry(struct page *ipage, struct inode *inode) } retry: de = f2fs_find_entry(dir, &name, &page); - if (de && inode->i_ino == le32_to_cpu(de->ino)) + if (de && inode->i_ino == le32_to_cpu(de->ino)) { + clear_inode_flag(F2FS_I(inode), FI_INC_LINK); goto out_unmap_put; + } if (de) { einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); if (IS_ERR(einode)) { -- cgit v1.2.3 From e3b4d43f7c233c6fce21fe4b4cb55b6d59afddae Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 7 Aug 2014 23:45:42 -0700 Subject: f2fs: should clear the inline_xattr flag During the recovery, we should clear the inline_xattr flag if its xattr node block is recovered. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 093d7991b7ed..151045f3e7b7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1557,26 +1557,25 @@ void recover_inline_xattr(struct inode *inode, struct page *page) struct page *ipage; struct f2fs_inode *ri; - if (!f2fs_has_inline_xattr(inode)) - return; - if (!IS_INODE(page)) return; - ri = F2FS_INODE(page); - if (!(ri->i_inline & F2FS_INLINE_XATTR)) - return; - ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(IS_ERR(ipage)); + ri = F2FS_INODE(page); + if (!(ri->i_inline & F2FS_INLINE_XATTR)) { + clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR); + goto update_inode; + } + dst_addr = inline_xattr_addr(ipage); src_addr = inline_xattr_addr(page); inline_size = inline_xattr_size(inode); f2fs_wait_on_page_writeback(ipage, NODE); memcpy(dst_addr, src_addr, inline_size); - +update_inode: update_inode(inode, ipage); f2fs_put_page(ipage, 1); } -- cgit v1.2.3 From 1c35a90e8ab57cd34b8e806b9c75ba05b3b5c7a3 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 7 Aug 2014 23:49:17 -0700 Subject: f2fs: fix to recover inline_xattr/data and blocks This patch fixes not to skip xattr recovery and inline xattr/data recovery order. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 9 +-------- fs/f2fs/recovery.c | 13 +++++++++---- 3 files changed, 11 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2e4aa3a51b0d..cc5ead1bd378 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1205,7 +1205,7 @@ void alloc_nid_failed(struct f2fs_sb_info *, nid_t); void recover_node_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, struct node_info *, block_t); void recover_inline_xattr(struct inode *, struct page *); -bool recover_xattr_data(struct inode *, struct page *, block_t); +void recover_xattr_data(struct inode *, struct page *, block_t); int recover_inode_page(struct f2fs_sb_info *, struct page *); int restore_node_summary(struct f2fs_sb_info *, unsigned int, struct f2fs_summary_block *); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 151045f3e7b7..c80e3d59314d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1557,9 +1557,6 @@ void recover_inline_xattr(struct inode *inode, struct page *page) struct page *ipage; struct f2fs_inode *ri; - if (!IS_INODE(page)) - return; - ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(IS_ERR(ipage)); @@ -1580,16 +1577,13 @@ update_inode: f2fs_put_page(ipage, 1); } -bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) +void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; nid_t new_xnid = nid_of_node(page); struct node_info ni; - if (!f2fs_has_xattr_block(ofs_of_node(page))) - return false; - /* 1: invalidate the previous xattr nid */ if (!prev_xnid) goto recover_xnid; @@ -1617,7 +1611,6 @@ recover_xnid: set_node_addr(sbi, &ni, blkaddr, false); update_inode_page(inode); - return true; } int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index cfb2aa9bfc20..d7b67b86f607 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -302,14 +302,19 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct node_info ni; int err = 0, recovered = 0; - recover_inline_xattr(inode, page); - - if (recover_inline_data(inode, page)) + /* step 1: recover xattr */ + if (IS_INODE(page)) { + recover_inline_xattr(inode, page); + } else if (f2fs_has_xattr_block(ofs_of_node(page))) { + recover_xattr_data(inode, page, blkaddr); goto out; + } - if (recover_xattr_data(inode, page, blkaddr)) + /* step 2: recover inline data */ + if (recover_inline_data(inode, page)) goto out; + /* step 3: recover data indices */ start = start_bidx_of_node(ofs_of_node(page), fi); end = start + ADDRS_PER_PAGE(page, fi); -- cgit v1.2.3 From b307384e4f4670c490b4d142d27fed497df51fae Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 8 Aug 2014 10:18:43 -0700 Subject: f2fs: avoid bug_on when error is occurred During the recovery, if an error like EIO or ENOMEM, f2fs_bug_on should skip. Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d7b67b86f607..7ca7aadaa607 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -472,7 +472,8 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) /* step #2: recover data */ err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); - f2fs_bug_on(!list_empty(&inode_list)); + if (!err) + f2fs_bug_on(!list_empty(&inode_list)); out: destroy_fsync_dnodes(&inode_list); kmem_cache_destroy(fsync_entry_slab); -- cgit v1.2.3 From 97c3c5cac2bba0ecc4b0de83d33a23aa427ef628 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 19 Aug 2014 09:13:01 -0700 Subject: f2fs: don't skip checkpoint if there is no dirty node pages This is the errorneous scenario. 1. write data 2. do checkpoint 3. produce some dirty node pages by the gc thread 4. write back dirty node pages 5. f2fs_put_super will skip the checkpoint, since dirty count for node pages is zero. This patch removes such the wrong condition check. Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 633315acfef8..60e3554a6eb4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -432,7 +432,7 @@ static void f2fs_put_super(struct super_block *sb) stop_gc_thread(sbi); /* We don't need to do checkpoint when it's clean */ - if (sbi->s_dirty && get_pages(sbi, F2FS_DIRTY_NODES)) + if (sbi->s_dirty) write_checkpoint(sbi, true); iput(sbi->node_inode); -- cgit v1.2.3 From e6d8fb340f20bc5275ceed86133d1823dec9478d Mon Sep 17 00:00:00 2001 From: Chin-Tsung Cheng Date: Fri, 15 Aug 2014 15:49:31 +0800 Subject: ext3: Count internal journal as bsddf overhead in ext3_statfs The journal blocks of external journal device should not be counted as overhead. Signed-off-by: Chin-Tsung Cheng Signed-off-by: Jan Kara --- fs/ext3/super.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 08cdfe5461e3..622e88249024 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2828,8 +2828,9 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) */ overhead += ngroups * (2 + sbi->s_itb_per_group); - /* Add the journal blocks as well */ - overhead += sbi->s_journal->j_maxlen; + /* Add the internal journal blocks as well */ + if (sbi->s_journal && !sbi->journal_bdev) + overhead += sbi->s_journal->j_maxlen; sbi->s_overhead_last = overhead; smp_wmb(); -- cgit v1.2.3 From 2bb93d244157b6dfa4964d4088be4680b3169701 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 20 Aug 2014 18:56:29 -0500 Subject: Trivial whitespace fix Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 5a48aa290dfe..4e4eecdec4f9 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1035,7 +1035,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, if (keep_size == false) return -EOPNOTSUPP; - /* + /* * Must check if file sparse since fallocate -z (zero range) assumes * non-sparse allocation */ -- cgit v1.2.3 From eaebdedc61ac7bc3dd6ef6eb508097c4aaabef0c Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 2 Jul 2014 22:08:46 +0200 Subject: GFS2: fs/gfs2/super.c: replace seq_printf by seq_puts fix checkpatch warnings: "WARNING: Prefer seq_puts to seq_printf" Cc: cluster-devel@redhat.com Signed-off-by: Fabian Frederick Signed-off-by: Steven Whitehouse --- fs/gfs2/super.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 2607ff13d486..a346f56c4c6d 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1294,7 +1294,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) int val; if (is_ancestor(root, sdp->sd_master_dir)) - seq_printf(s, ",meta"); + seq_puts(s, ",meta"); if (args->ar_lockproto[0]) seq_printf(s, ",lockproto=%s", args->ar_lockproto); if (args->ar_locktable[0]) @@ -1302,13 +1302,13 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) if (args->ar_hostdata[0]) seq_printf(s, ",hostdata=%s", args->ar_hostdata); if (args->ar_spectator) - seq_printf(s, ",spectator"); + seq_puts(s, ",spectator"); if (args->ar_localflocks) - seq_printf(s, ",localflocks"); + seq_puts(s, ",localflocks"); if (args->ar_debug) - seq_printf(s, ",debug"); + seq_puts(s, ",debug"); if (args->ar_posix_acl) - seq_printf(s, ",acl"); + seq_puts(s, ",acl"); if (args->ar_quota != GFS2_QUOTA_DEFAULT) { char *state; switch (args->ar_quota) { @@ -1328,7 +1328,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",quota=%s", state); } if (args->ar_suiddir) - seq_printf(s, ",suiddir"); + seq_puts(s, ",suiddir"); if (args->ar_data != GFS2_DATA_DEFAULT) { char *state; switch (args->ar_data) { @@ -1345,7 +1345,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",data=%s", state); } if (args->ar_discard) - seq_printf(s, ",discard"); + seq_puts(s, ",discard"); val = sdp->sd_tune.gt_logd_secs; if (val != 30) seq_printf(s, ",commit=%d", val); @@ -1376,11 +1376,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",errors=%s", state); } if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) - seq_printf(s, ",nobarrier"); + seq_puts(s, ",nobarrier"); if (test_bit(SDF_DEMOTE, &sdp->sd_flags)) - seq_printf(s, ",demote_interface_used"); + seq_puts(s, ",demote_interface_used"); if (args->ar_rgrplvb) - seq_printf(s, ",rgrplvb"); + seq_puts(s, ",rgrplvb"); return 0; } -- cgit v1.2.3 From b650738cd093a9f9e9551db9ce5cd68acd842dc0 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 6 Aug 2014 09:08:36 -0400 Subject: GFS2: Change maxlen variables to size_t This patch changes some variables (especially maxlen in function gfs2_block_map) from unsigned int to size_t. We need 64-bit arithmetic for very large files (e.g. 1PB) where the variables otherwise get shifted to all 0's. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index e6ee5b6e8d99..f0b945ab853e 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -359,7 +359,7 @@ static inline void release_metapath(struct metapath *mp) * Returns: The length of the extent (minimum of one block) */ -static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob) +static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob) { const __be64 *end = (start + len); const __be64 *first = ptr; @@ -449,7 +449,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, struct buffer_head *bh_map, struct metapath *mp, const unsigned int sheight, const unsigned int height, - const unsigned int maxlen) + const size_t maxlen) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -483,7 +483,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, } else { /* Need to allocate indirect blocks */ ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs; - dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]); + dblks = min(maxlen, (size_t)(ptrs_per_blk - + mp->mp_list[end_of_metadata])); if (height == ip->i_height) { /* Writing into existing tree, extend tree down */ iblks = height - sheight; @@ -605,7 +606,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); unsigned int bsize = sdp->sd_sb.sb_bsize; - const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits; + const size_t maxlen = bh_map->b_size >> inode->i_blkbits; const u64 *arr = sdp->sd_heightsize; __be64 *ptr; u64 size; -- cgit v1.2.3 From 2ddfbdd6848d496d8088b28992d257fd02e58c9d Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 20 Aug 2014 12:44:45 -0400 Subject: GFS2: Request demote when a "try" flock fails This patch changes the flock code so that it uses the TRY_1CB flag instead of the TRY flag on the first attempt. That forces any holding nodes to issue a dlm callback, which requests a demote of the glock. Then, if the "try" failed, it sleeps a small amount of time for the demote to occur. Then it tries again, for an increasing amount of time. Subsequent attempts to gain the "try" lock don't use "_1CB" so that only one callback is issued. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 26b3f952e6b1..7f4ed3daa38c 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -979,9 +980,10 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) unsigned int state; int flags; int error = 0; + int sleeptime; state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; - flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT; + flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY_1CB) | GL_EXACT; mutex_lock(&fp->f_fl_mutex); @@ -1001,7 +1003,14 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) gfs2_holder_init(gl, state, flags, fl_gh); gfs2_glock_put(gl); } - error = gfs2_glock_nq(fl_gh); + for (sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) { + error = gfs2_glock_nq(fl_gh); + if (error != GLR_TRYFAILED) + break; + fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT; + fl_gh->gh_error = 0; + msleep(sleeptime); + } if (error) { gfs2_holder_uninit(fl_gh); if (error == GLR_TRYFAILED) @@ -1024,7 +1033,7 @@ static void do_unflock(struct file *file, struct file_lock *fl) mutex_lock(&fp->f_fl_mutex); flock_lock_file_wait(file, fl); if (fl_gh->gh_gl) { - gfs2_glock_dq_wait(fl_gh); + gfs2_glock_dq(fl_gh); gfs2_holder_uninit(fl_gh); } mutex_unlock(&fp->f_fl_mutex); -- cgit v1.2.3 From 87fa3bb0786f37dff0b92f2c38421dd56d8902a9 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 29 Jul 2014 19:09:39 +0800 Subject: Btrfs: fix regression of btrfs device replace Commit 49c6f736f34f901117c20960ebd7d5e60f12fcac( btrfs: dev replace should replace the sysfs entry) added the missing sysfs entry in the process of device replace, but didn't take missing devices into account, so now we have BUG: unable to handle kernel NULL pointer dereference at 0000000000000088 IP: [] btrfs_kobj_rm_device+0x21/0x40 [btrfs] ... To reproduce it, 1. mkfs.btrfs -f disk1 disk2 2. mkfs.ext4 disk1 3. mount disk2 /mnt -odegraded 4. btrfs replace start -B 1 disk3 /mnt -------------------------- This fixes the problem. Reported-by: Chris Murphy Signed-off-by: Liu Bo Reviewed-by: Satoru Takeuchi Tested-by: Satoru Takeuchi Signed-off-by: Chris Mason --- fs/btrfs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 78699364f537..12e53556e214 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -614,7 +614,7 @@ int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, if (!fs_info->device_dir_kobj) return -EINVAL; - if (one_device) { + if (one_device && one_device->bdev) { disk = one_device->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; -- cgit v1.2.3 From 9c3b306e1c9e6be4be09e99a8fe2227d1005effc Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 31 Jul 2014 14:41:07 +0100 Subject: Btrfs: race free update of commit root for ro snapshots This is a better solution for the problem addressed in the following commit: Btrfs: update commit root on snapshot creation after orphan cleanup (3821f348889e506efbd268cc8149e0ebfa47c4e5) The previous solution wasn't the best because of 2 reasons: 1) It added another full transaction commit, which is more expensive than just swapping the commit root with the root; 2) If a reboot happened after the first transaction commit (the one that creates the snapshot) and before the second transaction commit, then we would end up with the same problem if a send using that snapshot was requested before the first transaction commit after the reboot. This change addresses those 2 issues. The second issue is addressed by switching the commit root in the dentry lookup VFS callback, which is also called by the snapshot/subvol creation ioctl and performs orphan cleanup if needed. Like the vfs, the ioctl locks the parent inode too, preventing race issues between a dentry lookup and snapshot creation. Cc: Alex Lyakas Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 36 ++++++++++++++++++++++++++++++++++++ fs/btrfs/ioctl.c | 33 --------------------------------- 2 files changed, 36 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a3c6e76f5a4e..6dd6e50d143a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5181,6 +5181,42 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) iput(inode); inode = ERR_PTR(ret); } + /* + * If orphan cleanup did remove any orphans, it means the tree + * was modified and therefore the commit root is not the same as + * the current root anymore. This is a problem, because send + * uses the commit root and therefore can see inode items that + * don't exist in the current root anymore, and for example make + * calls to btrfs_iget, which will do tree lookups based on the + * current root and not on the commit root. Those lookups will + * fail, returning a -ESTALE error, and making send fail with + * that error. So make sure a send does not see any orphans we + * have just removed, and that it will see the same inodes + * regardless of whether a transaction commit happened before + * it started (meaning that the commit root will be the same as + * the current root) or not. + */ + if (sub_root->node != sub_root->commit_root) { + u64 sub_flags = btrfs_root_flags(&sub_root->root_item); + + if (sub_flags & BTRFS_ROOT_SUBVOL_RDONLY) { + struct extent_buffer *eb; + + /* + * Assert we can't have races between dentry + * lookup called through the snapshot creation + * ioctl and the VFS. + */ + ASSERT(mutex_is_locked(&dir->i_mutex)); + + down_write(&root->fs_info->commit_root_sem); + eb = sub_root->commit_root; + sub_root->commit_root = + btrfs_root_node(sub_root); + up_write(&root->fs_info->commit_root_sem); + free_extent_buffer(eb); + } + } } return inode; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 47aceb494d1d..845287ca59c3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -711,39 +711,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; - ret = btrfs_orphan_cleanup(pending_snapshot->snap); - if (ret) - goto fail; - - /* - * If orphan cleanup did remove any orphans, it means the tree was - * modified and therefore the commit root is not the same as the - * current root anymore. This is a problem, because send uses the - * commit root and therefore can see inode items that don't exist - * in the current root anymore, and for example make calls to - * btrfs_iget, which will do tree lookups based on the current root - * and not on the commit root. Those lookups will fail, returning a - * -ESTALE error, and making send fail with that error. So make sure - * a send does not see any orphans we have just removed, and that it - * will see the same inodes regardless of whether a transaction - * commit happened before it started (meaning that the commit root - * will be the same as the current root) or not. - */ - if (readonly && pending_snapshot->snap->node != - pending_snapshot->snap->commit_root) { - trans = btrfs_join_transaction(pending_snapshot->snap); - if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) { - ret = PTR_ERR(trans); - goto fail; - } - if (!IS_ERR(trans)) { - ret = btrfs_commit_transaction(trans, - pending_snapshot->snap); - if (ret) - goto fail; - } - } - inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); -- cgit v1.2.3 From 5762b5c958abbecb7fb9f4596a6476d1ce91ecf6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 1 Aug 2014 00:10:32 +0100 Subject: Btrfs: ensure tmpfile inode is always persisted with link count of 0 If we open a file with O_TMPFILE, don't do any further operation on it (so that the inode item isn't updated) and then force a transaction commit, we get a persisted inode item with a link count of 1, and not 0 as it should be. Steps to reproduce it (requires a modern xfs_io with -T support): $ mkfs.btrfs -f /dev/sdd $ mount -o /dev/sdd /mnt $ xfs_io -T /mnt & $ sync Then btrfs-debug-tree shows the inode item with a link count of 1: $ btrfs-debug-tree /dev/sdd (...) fs tree key (FS_TREE ROOT_ITEM 0) leaf 29556736 items 4 free space 15851 generation 6 owner 5 fs uuid f164d01b-1b92-481d-a4e4-435fb0f843d0 chunk uuid 0e3d0e56-bcca-4a1c-aa5f-cec2c6f4f7a6 item 0 key (256 INODE_ITEM 0) itemoff 16123 itemsize 160 inode generation 3 transid 6 size 0 block group 0 mode 40755 links 1 item 1 key (256 INODE_REF 256) itemoff 16111 itemsize 12 inode ref index 0 namelen 2 name: .. item 2 key (257 INODE_ITEM 0) itemoff 15951 itemsize 160 inode generation 6 transid 6 size 0 block group 0 mode 100600 links 1 item 3 key (ORPHAN ORPHAN_ITEM 257) itemoff 15951 itemsize 0 orphan item checksum tree key (CSUM_TREE ROOT_ITEM 0) (...) Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6dd6e50d143a..57c3129ee2a2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5641,6 +5641,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, return ERR_PTR(-ENOMEM); } + /* + * O_TMPFILE, set link count to 0, so that after this point, + * we fill in an inode item with the correct link count. + */ + if (!name) + set_nlink(inode, 0); + /* * we have to initialize this early, so we can reclaim the inode * number if we fail afterwards in this function. @@ -9007,6 +9014,14 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) if (ret) goto out; + /* + * We set number of links to 0 in btrfs_new_inode(), and here we set + * it to 1 because d_tmpfile() will issue a warning if the count is 0, + * through: + * + * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() + */ + set_nlink(inode, 1); d_tmpfile(dentry, inode); mark_inode_dirty(inode); -- cgit v1.2.3 From 74121f7cbb2f60b41c48184ad5b889c0704e7b90 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 7 Aug 2014 12:00:44 +0100 Subject: Btrfs: fix hole detection during file fsync The file hole detection logic during a file fsync wasn't correct, because it didn't look back (in a previous leaf) for the last file extent item that can be in a leaf to the left of our leaf and that has a generation lower than the current transaction id. This made it assume that a hole exists when it really doesn't exist in the file. Such false positive hole detection happens in the following scenario: * We have a file that has many file extent items, covering 3 or more btree leafs (the first leaf must contain non file extent items too). * Two ranges of the file are modified, with their extent items being located at 2 different leafs and those leafs aren't consecutive. * When processing the second modified leaf, we weren't checking if some file extent item exists that is located in some leaf that is between our 2 modified leafs, and therefore assumed the range defined between the last file extent item in the first leaf and the first file extent item in the second leaf matched a hole. Fortunately this didn't result in overriding the log with wrong data, instead it made the last loop in copy_items() attempt to insert a duplicated key (for a hole file extent item), which makes the file fsync code return with -EEXIST to file.c:btrfs_sync_file() which in turn ends up doing a full transaction commit, which is much more expensive then writing only to the log tree and wait for it to be durably persisted (as well as the file's modified extents/pages). Therefore fix the hole detection logic, so that we don't pay the cost of doing full transaction commits. I could trigger this issue with the following test for xfstests (which never fails, either without or with this patch). The last fsync call results in a full transaction commit, due to the -EEXIST error mentioned above. I could also observe this behaviour happening frequently when running xfstests/generic/075 in a loop. Test: _cleanup() { _cleanup_flakey rm -fr $tmp } # get standard environment, filters and checks . ./common/rc . ./common/filter . ./common/dmflakey # real QA test starts here _supported_fs btrfs _supported_os Linux _require_scratch _require_dm_flakey _need_to_be_root rm -f $seqres.full # Create a file with many file extent items, each representing a 4Kb extent. # These items span 3 btree leaves, of 16Kb each (default mkfs.btrfs leaf size # as of btrfs-progs 3.12). _scratch_mkfs -l 16384 >/dev/null 2>&1 _init_flakey SAVE_MOUNT_OPTIONS="$MOUNT_OPTIONS" MOUNT_OPTIONS="$MOUNT_OPTIONS -o commit=999" _mount_flakey # First fsync, inode has BTRFS_INODE_NEEDS_FULL_SYNC flag set. $XFS_IO_PROG -f -c "pwrite -S 0x01 -b 4096 0 4096" -c "fsync" \ $SCRATCH_MNT/foo | _filter_xfs_io # For any of the following fsync calls, inode doesn't have the flag # BTRFS_INODE_NEEDS_FULL_SYNC set. for ((i = 1; i <= 500; i++)); do OFFSET=$((4096 * i)) LEN=4096 $XFS_IO_PROG -c "pwrite -S 0x01 $OFFSET $LEN" -c "fsync" \ $SCRATCH_MNT/foo | _filter_xfs_io done # Commit transaction and bump next transaction's id (to 7). sync # Truncate will set the BTRFS_INODE_NEEDS_FULL_SYNC flag in the btrfs's # inode runtime flags. $XFS_IO_PROG -c "truncate 2048000" $SCRATCH_MNT/foo # Commit transaction and bump next transaction's id (to 8). sync # Touch 1 extent item from the first leaf and 1 from the last leaf. The leaf # in the middle, containing only file extent items, isn't touched. So the # next fsync, when calling btrfs_search_forward(), won't visit that middle # leaf. First and 3rd leaf have now a generation with value 8, while the # middle leaf remains with a generation with value 6. $XFS_IO_PROG \ -c "pwrite -S 0xee -b 4096 0 4096" \ -c "pwrite -S 0xff -b 4096 2043904 4096" \ -c "fsync" \ $SCRATCH_MNT/foo | _filter_xfs_io _load_flakey_table $FLAKEY_DROP_WRITES md5sum $SCRATCH_MNT/foo | _filter_scratch _unmount_flakey _load_flakey_table $FLAKEY_ALLOW_WRITES # During mount, we'll replay the log created by the fsync above, and the file's # md5 digest should be the same we got before the unmount. _mount_flakey md5sum $SCRATCH_MNT/foo | _filter_scratch _unmount_flakey MOUNT_OPTIONS="$SAVE_MOUNT_OPTIONS" status=0 exit Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9e1f2cd5e67a..7e0e6e3029dd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3298,7 +3298,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct list_head ordered_sums; int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; bool has_extents = false; - bool need_find_last_extent = (*last_extent == 0); + bool need_find_last_extent = true; bool done = false; INIT_LIST_HEAD(&ordered_sums); @@ -3352,8 +3352,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, */ if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { has_extents = true; - if (need_find_last_extent && - first_key.objectid == (u64)-1) + if (first_key.objectid == (u64)-1) first_key = ins_keys[i]; } else { need_find_last_extent = false; @@ -3427,6 +3426,16 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, if (!has_extents) return ret; + if (need_find_last_extent && *last_extent == first_key.offset) { + /* + * We don't have any leafs between our current one and the one + * we processed before that can have file extent items for our + * inode (and have a generation number smaller than our current + * transaction id). + */ + need_find_last_extent = false; + } + /* * Because we use btrfs_search_forward we could skip leaves that were * not modified and then assume *last_extent is valid when it really @@ -3537,7 +3546,7 @@ fill_holes: 0, 0); if (ret) break; - *last_extent = offset + len; + *last_extent = extent_end; } /* * Need to let the callers know we dropped the path so they should -- cgit v1.2.3 From 7064dd5c36187725e7ccfd837e07678ae435d3f5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 8 Aug 2014 02:47:05 +0100 Subject: Btrfs: don't monopolize a core when evicting inode If an inode has a very large number of extent maps, we can spend a lot of time freeing them, which triggers a soft lockup warning. Therefore reschedule if we need to when freeing the extent maps while evicting the inode. I could trigger this all the time by running xfstests/generic/299 on a file system with the no-holes feature enabled. That test creates an inode with 11386677 extent maps. $ mkfs.btrfs -f -O no-holes $TEST_DEV $ MKFS_OPTIONS="-O no-holes" ./check generic/299 generic/299 382s ... Message from syslogd@debian-vm3 at Aug 7 10:44:29 ... kernel:[85304.208017] BUG: soft lockup - CPU#0 stuck for 22s! [umount:25330] 384s Ran: generic/299 Passed all 1 tests $ dmesg (...) [86304.300017] BUG: soft lockup - CPU#0 stuck for 23s! [umount:25330] (...) [86304.300036] Call Trace: [86304.300036] [] __slab_free+0x54/0x295 [86304.300036] [] ? free_extent_map+0x5c/0xb0 [btrfs] [86304.300036] [] kmem_cache_free+0x282/0x2a0 [86304.300036] [] free_extent_map+0x5c/0xb0 [btrfs] [86304.300036] [] btrfs_evict_inode+0xd5/0x660 [btrfs] [86304.300036] [] ? __inode_wait_for_writeback+0x6d/0xc0 [86304.300036] [] ? _raw_spin_unlock+0x2b/0x40 [86304.300036] [] evict+0xab/0x180 [86304.300036] [] dispose_list+0x3e/0x60 [86304.300036] [] evict_inodes+0xf4/0x110 [86304.300036] [] generic_shutdown_super+0x53/0x110 [86304.300036] [] kill_anon_super+0x16/0x30 [86304.300036] [] btrfs_kill_super+0x1a/0xa0 [btrfs] [86304.300036] [] deactivate_locked_super+0x59/0x80 [86304.300036] [] deactivate_super+0x4e/0x70 [86304.300036] [] mntput_no_expire+0x174/0x1f0 [86304.300036] [] ? mntput_no_expire+0x17/0x1f0 [86304.300036] [] SyS_umount+0x97/0x100 (...) Signed-off-by: Filipe Manana Reviewed-by: Satoru Takeuchi Tested-by: Satoru Takeuchi Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 57c3129ee2a2..2ac260d41ccd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4674,6 +4674,11 @@ static void evict_inode_truncate_pages(struct inode *inode) clear_bit(EXTENT_FLAG_LOGGING, &em->flags); remove_extent_mapping(map_tree, em); free_extent_map(em); + if (need_resched()) { + write_unlock(&map_tree->lock); + cond_resched(); + write_lock(&map_tree->lock); + } } write_unlock(&map_tree->lock); @@ -4696,6 +4701,7 @@ static void evict_inode_truncate_pages(struct inode *inode) &cached_state, GFP_NOFS); free_extent_state(state); + cond_resched(); spin_lock(&io_tree->lock); } spin_unlock(&io_tree->lock); -- cgit v1.2.3 From 62e2390e1ad78f956e96a6a831761adc6f2bf58a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 8 Aug 2014 02:47:06 +0100 Subject: Btrfs: clone, don't create invalid hole extent map When cloning a file that consists of an inline extent, we were creating an extent map that represents a non-existing trailing hole starting at a file offset that isn't a multiple of the sector size. This happened because when processing an inline extent we weren't aligning the extent's length to the sector size, and therefore incorrectly treating the range [inline_extent_length; sector_size[ as a hole. Signed-off-by: Filipe Manana Reviewed-by: Satoru Takeuchi Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 845287ca59c3..fce6fd0e3f50 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3494,7 +3494,8 @@ process_slot: btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - last_dest_end = new_key.offset + datal; + last_dest_end = ALIGN(new_key.offset + datal, + root->sectorsize); ret = clone_finish_inode_update(trans, inode, last_dest_end, destoff, olen); -- cgit v1.2.3 From 51f395ad4058883e4273b02fdebe98072dbdc0d2 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 8 Aug 2014 13:06:20 +0800 Subject: btrfs: Use right extent length when inserting overlap extent map. When current btrfs finds that a new extent map is going to be insereted but failed with -EEXIST, it will try again to insert the extent map but with the length of sectorsize. This is OK if we don't enable 'no-holes' feature since all extent space is continuous, we will not go into the not found->insert routine. But if we enable 'no-holes' feature, it will make things out of control. e.g. in 4K sectorsize, we pass the following args to btrfs_get_extent(): btrfs_get_extent() args: start: 27874 len 4100 28672 27874 28672 27874+4100 32768 |-----------------------| |---------hole--------------------|---------data----------| 1) not found and insert Since no extent map containing the range, btrfs_get_extent() will go into the not_found and insert routine, which will try to insert the extent map (27874, 27847 + 4100). 2) first overlap But it overlaps with (28672, 32768) extent, so -EEXIST will be returned by add_extent_mapping(). 3) retry but still overlap After catching the -EEXIST, then btrfs_get_extent() will try insert it again but with 4K length, which still overlaps, so -EEXIST will be returned. This makes the following patch fail to punch hole. d77815461f047e561f77a07754ae923ade597d4e btrfs: Avoid trucating page or punching hole in a already existed hole. This patch will use the right length, which is the (exsisting->start - em->start) to insert, making the above patch works in 'no-holes' mode. Also, some small code style problems in above patch is fixed too. Reported-by: Filipe David Manana Signed-off-by: Qu Wenruo Reviewed-by: Filipe David Manana Tested-by: Filipe David Manana Signed-off-by: Chris Mason --- fs/btrfs/file.c | 4 ++-- fs/btrfs/inode.c | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 77e33534c7d6..f15c13f97018 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2215,7 +2215,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } - lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize); + lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); lockend = round_down(offset + len, BTRFS_I(inode)->root->sectorsize) - 1; same_page = ((offset >> PAGE_CACHE_SHIFT) == @@ -2276,7 +2276,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) tail_start + tail_len, 0, 1); if (ret) goto out_only_mutex; - } + } } } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2ac260d41ccd..ae98df67950f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6146,14 +6146,14 @@ out_fail: static int merge_extent_mapping(struct extent_map_tree *em_tree, struct extent_map *existing, struct extent_map *em, - u64 map_start, u64 map_len) + u64 map_start) { u64 start_diff; BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); start_diff = map_start - em->start; em->start = map_start; - em->len = map_len; + em->len = existing->start - em->start; if (em->block_start < EXTENT_MAP_LAST_BYTE && !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { em->block_start += start_diff; @@ -6441,8 +6441,7 @@ insert: em->len); if (existing) { err = merge_extent_mapping(em_tree, existing, - em, start, - root->sectorsize); + em, start); free_extent_map(existing); if (err) { free_extent_map(em); -- cgit v1.2.3 From a3c108950d8e0bfcf48856cc159956022a7ad925 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sun, 17 Aug 2014 15:09:21 -0500 Subject: btrfs: fix leak in qgroup_subtree_accounting() error path Coverity pointed this out; in the newly added qgroup_subtree_accounting(), if btrfs_find_all_roots() returns an error, we leak at least the parents pointer, and possibly the roots pointer, depending on what failure occurs. If btrfs_find_all_roots() returns an error, we need to free up all allocations before we return. "roots" is initialized to NULL, so it should be safe to free it unconditionally (ulist_free() handles that case). Cc: Mark Fasheh Signed-off-by: Eric Sandeen Reviewed-by: Mark Fasheh Signed-off-by: Chris Mason --- fs/btrfs/qgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b497498484be..8abe45524de9 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1973,7 +1973,7 @@ static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, elem.seq, &roots); btrfs_put_tree_mod_seq(fs_info, &elem); if (ret < 0) - return ret; + goto out; if (roots->nnodes != 1) goto out; -- cgit v1.2.3 From 38c1c2e44bacb37efd68b90b3f70386a8ee370ee Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 19 Aug 2014 23:33:13 +0800 Subject: Btrfs: fix crash on endio of reading corrupted block The crash is ------------[ cut here ]------------ kernel BUG at fs/btrfs/extent_io.c:2124! [...] Workqueue: btrfs-endio normal_work_helper [btrfs] RIP: 0010:[] [] end_bio_extent_readpage+0xb45/0xcd0 [btrfs] This is in fact a regression. It is because we forgot to increase @offset properly in reading corrupted block, so that the @offset remains, and this leads to checksum errors while reading left blocks queued up in the same bio, and then ends up with hiting the above BUG_ON. Reported-by: Chris Murphy Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1c70cff4a9e1..f34b1e262d4e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2532,6 +2532,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) uptodate = 0; + offset += len; continue; } } -- cgit v1.2.3 From f6dc45c7a93a011dff6eb9b2ffda59c390c7705a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 20 Aug 2014 07:15:33 -0700 Subject: Btrfs: fix filemap_flush call in btrfs_file_release We should only be flushing on close if the file was flagged as needing it during truncate. I broke this with my ordered data vs transaction commit deadlock fix. Thanks to Miao Xie for catching this. Signed-off-by: Chris Mason Reported-by: Miao Xie Reported-by: Fengguang Wu --- fs/btrfs/file.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f15c13f97018..36861b7a6757 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1840,7 +1840,15 @@ int btrfs_release_file(struct inode *inode, struct file *filp) { if (filp->private_data) btrfs_ioctl_trans_end(filp); - filemap_flush(inode->i_mapping); + /* + * ordered_data_close is set by settattr when we are about to truncate + * a file from a non-zero size to a zero size. This tries to + * flush down new bytes that may have been written if the + * application were using truncate to replace a file in place. + */ + if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); return 0; } -- cgit v1.2.3 From 6f12ac25f0167adb5d9ad5547fd6838380261e5c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 19 Aug 2014 09:48:22 -0700 Subject: f2fs: trigger release_dirty_inode in f2fs_put_super The generic_shutdown_super calls sync_filesystem, evict_inode, and then f2fs_put_super. In f2fs_evict_inode, we remain some dirty inode information so we should release them at f2fs_put_super. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/f2fs.h | 1 + fs/f2fs/super.c | 3 +++ 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 7e1c13bfb99f..f14af912a146 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -348,7 +348,7 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) return e ? true : false; } -static void release_dirty_inode(struct f2fs_sb_info *sbi) +void release_dirty_inode(struct f2fs_sb_info *sbi) { struct ino_entry *e, *tmp; int i; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cc5ead1bd378..cc89a7f490a6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1262,6 +1262,7 @@ int ra_meta_pages(struct f2fs_sb_info *, int, int, int); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); +void release_dirty_inode(struct f2fs_sb_info *); bool exist_written_data(struct f2fs_sb_info *, nid_t, int); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 60e3554a6eb4..7a5477915d99 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -435,6 +435,9 @@ static void f2fs_put_super(struct super_block *sb) if (sbi->s_dirty) write_checkpoint(sbi, true); + /* normally superblock is clean, so we need to release this */ + release_dirty_inode(sbi); + iput(sbi->node_inode); iput(sbi->meta_inode); -- cgit v1.2.3 From ed2e621a95d704e6a4e904cc00524e8cbddda0c2 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 8 Aug 2014 15:37:41 -0700 Subject: f2fs: give a chance to mount again when encountering errors This patch gives another chance to try mount process when we encounter an error. This makes an effect on the roll-forward recovery failures as well. Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7a5477915d99..e161e13958e2 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -902,8 +902,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) struct buffer_head *raw_super_buf; struct inode *root; long err = -EINVAL; + bool retry = true; int i; +try_onemore: /* allocate memory for f2fs-specific super block info */ sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); if (!sbi) @@ -1083,9 +1085,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { err = recover_fsync_data(sbi); - if (err) + if (err) { f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%ld", err); + goto free_kobj; + } } /* @@ -1126,6 +1130,13 @@ free_sb_buf: brelse(raw_super_buf); free_sbi: kfree(sbi); + + /* give only one another chance */ + if (retry) { + retry = !retry; + shrink_dcache_sb(sb); + goto try_onemore; + } return err; } -- cgit v1.2.3 From 1e968fdfe69e4060f05fa04059ecad93a0284e32 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 11 Aug 2014 16:49:25 -0700 Subject: f2fs: introduce f2fs_cp_error for readability This patch adds f2fs_cp_error for readability. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- fs/f2fs/f2fs.h | 5 +++++ fs/f2fs/gc.c | 2 +- fs/f2fs/super.c | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f14af912a146..6f38aad8e654 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -162,7 +162,7 @@ static int f2fs_write_meta_page(struct page *page, goto redirty_out; /* Should not write any meta pages, if any IO error was occurred */ - if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) + if (unlikely(f2fs_cp_error(sbi))) goto no_write; f2fs_wait_on_page_writeback(page, META); @@ -934,7 +934,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* Here, we only have one bio having CP pack */ sync_meta_pages(sbi, META_FLUSH, LONG_MAX); - if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { + if (!f2fs_cp_error(sbi)) { clear_prefree_segments(sbi); release_dirty_inode(sbi); F2FS_RESET_SB_DIRT(sbi); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cc89a7f490a6..2d009aed0c2e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1096,6 +1096,11 @@ static inline int f2fs_readonly(struct super_block *sb) return sb->s_flags & MS_RDONLY; } +static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) +{ + return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); +} + static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) { set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 87c50c507461..e8507b1c8759 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -693,7 +693,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi) gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; - if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) + if (unlikely(f2fs_cp_error(sbi))) goto stop; if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e161e13958e2..8aabe3ef42f6 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -815,7 +815,7 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; - if (unlikely(is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; } -- cgit v1.2.3 From 5274651927a76c947469a589e3d2a9adbd075da6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 11 Aug 2014 18:18:36 -0700 Subject: f2fs: unlock_page when node page is redirtied out This patch fixes missing unlock_page when a node page is redirtied out. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index c80e3d59314d..9f126f80813d 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1129,8 +1129,11 @@ continue_unlock: set_fsync_mark(page, 0); set_dentry_mark(page, 0); } - NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); - wrote++; + + if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) + unlock_page(page); + else + wrote++; if (--wbc->nr_to_write == 0) break; -- cgit v1.2.3 From 8501017e50fb7586ba522a2913ce664d6c2024f6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 11 Aug 2014 18:37:46 -0700 Subject: f2fs: check s_dirty under cp_mutex It needs to check s_dirty under cp_mutex, since s_dirty is reset under that mutex. And previous condition was not correct, since we can omit doing checkpoint when checkpoint was done followed by all the node pages were written back. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 ++++++-- fs/f2fs/super.c | 3 --- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6f38aad8e654..dc29b7837687 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -952,6 +952,10 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops"); mutex_lock(&sbi->cp_mutex); + + if (!sbi->s_dirty) + goto out; + block_operations(sbi); trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); @@ -976,9 +980,9 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) do_checkpoint(sbi, is_umount); unblock_operations(sbi); - mutex_unlock(&sbi->cp_mutex); - stat_inc_cp_count(sbi->stat_info); +out: + mutex_unlock(&sbi->cp_mutex); trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8aabe3ef42f6..e7a7b619ffd4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -460,9 +460,6 @@ int f2fs_sync_fs(struct super_block *sb, int sync) trace_f2fs_sync_fs(sb, sync); - if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES)) - return 0; - if (sync) { mutex_lock(&sbi->gc_mutex); write_checkpoint(sbi, false); -- cgit v1.2.3 From 27b7edcf1ce03a3eddda24d4d271a9b29572a78b Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 20 Aug 2014 19:39:28 +0900 Subject: cifs: fix a possible null pointer deref in decode_ascii_ssetup When kzalloc fails, we will end up doing NULL pointer derefrence Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: Steve French --- fs/cifs/sess.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 39ee32688eac..3a5e83317683 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -243,10 +243,11 @@ static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft, kfree(ses->serverOS); ses->serverOS = kzalloc(len + 1, GFP_KERNEL); - if (ses->serverOS) + if (ses->serverOS) { strncpy(ses->serverOS, bcc_ptr, len); - if (strncmp(ses->serverOS, "OS/2", 4) == 0) - cifs_dbg(FYI, "OS/2 server\n"); + if (strncmp(ses->serverOS, "OS/2", 4) == 0) + cifs_dbg(FYI, "OS/2 server\n"); + } bcc_ptr += len + 1; bleft -= len + 1; -- cgit v1.2.3 From d6ccf4997e62fb6629f9f003980dca5292138b7b Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 21 Aug 2014 19:11:20 +0900 Subject: cifs: fix memory leak when password is supplied multiple times Unlikely but possible. When password is supplied multiple times, we have to free the previous allocation. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: Steve French --- fs/cifs/connect.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 03ed8a09581c..36ca2045009b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1600,6 +1600,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, tmp_end++; if (!(tmp_end < end && tmp_end[1] == delim)) { /* No it is not. Set the password to NULL */ + kfree(vol->password); vol->password = NULL; break; } @@ -1637,6 +1638,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, options = end; } + kfree(vol->password); /* Now build new password string */ temp_len = strlen(value); vol->password = kzalloc(temp_len+1, GFP_KERNEL); -- cgit v1.2.3 From 7de975e349b295f387f34eed38f115223f17d5ee Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 20 Aug 2014 19:39:41 +0900 Subject: cifs: fix a possible use of uninit variable in SMB2_sess_setup In case of error, goto ssetup_exit can be hit and we could end up using uninitialized value of resp_buftype Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index fa0dd044213b..9df5d8effe47 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -530,7 +530,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, struct smb2_sess_setup_rsp *rsp = NULL; struct kvec iov[2]; int rc = 0; - int resp_buftype; + int resp_buftype = CIFS_NO_BUFFER; __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ struct TCP_Server_Info *server = ses->server; u16 blob_length = 0; -- cgit v1.2.3 From d4a029d21556437b09ffb3207cf2871651bec38f Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 20 Aug 2014 19:39:59 +0900 Subject: cifs: remove unneeded check of null checking in if condition Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 9df5d8effe47..cb39c51cd3e0 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1403,8 +1403,7 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, rsp = (struct smb2_close_rsp *)iov[0].iov_base; if (rc != 0) { - if (tcon) - cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE); + cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE); goto close_exit; } -- cgit v1.2.3 From cf779cab14d50a84b61399f758da269654b863db Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 11 Aug 2014 18:37:46 -0700 Subject: f2fs: handle EIO not to break fs consistency There are two rules when EIO is occurred. 1. don't write any checkpoint data to preserve the previous checkpoint 2. don't lose the cached dentry/node/meta pages So, at first, this patch adds set_page_dirty in f2fs_write_end_io's failure. Then, writing checkpoint/dentry/node blocks is not allowed. Note that, for the data pages, we can't just throw away by redirtying them. Otherwise, kworker can fall into infinite loop to flush them. (Ref. xfstests/019) Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 45 ++++++++++++++++++++++++++++++++------------- fs/f2fs/data.c | 11 ++++++++++- fs/f2fs/node.c | 2 ++ fs/f2fs/super.c | 5 ++++- 4 files changed, 48 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index dc29b7837687..c9c08d52ecfd 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -160,14 +160,11 @@ static int f2fs_write_meta_page(struct page *page, goto redirty_out; if (wbc->for_reclaim) goto redirty_out; - - /* Should not write any meta pages, if any IO error was occurred */ if (unlikely(f2fs_cp_error(sbi))) - goto no_write; + goto redirty_out; f2fs_wait_on_page_writeback(page, META); write_meta_page(sbi, page); -no_write: dec_page_count(sbi, F2FS_DIRTY_META); unlock_page(page); return 0; @@ -737,7 +734,7 @@ retry: /* * Freeze all the FS-operations for checkpoint. */ -static void block_operations(struct f2fs_sb_info *sbi) +static int block_operations(struct f2fs_sb_info *sbi) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, @@ -745,6 +742,7 @@ static void block_operations(struct f2fs_sb_info *sbi) .for_reclaim = 0, }; struct blk_plug plug; + int err = 0; blk_start_plug(&plug); @@ -754,6 +752,10 @@ retry_flush_dents: if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); sync_dirty_dir_inodes(sbi); + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto out; + } goto retry_flush_dents; } @@ -767,9 +769,16 @@ retry_flush_nodes: if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); sync_node_pages(sbi, 0, &wbc); + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_unlock_all(sbi); + err = -EIO; + goto out; + } goto retry_flush_nodes; } +out: blk_finish_plug(&plug); + return err; } static void unblock_operations(struct f2fs_sb_info *sbi) @@ -813,8 +822,11 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg)); /* Flush all the NAT/SIT pages */ - while (get_pages(sbi, F2FS_DIRTY_META)) + while (get_pages(sbi, F2FS_DIRTY_META)) { sync_meta_pages(sbi, META, LONG_MAX); + if (unlikely(f2fs_cp_error(sbi))) + return; + } next_free_nid(sbi, &last_nid); @@ -924,6 +936,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* wait for previous submitted node/meta pages writeback */ wait_on_all_pages_writeback(sbi); + if (unlikely(f2fs_cp_error(sbi))) + return; + filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); @@ -934,11 +949,13 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* Here, we only have one bio having CP pack */ sync_meta_pages(sbi, META_FLUSH, LONG_MAX); - if (!f2fs_cp_error(sbi)) { - clear_prefree_segments(sbi); - release_dirty_inode(sbi); - F2FS_RESET_SB_DIRT(sbi); - } + release_dirty_inode(sbi); + + if (unlikely(f2fs_cp_error(sbi))) + return; + + clear_prefree_segments(sbi); + F2FS_RESET_SB_DIRT(sbi); } /* @@ -955,8 +972,10 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) if (!sbi->s_dirty) goto out; - - block_operations(sbi); + if (unlikely(f2fs_cp_error(sbi))) + goto out; + if (block_operations(sbi)) + goto out; trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ac3ccc25386b..6ba52a393063 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -53,7 +53,7 @@ static void f2fs_write_end_io(struct bio *bio, int err) struct page *page = bvec->bv_page; if (unlikely(err)) { - SetPageError(page); + set_page_dirty(page); set_bit(AS_EIO, &page->mapping->flags); f2fs_stop_checkpoint(sbi); } @@ -836,10 +836,19 @@ write: /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; err = do_write_data_page(page, &fio); goto done; } + /* we should bypass data pages to proceed the kworkder jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + SetPageError(page); + unlock_page(page); + return 0; + } + if (!wbc->for_reclaim) need_balance_fs = true; else if (has_not_enough_free_secs(sbi, 0)) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 9f126f80813d..d2f784283425 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1215,6 +1215,8 @@ static int f2fs_write_node_page(struct page *page, if (unlikely(sbi->por_doing)) goto redirty_out; + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; f2fs_wait_on_page_writeback(page, NODE); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e7a7b619ffd4..ddb1e9d36363 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -435,7 +435,10 @@ static void f2fs_put_super(struct super_block *sb) if (sbi->s_dirty) write_checkpoint(sbi, true); - /* normally superblock is clean, so we need to release this */ + /* + * normally superblock is clean, so we need to release this. + * In addition, EIO will skip do checkpoint, we need this as well. + */ release_dirty_inode(sbi); iput(sbi->node_inode); -- cgit v1.2.3 From b3fe0a0da2cb18d3c94d5ddce836e7f889d7286b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 13 Aug 2014 10:45:41 -0700 Subject: f2fs: add WARN_ON in f2fs_bug_on This patch adds WARN_ON when f2fs_bug_on is disable to see kernel messages. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2d009aed0c2e..2723b2d45479 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -24,7 +24,7 @@ #define f2fs_bug_on(condition) BUG_ON(condition) #define f2fs_down_write(x, y) down_write_nest_lock(x, y) #else -#define f2fs_bug_on(condition) +#define f2fs_bug_on(condition) WARN_ON(condition) #define f2fs_down_write(x, y) down_write(x) #endif -- cgit v1.2.3 From 14f4e690857715d5e0cbe403b4cb8e8c904a6c15 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 13 Aug 2014 16:30:46 -0700 Subject: f2fs: prevent checkpoint during roll-forward Any checkpoint should not be done during the core roll-forward procedure. Especially, it includes error cases too. Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 7ca7aadaa607..d36ef35353b2 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -459,6 +459,9 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) /* step #1: find fsynced inode numbers */ sbi->por_doing = true; + /* prevent checkpoint */ + mutex_lock(&sbi->cp_mutex); + blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); err = find_fsync_dnodes(sbi, &inode_list); @@ -490,8 +493,13 @@ out: /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) sync_meta_pages(sbi, META, LONG_MAX); + set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); + mutex_unlock(&sbi->cp_mutex); } else if (need_writecp) { + mutex_unlock(&sbi->cp_mutex); write_checkpoint(sbi, false); + } else { + mutex_unlock(&sbi->cp_mutex); } return err; } -- cgit v1.2.3 From 764aa3e978020121cbb86111b5d8f42830015a06 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 14 Aug 2014 16:32:54 -0700 Subject: f2fs: avoid double lock in truncate_blocks The init_inode_metadata calls truncate_blocks when error is occurred. The callers holds f2fs_lock_op, so we should not call it again in truncate_blocks. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/dir.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 13 ++++++++----- fs/f2fs/inline.c | 2 +- 5 files changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6ba52a393063..76de83e25a89 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -936,7 +936,7 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) if (to > inode->i_size) { truncate_pagecache(inode, inode->i_size); - truncate_blocks(inode, inode->i_size); + truncate_blocks(inode, inode->i_size, true); } } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a69bbfa90c99..155fb056b7f1 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -391,7 +391,7 @@ put_error: error: /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ truncate_inode_pages(&inode->i_data, 0); - truncate_blocks(inode, 0); + truncate_blocks(inode, 0, false); remove_dirty_dir_inode(inode); remove_inode_page(inode); return ERR_PTR(err); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2723b2d45479..7f976c1d1723 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1122,7 +1122,7 @@ static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) */ int f2fs_sync_file(struct file *, loff_t, loff_t, int); void truncate_data_blocks(struct dnode_of_data *); -int truncate_blocks(struct inode *, u64); +int truncate_blocks(struct inode *, u64, bool); void f2fs_truncate(struct inode *); int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); int f2fs_setattr(struct dentry *, struct iattr *); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ecbdf6a6381c..a8e97f8ed163 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -422,7 +422,7 @@ out: f2fs_put_page(page, 1); } -int truncate_blocks(struct inode *inode, u64 from) +int truncate_blocks(struct inode *inode, u64 from, bool lock) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); unsigned int blocksize = inode->i_sb->s_blocksize; @@ -438,14 +438,16 @@ int truncate_blocks(struct inode *inode, u64 from) free_from = (pgoff_t) ((from + blocksize - 1) >> (sbi->log_blocksize)); - f2fs_lock_op(sbi); + if (lock) + f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); if (err) { if (err == -ENOENT) goto free_next; - f2fs_unlock_op(sbi); + if (lock) + f2fs_unlock_op(sbi); trace_f2fs_truncate_blocks_exit(inode, err); return err; } @@ -463,7 +465,8 @@ int truncate_blocks(struct inode *inode, u64 from) f2fs_put_dnode(&dn); free_next: err = truncate_inode_blocks(inode, free_from); - f2fs_unlock_op(sbi); + if (lock) + f2fs_unlock_op(sbi); done: /* lastly zero out the first data page */ truncate_partial_data_page(inode, from); @@ -480,7 +483,7 @@ void f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); - if (!truncate_blocks(inode, i_size_read(inode))) { + if (!truncate_blocks(inode, i_size_read(inode), true)) { inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 520758b91999..4d1f39f4583c 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -247,7 +247,7 @@ process_inline: update_inode(inode, ipage); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { - truncate_blocks(inode, 0); + truncate_blocks(inode, 0, false); set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); goto process_inline; } -- cgit v1.2.3 From 202095a7a0ec075b924cb15dde330bf76e485f61 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 15 Aug 2014 09:56:46 -0700 Subject: f2fs: remove rewrite_node_page I think we need to let the dirty node pages remain in the page cache instead of rewriting them in their places. So, after done with successful recovery, write_checkpoint will flush all of them through the normal write path. Through this, we can avoid potential error cases in terms of block allocation. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 4 ---- fs/f2fs/node.c | 9 --------- fs/f2fs/recovery.c | 2 -- fs/f2fs/segment.c | 49 ------------------------------------------------- 4 files changed, 64 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7f976c1d1723..e921242186f6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1207,8 +1207,6 @@ int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); -void recover_node_page(struct f2fs_sb_info *, struct page *, - struct f2fs_summary *, struct node_info *, block_t); void recover_inline_xattr(struct inode *, struct page *); void recover_xattr_data(struct inode *, struct page *, block_t); int recover_inode_page(struct f2fs_sb_info *, struct page *); @@ -1243,8 +1241,6 @@ void write_data_page(struct page *, struct dnode_of_data *, block_t *, void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *); void recover_data_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, block_t, block_t); -void rewrite_node_page(struct f2fs_sb_info *, struct page *, - struct f2fs_summary *, block_t, block_t); void allocate_data_block(struct f2fs_sb_info *, struct page *, block_t, block_t *, struct f2fs_summary *, int); void f2fs_wait_on_page_writeback(struct page *, enum page_type); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d2f784283425..b4d964029fc7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1545,15 +1545,6 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } -void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, - struct f2fs_summary *sum, struct node_info *ni, - block_t new_blkaddr) -{ - rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr); - set_node_addr(sbi, ni, new_blkaddr, false); - clear_node_page_dirty(page); -} - void recover_inline_xattr(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d36ef35353b2..756c41cd2582 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -371,8 +371,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, fill_node_footer(dn.node_page, dn.nid, ni.ino, ofs_of_node(page), false); set_page_dirty(dn.node_page); - - recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr); err: f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 31b630efe4db..0aa337cd5bba 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1103,55 +1103,6 @@ void recover_data_page(struct f2fs_sb_info *sbi, mutex_unlock(&curseg->curseg_mutex); } -void rewrite_node_page(struct f2fs_sb_info *sbi, - struct page *page, struct f2fs_summary *sum, - block_t old_blkaddr, block_t new_blkaddr) -{ - struct sit_info *sit_i = SIT_I(sbi); - int type = CURSEG_WARM_NODE; - struct curseg_info *curseg; - unsigned int segno, old_cursegno; - block_t next_blkaddr = next_blkaddr_of_node(page); - unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr); - struct f2fs_io_info fio = { - .type = NODE, - .rw = WRITE_SYNC, - }; - - curseg = CURSEG_I(sbi, type); - - mutex_lock(&curseg->curseg_mutex); - mutex_lock(&sit_i->sentry_lock); - - segno = GET_SEGNO(sbi, new_blkaddr); - old_cursegno = curseg->segno; - - /* change the current segment */ - if (segno != curseg->segno) { - curseg->next_segno = segno; - change_curseg(sbi, type, true); - } - curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); - __add_sum_entry(sbi, type, sum); - - /* change the current log to the next block addr in advance */ - if (next_segno != segno) { - curseg->next_segno = next_segno; - change_curseg(sbi, type, true); - } - curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr); - - /* rewrite node page */ - set_page_writeback(page); - f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio); - f2fs_submit_merged_bio(sbi, NODE, WRITE); - refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); - locate_dirty_segment(sbi, old_cursegno); - - mutex_unlock(&sit_i->sentry_lock); - mutex_unlock(&curseg->curseg_mutex); -} - static inline bool is_merged_page(struct f2fs_sb_info *sbi, struct page *page, enum page_type type) { -- cgit v1.2.3 From ec4e7af4ca04ff81f786554d670876f1037a9ded Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 18 Aug 2014 14:41:11 -0700 Subject: f2fs: skip if inline_data was converted already This patch checks inline_data one more time under the inode page lock whether its inline_data is converted or not. Signed-off-by: Jaegeuk Kim --- fs/f2fs/inline.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 4d1f39f4583c..3e8ecdf3742b 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -68,7 +68,7 @@ out: static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) { - int err; + int err = 0; struct page *ipage; struct dnode_of_data dn; void *src_addr, *dst_addr; @@ -86,6 +86,10 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) goto out; } + /* someone else converted inline_data already */ + if (!f2fs_has_inline_data(inode)) + goto out; + /* * i_addr[0] is not used for inline data, * so reserving new block will not destroy inline data -- cgit v1.2.3 From 04859dba50e6cd85c5d683d06010c5eafb27c893 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 19 Aug 2014 14:14:27 -0700 Subject: f2fs: remove rename and use rename2 Refer the following patch. commit 7177a9c4b509eb357cc450256bc3cf39f1a1e639 Author: Miklos Szeredi Date: Wed Jul 23 15:15:30 2014 +0200 fs: call rename2 if exists Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index eb407d207289..6b53ce924d95 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -704,7 +704,6 @@ const struct inode_operations f2fs_dir_inode_operations = { .mkdir = f2fs_mkdir, .rmdir = f2fs_rmdir, .mknod = f2fs_mknod, - .rename = f2fs_rename, .rename2 = f2fs_rename2, .tmpfile = f2fs_tmpfile, .getattr = f2fs_getattr, -- cgit v1.2.3 From c200b1aa6cb460ce8c3ecf6fdc690d3949c3cc5d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 20 Aug 2014 18:36:46 +0800 Subject: f2fs: fix incorrect calculation with total/free inode num Theoretically, our total inodes number is the same as total node number, but there are three node ids are reserved in f2fs, they are 0, 1 (node nid), and 2 (meta nid), and they should never be used by user, so our total/free inode number calculated in ->statfs is wrong. This patch indroduces F2FS_RESERVED_NODE_NUM and then fixes this issue by recalculating total/free inode number with the macro. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- fs/f2fs/super.c | 4 ++-- include/linux/f2fs_fs.h | 3 +++ 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b4d964029fc7..044395c20ee9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1957,7 +1957,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; /* not used nids: 0, node, meta, (and root counted as valid node) */ - nm_i->available_nids = nm_i->max_nid - 3; + nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM; nm_i->fcnt = 0; nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ddb1e9d36363..0f61f5aa587c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -508,8 +508,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; buf->f_bavail = user_block_count - valid_user_blocks(sbi); - buf->f_files = sbi->total_node_count; - buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi); + buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + buf->f_ffree = buf->f_files - valid_inode_count(sbi); buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 6ff0b0b42d47..0ed77f32c9ac 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -24,6 +24,9 @@ #define NULL_ADDR ((block_t)0) /* used as block_t addresses */ #define NEW_ADDR ((block_t)-1) /* used as block_t addresses */ +/* 0, 1(node nid), 2(meta nid) are reserved node id */ +#define F2FS_RESERVED_NODE_NUM 3 + #define F2FS_ROOT_INO(sbi) (sbi->root_ino_num) #define F2FS_NODE_INO(sbi) (sbi->node_ino_num) #define F2FS_META_INO(sbi) (sbi->meta_ino_num) -- cgit v1.2.3 From 9d1589ef2edeb4565161771bd0faf3fd5288844b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 20 Aug 2014 18:37:35 +0800 Subject: f2fs: introduce need_do_checkpoint for readability This patch introduce need_do_checkpoint() to include numerous judgment condition for readability. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a8e97f8ed163..060aee65aee8 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -115,6 +115,25 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) return 1; } +static inline bool need_do_checkpoint(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + bool need_cp = false; + + if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) + need_cp = true; + else if (file_wrong_pino(inode)) + need_cp = true; + else if (!space_for_roll_forward(sbi)) + need_cp = true; + else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) + need_cp = true; + else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) + need_cp = true; + + return need_cp; +} + int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; @@ -159,23 +178,12 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) /* guarantee free sections for fsync */ f2fs_balance_fs(sbi); - down_read(&fi->i_sem); - /* * Both of fdatasync() and fsync() are able to be recovered from * sudden-power-off. */ - if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) - need_cp = true; - else if (file_wrong_pino(inode)) - need_cp = true; - else if (!space_for_roll_forward(sbi)) - need_cp = true; - else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) - need_cp = true; - else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) - need_cp = true; - + down_read(&fi->i_sem); + need_cp = need_do_checkpoint(inode); up_read(&fi->i_sem); if (need_cp) { -- cgit v1.2.3 From 787aded65044e4cabefcf7eb7576c2dd6b151468 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Fri, 22 Aug 2014 14:22:51 +0900 Subject: cifs: Allow directIO read/write during cache=strict Currently cifs have all or nothing approach for directIO operations. cache=strict mode does not allow directIO while cache=none mode performs all the operations as directIO even when user does not specify O_DIRECT flag. This patch enables strict cache mode to honour directIO semantics. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: Steve French --- fs/cifs/dir.c | 8 ++++++++ fs/cifs/file.c | 8 ++++++++ 2 files changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 3db0c5fd9a11..6cbd9c688cfe 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -497,6 +497,14 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, goto out; } + if (file->f_flags & O_DIRECT && + CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) { + if (CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + file->f_op = &cifs_file_direct_nobrl_ops; + else + file->f_op = &cifs_file_direct_ops; + } + file_info = cifs_new_fileinfo(&fid, file, tlink, oplock); if (file_info == NULL) { if (server->ops->close) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index d5fec92e0360..7c018a1c52f7 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -467,6 +467,14 @@ int cifs_open(struct inode *inode, struct file *file) cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n", inode, file->f_flags, full_path); + if (file->f_flags & O_DIRECT && + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + file->f_op = &cifs_file_direct_nobrl_ops; + else + file->f_op = &cifs_file_direct_ops; + } + if (server->oplocks) oplock = REQ_OPLOCK; else -- cgit v1.2.3 From 52a36244443eabb594bdb63622ff2dd7a083f0e2 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 20 Aug 2014 19:39:17 +0900 Subject: cifs: No need to send SIGKILL to demux_thread during umount There is no need to explicitly send SIGKILL to cifs_demultiplex_thread as it is calling module_put_and_exit to exit cleanly. socket sk_rcvtimeo is set to 7 HZ so the thread will wake up in 7 seconds and clean itself. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 36ca2045009b..8a9fded7c135 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -837,7 +837,6 @@ cifs_demultiplex_thread(void *p) struct TCP_Server_Info *server = p; unsigned int pdu_length; char *buf = NULL; - struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; current->flags |= PF_MEMALLOC; @@ -928,19 +927,7 @@ cifs_demultiplex_thread(void *p) if (server->smallbuf) /* no sense logging a debug message if NULL */ cifs_small_buf_release(server->smallbuf); - task_to_wake = xchg(&server->tsk, NULL); clean_demultiplex_info(server); - - /* if server->tsk was NULL then wait for a signal before exiting */ - if (!task_to_wake) { - set_current_state(TASK_INTERRUPTIBLE); - while (!signal_pending(current)) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - } - set_current_state(TASK_RUNNING); - } - module_put_and_exit(0); } @@ -2063,8 +2050,6 @@ cifs_find_tcp_session(struct smb_vol *vol) static void cifs_put_tcp_session(struct TCP_Server_Info *server) { - struct task_struct *task; - spin_lock(&cifs_tcp_ses_lock); if (--server->srv_count > 0) { spin_unlock(&cifs_tcp_ses_lock); @@ -2088,10 +2073,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server) kfree(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; - - task = xchg(&server->tsk, NULL); - if (task) - force_sig(SIGKILL, task); } static struct TCP_Server_Info * -- cgit v1.2.3 From a07d322059db66b84c9eb4f98959df468e88b34b Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 22 Aug 2014 13:32:09 +0400 Subject: CIFS: Fix directory rename error CIFS servers process nlink counts differently for files and directories. In cifs_rename() if we the request fails on the existing target, we try to remove it through cifs_unlink() but this is not what we want to do for directories. As the result the following sequence of commands mkdir {1,2}; mv -T 1 2; rmdir {1,2}; mkdir {1,2}; echo foo > 2/bar and XFS test generic/023 fail with -ENOENT error. That's why the second mkdir reuses the existing inode (target inode of the mv -T command) with S_DEAD flag. Fix this by checking whether the target is directory or not and calling cifs_rmdir() rather than cifs_unlink() for directories. Cc: Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 949ec909ec9a..7899a40465b3 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1720,7 +1720,10 @@ cifs_rename2(struct inode *source_dir, struct dentry *source_dentry, unlink_target: /* Try unlinking the target dentry if it's not negative */ if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) { - tmprc = cifs_unlink(target_dir, target_dentry); + if (d_is_dir(target_dentry)) + tmprc = cifs_rmdir(target_dir, target_dentry); + else + tmprc = cifs_unlink(target_dir, target_dentry); if (tmprc) goto cifs_rename_exit; rc = cifs_do_rename(xid, source_dentry, from_name, -- cgit v1.2.3 From e0b760ff71be168d4e623f7c3612e98902ab93e9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 22 Aug 2014 09:58:22 -0400 Subject: locks: pass correct "before" pointer to locks_unlink_lock in generic_add_lease The argument to locks_unlink_lock can't be just any pointer to a pointer. It must be a pointer to the fl_next field in the previous lock in the list. Cc: # v3.15+ Signed-off-by: Jeff Layton Reviewed-by: Christoph Hellwig --- fs/locks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index cb66fb05ad4a..bb08857f90b5 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1619,7 +1619,7 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp smp_mb(); error = check_conflicting_open(dentry, arg); if (error) - locks_unlink_lock(flp); + locks_unlink_lock(before); out: if (is_deleg) mutex_unlock(&inode->i_mutex); -- cgit v1.2.3 From b5b822050ca3c4fc1f475100cc197cc00ba2d492 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 22 Aug 2014 16:17:38 +0800 Subject: f2fs: use macro for code readability This patch introduces DEF_NIDS_PER_INODE/GET_ORPHAN_BLOCKS/F2FS_CP_PACKS macro instead of numbers in code for readability. change log from v1: o fix typo pointed out by Jaegeuk Kim. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 21 ++++++++++----------- include/linux/f2fs_fs.h | 13 ++++++++++--- 2 files changed, 20 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index c9c08d52ecfd..ec3b7a5381fa 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -443,8 +443,8 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) struct f2fs_orphan_block *orphan_blk = NULL; unsigned int nentries = 0; unsigned short index; - unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans + - (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); + unsigned short orphan_blocks = + (unsigned short)GET_ORPHAN_BLOCKS(sbi->n_orphans); struct page *page = NULL; struct ino_entry *orphan = NULL; @@ -837,7 +837,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); - for (i = 0; i < 3; i++) { + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { ckpt->cur_node_segno[i] = cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE)); ckpt->cur_node_blkoff[i] = @@ -845,7 +845,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) ckpt->alloc_type[i + CURSEG_HOT_NODE] = curseg_alloc_type(sbi, i + CURSEG_HOT_NODE); } - for (i = 0; i < 3; i++) { + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { ckpt->cur_data_segno[i] = cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA)); ckpt->cur_data_blkoff[i] = @@ -860,24 +860,23 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi); - if (data_sum_blocks < 3) + if (data_sum_blocks < NR_CURSEG_DATA_TYPE) set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); - orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1) - / F2FS_ORPHANS_PER_BLOCK; + orphan_blocks = GET_ORPHAN_BLOCKS(sbi->n_orphans); ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + orphan_blocks); if (is_umount) { set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - ckpt->cp_pack_total_block_count = cpu_to_le32(2 + + ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+ cp_payload_blks + data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE); } else { clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - ckpt->cp_pack_total_block_count = cpu_to_le32(2 + + ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS + cp_payload_blks + data_sum_blocks + orphan_blocks); } @@ -1022,8 +1021,8 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi) * for cp pack we can have max 1020*504 orphan entries */ sbi->n_orphans = 0; - sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE) - * F2FS_ORPHANS_PER_BLOCK; + sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS - + NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK; } int __init create_checkpoint_caches(void) diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 0ed77f32c9ac..08ed2b0a96e6 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -90,6 +90,8 @@ struct f2fs_super_block { #define CP_ORPHAN_PRESENT_FLAG 0x00000002 #define CP_UMOUNT_FLAG 0x00000001 +#define F2FS_CP_PACKS 2 /* # of checkpoint packs */ + struct f2fs_checkpoint { __le64 checkpoint_ver; /* checkpoint block version number */ __le64 user_block_count; /* # of user blocks */ @@ -126,6 +128,9 @@ struct f2fs_checkpoint { */ #define F2FS_ORPHANS_PER_BLOCK 1020 +#define GET_ORPHAN_BLOCKS(n) ((n + F2FS_ORPHANS_PER_BLOCK - 1) / \ + F2FS_ORPHANS_PER_BLOCK) + struct f2fs_orphan_block { __le32 ino[F2FS_ORPHANS_PER_BLOCK]; /* inode numbers */ __le32 reserved; /* reserved */ @@ -147,6 +152,7 @@ struct f2fs_extent { #define F2FS_NAME_LEN 255 #define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */ #define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */ +#define DEF_NIDS_PER_INODE 5 /* Node IDs in an Inode */ #define ADDRS_PER_INODE(fi) addrs_per_inode(fi) #define ADDRS_PER_BLOCK 1018 /* Address Pointers in a Direct Block */ #define NIDS_PER_BLOCK 1018 /* Node IDs in an Indirect Block */ @@ -166,8 +172,9 @@ struct f2fs_extent { #define MAX_INLINE_DATA (sizeof(__le32) * (DEF_ADDRS_PER_INODE - \ F2FS_INLINE_XATTR_ADDRS - 1)) -#define INLINE_DATA_OFFSET (PAGE_CACHE_SIZE - sizeof(struct node_footer) \ - - sizeof(__le32) * (DEF_ADDRS_PER_INODE + 5 - 1)) +#define INLINE_DATA_OFFSET (PAGE_CACHE_SIZE - sizeof(struct node_footer) -\ + sizeof(__le32) * (DEF_ADDRS_PER_INODE + \ + DEF_NIDS_PER_INODE - 1)) struct f2fs_inode { __le16 i_mode; /* file mode */ @@ -197,7 +204,7 @@ struct f2fs_inode { __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ - __le32 i_nid[5]; /* direct(2), indirect(2), + __le32 i_nid[DEF_NIDS_PER_INODE]; /* direct(2), indirect(2), double_indirect(1) node id */ } __packed; -- cgit v1.2.3 From fd2f3a06d30c85a17cf035ebc60c88c2a13a8ece Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Fri, 8 Aug 2014 11:00:53 -0400 Subject: nfs: change nfs_page_group_lock argument Flip the meaning of the second argument from 'wait' to 'nonblock' to match related functions. Update all five calls to reflect this change. Signed-off-by: Weston Andros Adamson Reviewed-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 11 ++++++----- fs/nfs/write.c | 4 ++-- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index ba491926df5f..7efa61586bc4 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -139,13 +139,14 @@ nfs_iocounter_wait(struct nfs_io_counter *c) /* * nfs_page_group_lock - lock the head of the page group * @req - request in group that is to be locked + * @nonblock - if true don't block waiting for lock * * this lock must be held if modifying the page group list * * returns result from wait_on_bit_lock: 0 on success, < 0 on error */ int -nfs_page_group_lock(struct nfs_page *req, bool wait) +nfs_page_group_lock(struct nfs_page *req, bool nonblock) { struct nfs_page *head = req->wb_head; int ret; @@ -155,7 +156,7 @@ nfs_page_group_lock(struct nfs_page *req, bool wait) do { ret = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, TASK_UNINTERRUPTIBLE); - } while (wait && ret != 0); + } while (!nonblock && ret != 0); WARN_ON_ONCE(ret > 0); return ret; @@ -219,7 +220,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit) { bool ret; - nfs_page_group_lock(req, true); + nfs_page_group_lock(req, false); ret = nfs_page_group_sync_on_bit_locked(req, bit); nfs_page_group_unlock(req); @@ -860,7 +861,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, unsigned int offset, pgbase; int ret; - ret = nfs_page_group_lock(req, false); + ret = nfs_page_group_lock(req, true); if (ret < 0) { desc->pg_error = ret; return 0; @@ -886,7 +887,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (desc->pg_recoalesce) return 0; /* retry add_request for this subreq */ - ret = nfs_page_group_lock(req, false); + ret = nfs_page_group_lock(req, true); if (ret < 0) { desc->pg_error = ret; return 0; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e3b5cf28bdc5..27715306f24b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -241,7 +241,7 @@ static bool nfs_page_group_covers_page(struct nfs_page *req) unsigned int pos = 0; unsigned int len = nfs_page_length(req->wb_page); - nfs_page_group_lock(req, true); + nfs_page_group_lock(req, false); do { tmp = nfs_page_group_search_locked(req->wb_head, pos); @@ -479,7 +479,7 @@ try_again: } /* lock each request in the page group */ - ret = nfs_page_group_lock(head, false); + ret = nfs_page_group_lock(head, true); if (ret < 0) return ERR_PTR(ret); subreq = head; -- cgit v1.2.3 From bc8a309e88a86205fc3e17f06e42a2e56fc6f807 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Fri, 8 Aug 2014 11:00:54 -0400 Subject: nfs: fix nonblocking calls to nfs_page_group_lock nfs_page_group_lock was calling wait_on_bit_lock even when told not to block. Fix by first trying test_and_set_bit, followed by wait_on_bit_lock if and only if blocking is allowed. Return -EAGAIN if nonblocking and the test_and_set of the bit was already locked. Signed-off-by: Weston Andros Adamson Reviewed-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 7efa61586bc4..89d5d433e351 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -143,23 +143,28 @@ nfs_iocounter_wait(struct nfs_io_counter *c) * * this lock must be held if modifying the page group list * - * returns result from wait_on_bit_lock: 0 on success, < 0 on error + * return 0 on success, < 0 on error: -EDELAY if nonblocking or the + * result from wait_on_bit_lock + * + * NOTE: calling with nonblock=false should always have set the + * lock bit (see fs/buffer.c and other uses of wait_on_bit_lock + * with TASK_UNINTERRUPTIBLE), so there is no need to check the result. */ int nfs_page_group_lock(struct nfs_page *req, bool nonblock) { struct nfs_page *head = req->wb_head; - int ret; WARN_ON_ONCE(head != head->wb_head); - do { - ret = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, - TASK_UNINTERRUPTIBLE); - } while (!nonblock && ret != 0); + if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags)) + return 0; - WARN_ON_ONCE(ret > 0); - return ret; + if (!nonblock) + return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, + TASK_UNINTERRUPTIBLE); + + return -EAGAIN; } /* -- cgit v1.2.3 From bfd484a5606d6a0379a0a2f04251b1e5c1f8995c Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Fri, 8 Aug 2014 11:00:55 -0400 Subject: nfs: use blocking page_group_lock in add_request __nfs_pageio_add_request was calling nfs_page_group_lock nonblocking, but this can return -EAGAIN which would end up passing -EIO to the application. There is no reason not to block in this path, so change the two calls to do so. Also, there is no need to check the return value of nfs_page_group_lock when nonblock=false, so remove the error handling code. Signed-off-by: Weston Andros Adamson Reviewed-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 89d5d433e351..30c9626f96b0 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -864,13 +864,8 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *subreq; unsigned int bytes_left = 0; unsigned int offset, pgbase; - int ret; - ret = nfs_page_group_lock(req, true); - if (ret < 0) { - desc->pg_error = ret; - return 0; - } + nfs_page_group_lock(req, false); subreq = req; bytes_left = subreq->wb_bytes; @@ -892,11 +887,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (desc->pg_recoalesce) return 0; /* retry add_request for this subreq */ - ret = nfs_page_group_lock(req, true); - if (ret < 0) { - desc->pg_error = ret; - return 0; - } + nfs_page_group_lock(req, false); continue; } -- cgit v1.2.3 From 94970014c46223cbcdfbfc67b89596a412f9e3dd Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Fri, 8 Aug 2014 11:00:56 -0400 Subject: nfs: fix error handling in lock_and_join_requests This fixes handling of errors from nfs_page_group_lock in nfs_lock_and_join_requests. It now releases the inode lock and the reference to the head request. Reported-by: Peng Tao Signed-off-by: Weston Andros Adamson Reviewed-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 27715306f24b..e056f617adf2 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -480,8 +480,11 @@ try_again: /* lock each request in the page group */ ret = nfs_page_group_lock(head, true); - if (ret < 0) + if (ret < 0) { + spin_unlock(&inode->i_lock); + nfs_release_request(head); return ERR_PTR(ret); + } subreq = head; do { /* -- cgit v1.2.3 From 7c3af975257383ece54b83c0505d3e0656cb7daf Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Fri, 8 Aug 2014 11:00:57 -0400 Subject: nfs: don't sleep with inode lock in lock_and_join_requests This handles the 'nonblock=false' case in nfs_lock_and_join_requests. If the group is already locked and blocking is allowed, drop the inode lock and wait for the group lock to be cleared before trying it all again. This should fix warnings found in peterz's tree (sched/wait branch), where might_sleep() checks are added to wait.[ch]. Reported-by: Fengguang Wu Signed-off-by: Weston Andros Adamson Reviewed-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 17 +++++++++++++++++ fs/nfs/write.c | 12 +++++++++++- include/linux/nfs_page.h | 1 + 3 files changed, 29 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 30c9626f96b0..4ec67f8d70aa 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -167,6 +167,23 @@ nfs_page_group_lock(struct nfs_page *req, bool nonblock) return -EAGAIN; } +/* + * nfs_page_group_lock_wait - wait for the lock to clear, but don't grab it + * @req - a request in the group + * + * This is a blocking call to wait for the group lock to be cleared. + */ +void +nfs_page_group_lock_wait(struct nfs_page *req) +{ + struct nfs_page *head = req->wb_head; + + WARN_ON_ONCE(head != head->wb_head); + + wait_on_bit(&head->wb_flags, PG_HEADLOCK, + TASK_UNINTERRUPTIBLE); +} + /* * nfs_page_group_unlock - unlock the head of the page group * @req - request in group that is to be unlocked diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e056f617adf2..175d5d073ccf 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -478,13 +478,23 @@ try_again: return NULL; } - /* lock each request in the page group */ + /* holding inode lock, so always make a non-blocking call to try the + * page group lock */ ret = nfs_page_group_lock(head, true); if (ret < 0) { spin_unlock(&inode->i_lock); + + if (!nonblock && ret == -EAGAIN) { + nfs_page_group_lock_wait(head); + nfs_release_request(head); + goto try_again; + } + nfs_release_request(head); return ERR_PTR(ret); } + + /* lock each request in the page group */ subreq = head; do { /* diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 6ad2bbcad405..6c3e06ee2fb7 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -123,6 +123,7 @@ extern int nfs_wait_on_request(struct nfs_page *); extern void nfs_unlock_request(struct nfs_page *req); extern void nfs_unlock_and_release_request(struct nfs_page *); extern int nfs_page_group_lock(struct nfs_page *, bool); +extern void nfs_page_group_lock_wait(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); -- cgit v1.2.3 From bba5c1887a925a9945d22217d38d58d8b3ba1043 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 14 Aug 2014 17:39:32 -0400 Subject: nfs: disallow duplicate pages in pgio page vectors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adjacent requests that share the same page are allowed, but should only use one entry in the page vector. This avoids overruning the page vector - it is sized based on how many bytes there are, not by request count. This fixes issues that manifest as "Redzone overwritten" bugs (the vector overrun) and hangs waiting on page read / write, as it waits on the same page more than once. This also adds bounds checking to the page vector with a graceful failure (WARN_ON_ONCE and pgio error returned to application). Reported-by: Toralf Förster Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 4ec67f8d70aa..a1d1de7b97c5 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -724,10 +724,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { struct nfs_page *req; - struct page **pages; + struct page **pages, + *last_page; struct list_head *head = &desc->pg_list; struct nfs_commit_info cinfo; - unsigned int pagecount; + unsigned int pagecount, pageused; pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count); if (!nfs_pgarray_set(&hdr->page_array, pagecount)) @@ -735,12 +736,23 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); pages = hdr->page_array.pagevec; + last_page = NULL; + pageused = 0; while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_list_add_request(req, &hdr->pages); - *pages++ = req->wb_page; + + if (WARN_ON_ONCE(pageused >= pagecount)) + return nfs_pgio_error(desc, hdr); + + if (!last_page || last_page != req->wb_page) { + *pages++ = last_page = req->wb_page; + pageused++; + } } + if (WARN_ON_ONCE(pageused != pagecount)) + return nfs_pgio_error(desc, hdr); if ((desc->pg_ioflags & FLUSH_COND_STABLE) && (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) -- cgit v1.2.3 From 78270e8fbc2916bfc8305b8f58f33474cce1ec0e Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 14 Aug 2014 17:39:33 -0400 Subject: nfs: can_coalesce_requests must enforce contiguity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 6094f83864c1d1296566a282cba05ba613f151ee "nfs: allow coalescing of subpage requests" got rid of the requirement that requests cover whole pages, but it made some incorrect assumptions. It turns out that callers of this interface can map adjacent requests (by file position as seen by req_offset + req->wb_bytes) to different pages, even when they could share a page. An example is the direct I/O interface - iov_iter_get_pages_alloc may return one segment with a partial page filled and the next segment (which is adjacent in the file position) starts with a new page. Reported-by: Toralf Förster Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index a1d1de7b97c5..932c6cc27a76 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -823,6 +823,14 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, return false; if (req_offset(req) != req_offset(prev) + prev->wb_bytes) return false; + if (req->wb_page == prev->wb_page) { + if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes) + return false; + } else { + if (req->wb_pgbase != 0 || + prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) + return false; + } } size = pgio->pg_ops->pg_test(pgio, prev, req); WARN_ON_ONCE(size > req->wb_bytes); -- cgit v1.2.3 From 92a56555bd576c61b27a5cab9f38a33a1e9a1df5 Mon Sep 17 00:00:00 2001 From: David Jeffery Date: Tue, 5 Aug 2014 11:19:42 -0400 Subject: nfs: Don't busy-wait on SIGKILL in __nfs_iocounter_wait If a SIGKILL is sent to a task waiting in __nfs_iocounter_wait, it will busy-wait or soft lockup in its while loop. nfs_wait_bit_killable won't sleep, and the loop won't exit on the error return. Stop the busy-wait by breaking out of the loop when nfs_wait_bit_killable returns an error. Signed-off-by: David Jeffery Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 932c6cc27a76..be7cbce6e4c7 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -116,7 +116,7 @@ __nfs_iocounter_wait(struct nfs_io_counter *c) if (atomic_read(&c->io_count) == 0) break; ret = nfs_wait_bit_killable(&q.key); - } while (atomic_read(&c->io_count) != 0); + } while (atomic_read(&c->io_count) != 0 && !ret); finish_wait(wq, &q.wait); return ret; } -- cgit v1.2.3 From 36de928641ee48b2078d3fe9514242aaa2f92013 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 23 Aug 2014 17:47:19 -0400 Subject: ext4: propagate errors up to ext4_find_entry()'s callers If we run into some kind of error, such as ENOMEM, while calling ext4_getblk() or ext4_dx_find_entry(), we need to make sure this error gets propagated up to ext4_find_entry() and then to its callers. This way, transient errors such as ENOMEM can get propagated to the VFS. This is important so that the system calls return the appropriate error, and also so that in the case of ext4_lookup(), we return an error instead of a NULL inode, since that will result in a negative dentry cache entry that will stick around long past the OOM condition which caused a transient ENOMEM error. Google-Bug-Id: #17142205 Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/ext4.h | 2 +- fs/ext4/namei.c | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5b19760b1de5..4d95c3301775 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1825,7 +1825,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) /* * Special error return code only used by dx_probe() and its callers. */ -#define ERR_BAD_DX_DIR -75000 +#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) /* * Timeout and state flag for lazy initialization inode thread. diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b147a67baa0d..ae7088b446d1 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1227,7 +1227,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, buffer */ int num = 0; ext4_lblk_t nblocks; - int i, err; + int i, err = 0; int namelen; *res_dir = NULL; @@ -1264,7 +1264,11 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, * return. Otherwise, fall back to doing a search the * old fashioned way. */ - if (bh || (err != ERR_BAD_DX_DIR)) + if (err == -ENOENT) + return NULL; + if (err && err != ERR_BAD_DX_DIR) + return ERR_PTR(err); + if (bh) return bh; dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " "falling back\n")); @@ -1295,6 +1299,11 @@ restart: } num++; bh = ext4_getblk(NULL, dir, b++, 0, &err); + if (unlikely(err)) { + if (ra_max == 0) + return ERR_PTR(err); + break; + } bh_use[ra_max] = bh; if (bh) ll_rw_block(READ | REQ_META | REQ_PRIO, @@ -1417,6 +1426,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi return ERR_PTR(-ENAMETOOLONG); bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + if (IS_ERR(bh)) + return (struct dentry *) bh; inode = NULL; if (bh) { __u32 ino = le32_to_cpu(de->inode); @@ -1450,6 +1461,8 @@ struct dentry *ext4_get_parent(struct dentry *child) struct buffer_head *bh; bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); + if (IS_ERR(bh)) + return (struct dentry *) bh; if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); @@ -2727,6 +2740,8 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + if (IS_ERR(bh)) + return PTR_ERR(bh); if (!bh) goto end_rmdir; @@ -2794,6 +2809,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); + if (IS_ERR(bh)) + return PTR_ERR(bh); if (!bh) goto end_unlink; @@ -3121,6 +3138,8 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de; bh = ext4_find_entry(dir, d_name, &de, NULL); + if (IS_ERR(bh)) + return PTR_ERR(bh); if (bh) { retval = ext4_delete_entry(handle, dir, de, bh); brelse(bh); @@ -3202,6 +3221,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, dquot_initialize(new.inode); old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); + if (IS_ERR(old.bh)) + return PTR_ERR(old.bh); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process @@ -3214,6 +3235,10 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); + if (IS_ERR(new.bh)) { + retval = PTR_ERR(new.bh); + goto end_rename; + } if (new.bh) { if (!new.inode) { brelse(new.bh); @@ -3330,6 +3355,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, &old.inlined); + if (IS_ERR(old.bh)) + return PTR_ERR(old.bh); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process @@ -3342,6 +3369,10 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); + if (IS_ERR(new.bh)) { + retval = PTR_ERR(new.bh); + goto end_rename; + } /* RENAME_EXCHANGE case: old *and* new must both exist */ if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino) -- cgit v1.2.3 From c99d1e6e83b06744c75d9f5e491ed495a7086b7b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 23 Aug 2014 17:47:28 -0400 Subject: ext4: fix BUG_ON in mb_free_blocks() If we suffer a block allocation failure (for example due to a memory allocation failure), it's possible that we will call ext4_discard_allocated_blocks() before we've actually allocated any blocks. In that case, fe_len and fe_start in ac->ac_f_ex will still be zero, and this will result in mb_free_blocks(inode, e4b, 0, 0) triggering the BUG_ON on mb_free_blocks(): BUG_ON(last >= (sb->s_blocksize << 3)); Fix this by bailing out of ext4_discard_allocated_blocks() if fs_len is zero. Also fix a missing ext4_mb_unload_buddy() call in ext4_discard_allocated_blocks(). Google-Bug-Id: 16844242 Fixes: 86f0afd463215fc3e58020493482faa4ac3a4d69 Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/mballoc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 956027711faf..8b0f9ef517d6 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1412,6 +1412,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, int last = first + count - 1; struct super_block *sb = e4b->bd_sb; + if (WARN_ON(count == 0)) + return; BUG_ON(last >= (sb->s_blocksize << 3)); assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); /* Don't bother if the block group is corrupt. */ @@ -3221,6 +3223,8 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) int err; if (pa == NULL) { + if (ac->ac_f_ex.fe_len == 0) + return; err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); if (err) { /* @@ -3235,6 +3239,7 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, ac->ac_f_ex.fe_len); ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); + ext4_mb_unload_buddy(&e4b); return; } if (pa->pa_type == MB_INODE_PA) -- cgit v1.2.3 From 4631dbf677ded0419fee35ca7408285dabfaef1a Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 23 Aug 2014 17:48:28 -0400 Subject: ext4: move i_size,i_disksize update routines to helper function Cc: stable@vger.kernel.org # needed for bug fix patches Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 16 ++++++++++++++++ fs/ext4/extents.c | 17 ++++------------- fs/ext4/inode.c | 34 ++++++++-------------------------- 3 files changed, 28 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4d95c3301775..b0c225cdb52c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2454,6 +2454,22 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) up_write(&EXT4_I(inode)->i_data_sem); } +/* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */ +static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) +{ + int changed = 0; + + if (newsize > inode->i_size) { + i_size_write(inode, newsize); + changed = 1; + } + if (newsize > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, newsize); + changed |= 2; + } + return changed; +} + struct ext4_group_info { unsigned long bb_state; struct rb_root bb_free_root; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 76c2df382b7d..f0e6934291dd 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4839,12 +4839,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, } inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - if (new_size) { - if (new_size > i_size_read(inode)) - i_size_write(inode, new_size); - if (new_size > EXT4_I(inode)->i_disksize) - ext4_update_i_disksize(inode, new_size); + ext4_update_inode_size(inode, new_size); } else { /* * Mark that we allocate beyond EOF so the subsequent truncate @@ -4886,7 +4882,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) int ret = 0; int flags; ext4_lblk_t lblk; - struct timespec tv; unsigned int blkbits = inode->i_blkbits; /* Return error if mode is not supported */ @@ -4945,15 +4940,11 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (IS_ERR(handle)) goto out; - tv = inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = ext4_current_time(inode); if (new_size) { - if (new_size > i_size_read(inode)) { - i_size_write(inode, new_size); - inode->i_mtime = tv; - } - if (new_size > EXT4_I(inode)->i_disksize) - ext4_update_i_disksize(inode, new_size); + if (ext4_update_inode_size(inode, new_size) & 0x1) + inode->i_mtime = inode->i_ctime; } else { /* * Mark that we allocate beyond EOF so the subsequent truncate diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 367a60c07cf0..b1ddd9352644 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1055,27 +1055,11 @@ static int ext4_write_end(struct file *file, } else copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hole i_mutex. - * - * But it's important to update i_size while still holding page lock: + * it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. */ - if (pos + copied > inode->i_size) { - i_size_write(inode, pos + copied); - i_size_changed = 1; - } - - if (pos + copied > EXT4_I(inode)->i_disksize) { - /* We need to mark inode dirty even if - * new_i_size is less that inode->i_size - * but greater than i_disksize. (hint delalloc) - */ - ext4_update_i_disksize(inode, (pos + copied)); - i_size_changed = 1; - } + i_size_changed = ext4_update_inode_size(inode, pos + copied); unlock_page(page); page_cache_release(page); @@ -1123,7 +1107,7 @@ static int ext4_journalled_write_end(struct file *file, int ret = 0, ret2; int partial = 0; unsigned from, to; - loff_t new_i_size; + int size_changed = 0; trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); @@ -1146,20 +1130,18 @@ static int ext4_journalled_write_end(struct file *file, if (!partial) SetPageUptodate(page); } - new_i_size = pos + copied; - if (new_i_size > inode->i_size) - i_size_write(inode, pos+copied); + size_changed = ext4_update_inode_size(inode, pos + copied); ext4_set_inode_state(inode, EXT4_STATE_JDATA); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; - if (new_i_size > EXT4_I(inode)->i_disksize) { - ext4_update_i_disksize(inode, new_i_size); + unlock_page(page); + page_cache_release(page); + + if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; } - unlock_page(page); - page_cache_release(page); if (pos + len > inode->i_size && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside -- cgit v1.2.3 From 9e0af23764344f7f1b68e4eefbe7dc865018b63d Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 15 Aug 2014 23:36:53 +0800 Subject: Btrfs: fix task hang under heavy compressed write This has been reported and discussed for a long time, and this hang occurs in both 3.15 and 3.16. Btrfs now migrates to use kernel workqueue, but it introduces this hang problem. Btrfs has a kind of work queued as an ordered way, which means that its ordered_func() must be processed in the way of FIFO, so it usually looks like -- normal_work_helper(arg) work = container_of(arg, struct btrfs_work, normal_work); work->func() <---- (we name it work X) for ordered_work in wq->ordered_list ordered_work->ordered_func() ordered_work->ordered_free() The hang is a rare case, first when we find free space, we get an uncached block group, then we go to read its free space cache inode for free space information, so it will file a readahead request btrfs_readpages() for page that is not in page cache __do_readpage() submit_extent_page() btrfs_submit_bio_hook() btrfs_bio_wq_end_io() submit_bio() end_workqueue_bio() <--(ret by the 1st endio) queue a work(named work Y) for the 2nd also the real endio() So the hang occurs when work Y's work_struct and work X's work_struct happens to share the same address. A bit more explanation, A,B,C -- struct btrfs_work arg -- struct work_struct kthread: worker_thread() pick up a work_struct from @worklist process_one_work(arg) worker->current_work = arg; <-- arg is A->normal_work worker->current_func(arg) normal_work_helper(arg) A = container_of(arg, struct btrfs_work, normal_work); A->func() A->ordered_func() A->ordered_free() <-- A gets freed B->ordered_func() submit_compressed_extents() find_free_extent() load_free_space_inode() ... <-- (the above readhead stack) end_workqueue_bio() btrfs_queue_work(work C) B->ordered_free() As if work A has a high priority in wq->ordered_list and there are more ordered works queued after it, such as B->ordered_func(), its memory could have been freed before normal_work_helper() returns, which means that kernel workqueue code worker_thread() still has worker->current_work pointer to be work A->normal_work's, ie. arg's address. Meanwhile, work C is allocated after work A is freed, work C->normal_work and work A->normal_work are likely to share the same address(I confirmed this with ftrace output, so I'm not just guessing, it's rare though). When another kthread picks up work C->normal_work to process, and finds our kthread is processing it(see find_worker_executing_work()), it'll think work C as a collision and skip then, which ends up nobody processing work C. So the situation is that our kthread is waiting forever on work C. Besides, there're other cases that can lead to deadlock, but the real problem is that all btrfs workqueue shares one work->func, -- normal_work_helper, so this makes each workqueue to have its own helper function, but only a wraper pf normal_work_helper. With this patch, I no long hit the above hang. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 44 ++++++++++++++++++++++++++++++++-------- fs/btrfs/async-thread.h | 28 ++++++++++++++++++++++++- fs/btrfs/delayed-inode.c | 4 ++-- fs/btrfs/disk-io.c | 53 ++++++++++++++++++++++++++---------------------- fs/btrfs/extent-tree.c | 7 ++++--- fs/btrfs/inode.c | 35 +++++++++++++++++++++----------- fs/btrfs/ordered-data.c | 1 + fs/btrfs/qgroup.c | 1 + fs/btrfs/raid56.c | 9 +++++--- fs/btrfs/reada.c | 3 ++- fs/btrfs/scrub.c | 14 +++++++------ fs/btrfs/volumes.c | 3 ++- 12 files changed, 141 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 5a201d81049c..fbd76ded9a34 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -22,7 +22,6 @@ #include #include #include -#include #include "async-thread.h" #include "ctree.h" @@ -55,8 +54,39 @@ struct btrfs_workqueue { struct __btrfs_workqueue *high; }; -static inline struct __btrfs_workqueue -*__btrfs_alloc_workqueue(const char *name, int flags, int max_active, +static void normal_work_helper(struct btrfs_work *work); + +#define BTRFS_WORK_HELPER(name) \ +void btrfs_##name(struct work_struct *arg) \ +{ \ + struct btrfs_work *work = container_of(arg, struct btrfs_work, \ + normal_work); \ + normal_work_helper(work); \ +} + +BTRFS_WORK_HELPER(worker_helper); +BTRFS_WORK_HELPER(delalloc_helper); +BTRFS_WORK_HELPER(flush_delalloc_helper); +BTRFS_WORK_HELPER(cache_helper); +BTRFS_WORK_HELPER(submit_helper); +BTRFS_WORK_HELPER(fixup_helper); +BTRFS_WORK_HELPER(endio_helper); +BTRFS_WORK_HELPER(endio_meta_helper); +BTRFS_WORK_HELPER(endio_meta_write_helper); +BTRFS_WORK_HELPER(endio_raid56_helper); +BTRFS_WORK_HELPER(rmw_helper); +BTRFS_WORK_HELPER(endio_write_helper); +BTRFS_WORK_HELPER(freespace_write_helper); +BTRFS_WORK_HELPER(delayed_meta_helper); +BTRFS_WORK_HELPER(readahead_helper); +BTRFS_WORK_HELPER(qgroup_rescan_helper); +BTRFS_WORK_HELPER(extent_refs_helper); +BTRFS_WORK_HELPER(scrub_helper); +BTRFS_WORK_HELPER(scrubwrc_helper); +BTRFS_WORK_HELPER(scrubnc_helper); + +static struct __btrfs_workqueue * +__btrfs_alloc_workqueue(const char *name, int flags, int max_active, int thresh) { struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); @@ -232,13 +262,11 @@ static void run_ordered_work(struct __btrfs_workqueue *wq) spin_unlock_irqrestore(lock, flags); } -static void normal_work_helper(struct work_struct *arg) +static void normal_work_helper(struct btrfs_work *work) { - struct btrfs_work *work; struct __btrfs_workqueue *wq; int need_order = 0; - work = container_of(arg, struct btrfs_work, normal_work); /* * We should not touch things inside work in the following cases: * 1) after work->func() if it has no ordered_free @@ -262,7 +290,7 @@ static void normal_work_helper(struct work_struct *arg) trace_btrfs_all_work_done(work); } -void btrfs_init_work(struct btrfs_work *work, +void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func, btrfs_func_t func, btrfs_func_t ordered_func, btrfs_func_t ordered_free) @@ -270,7 +298,7 @@ void btrfs_init_work(struct btrfs_work *work, work->func = func; work->ordered_func = ordered_func; work->ordered_free = ordered_free; - INIT_WORK(&work->normal_work, normal_work_helper); + INIT_WORK(&work->normal_work, uniq_func); INIT_LIST_HEAD(&work->ordered_list); work->flags = 0; } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 9c6b66d15fb0..e9e31c94758f 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -19,12 +19,14 @@ #ifndef __BTRFS_ASYNC_THREAD_ #define __BTRFS_ASYNC_THREAD_ +#include struct btrfs_workqueue; /* Internal use only */ struct __btrfs_workqueue; struct btrfs_work; typedef void (*btrfs_func_t)(struct btrfs_work *arg); +typedef void (*btrfs_work_func_t)(struct work_struct *arg); struct btrfs_work { btrfs_func_t func; @@ -38,11 +40,35 @@ struct btrfs_work { unsigned long flags; }; +#define BTRFS_WORK_HELPER_PROTO(name) \ +void btrfs_##name(struct work_struct *arg) + +BTRFS_WORK_HELPER_PROTO(worker_helper); +BTRFS_WORK_HELPER_PROTO(delalloc_helper); +BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper); +BTRFS_WORK_HELPER_PROTO(cache_helper); +BTRFS_WORK_HELPER_PROTO(submit_helper); +BTRFS_WORK_HELPER_PROTO(fixup_helper); +BTRFS_WORK_HELPER_PROTO(endio_helper); +BTRFS_WORK_HELPER_PROTO(endio_meta_helper); +BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper); +BTRFS_WORK_HELPER_PROTO(endio_raid56_helper); +BTRFS_WORK_HELPER_PROTO(rmw_helper); +BTRFS_WORK_HELPER_PROTO(endio_write_helper); +BTRFS_WORK_HELPER_PROTO(freespace_write_helper); +BTRFS_WORK_HELPER_PROTO(delayed_meta_helper); +BTRFS_WORK_HELPER_PROTO(readahead_helper); +BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper); +BTRFS_WORK_HELPER_PROTO(extent_refs_helper); +BTRFS_WORK_HELPER_PROTO(scrub_helper); +BTRFS_WORK_HELPER_PROTO(scrubwrc_helper); +BTRFS_WORK_HELPER_PROTO(scrubnc_helper); + struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, int flags, int max_active, int thresh); -void btrfs_init_work(struct btrfs_work *work, +void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper, btrfs_func_t func, btrfs_func_t ordered_func, btrfs_func_t ordered_free); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index da775bfdebc9..a2e90f855d7d 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1395,8 +1395,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, return -ENOMEM; async_work->delayed_root = delayed_root; - btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, - NULL, NULL); + btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper, + btrfs_async_run_delayed_root, NULL, NULL); async_work->nr = nr; btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c99a414813c1..a1d36e62179c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -39,7 +39,6 @@ #include "btrfs_inode.h" #include "volumes.h" #include "print-tree.h" -#include "async-thread.h" #include "locking.h" #include "tree-log.h" #include "free-space-cache.h" @@ -693,35 +692,41 @@ static void end_workqueue_bio(struct bio *bio, int err) { struct end_io_wq *end_io_wq = bio->bi_private; struct btrfs_fs_info *fs_info; + struct btrfs_workqueue *wq; + btrfs_work_func_t func; fs_info = end_io_wq->info; end_io_wq->error = err; - btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL); if (bio->bi_rw & REQ_WRITE) { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) - btrfs_queue_work(fs_info->endio_meta_write_workers, - &end_io_wq->work); - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) - btrfs_queue_work(fs_info->endio_freespace_worker, - &end_io_wq->work); - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - btrfs_queue_work(fs_info->endio_raid56_workers, - &end_io_wq->work); - else - btrfs_queue_work(fs_info->endio_write_workers, - &end_io_wq->work); + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) { + wq = fs_info->endio_meta_write_workers; + func = btrfs_endio_meta_write_helper; + } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) { + wq = fs_info->endio_freespace_worker; + func = btrfs_freespace_write_helper; + } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { + wq = fs_info->endio_raid56_workers; + func = btrfs_endio_raid56_helper; + } else { + wq = fs_info->endio_write_workers; + func = btrfs_endio_write_helper; + } } else { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - btrfs_queue_work(fs_info->endio_raid56_workers, - &end_io_wq->work); - else if (end_io_wq->metadata) - btrfs_queue_work(fs_info->endio_meta_workers, - &end_io_wq->work); - else - btrfs_queue_work(fs_info->endio_workers, - &end_io_wq->work); + if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { + wq = fs_info->endio_raid56_workers; + func = btrfs_endio_raid56_helper; + } else if (end_io_wq->metadata) { + wq = fs_info->endio_meta_workers; + func = btrfs_endio_meta_helper; + } else { + wq = fs_info->endio_workers; + func = btrfs_endio_helper; + } } + + btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL); + btrfs_queue_work(wq, &end_io_wq->work); } /* @@ -828,7 +833,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->submit_bio_start = submit_bio_start; async->submit_bio_done = submit_bio_done; - btrfs_init_work(&async->work, run_one_async_start, + btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start, run_one_async_done, run_one_async_free); async->bio_flags = bio_flags; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5524434da059..3efe1c3877bf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -552,7 +552,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, caching_ctl->block_group = cache; caching_ctl->progress = cache->key.objectid; atomic_set(&caching_ctl->count, 1); - btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); + btrfs_init_work(&caching_ctl->work, btrfs_cache_helper, + caching_thread, NULL, NULL); spin_lock(&cache->lock); /* @@ -2749,8 +2750,8 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root, async->sync = 0; init_completion(&async->wait); - btrfs_init_work(&async->work, delayed_ref_async_start, - NULL, NULL); + btrfs_init_work(&async->work, btrfs_extent_refs_helper, + delayed_ref_async_start, NULL, NULL); btrfs_queue_work(root->fs_info->extent_workers, &async->work); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ae98df67950f..3d020d6d9ace 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1096,8 +1096,10 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->end = cur_end; INIT_LIST_HEAD(&async_cow->extents); - btrfs_init_work(&async_cow->work, async_cow_start, - async_cow_submit, async_cow_free); + btrfs_init_work(&async_cow->work, + btrfs_delalloc_helper, + async_cow_start, async_cow_submit, + async_cow_free); nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT; @@ -1881,7 +1883,8 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) SetPageChecked(page); page_cache_get(page); - btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); + btrfs_init_work(&fixup->work, btrfs_fixup_helper, + btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work); return -EBUSY; @@ -2822,7 +2825,8 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct inode *inode = page->mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_extent *ordered_extent = NULL; - struct btrfs_workqueue *workers; + struct btrfs_workqueue *wq; + btrfs_work_func_t func; trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); @@ -2831,13 +2835,17 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, end - start + 1, uptodate)) return 0; - btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); + if (btrfs_is_free_space_inode(inode)) { + wq = root->fs_info->endio_freespace_worker; + func = btrfs_freespace_write_helper; + } else { + wq = root->fs_info->endio_write_workers; + func = btrfs_endio_write_helper; + } - if (btrfs_is_free_space_inode(inode)) - workers = root->fs_info->endio_freespace_worker; - else - workers = root->fs_info->endio_write_workers; - btrfs_queue_work(workers, &ordered_extent->work); + btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL, + NULL); + btrfs_queue_work(wq, &ordered_extent->work); return 0; } @@ -7208,7 +7216,8 @@ again: if (!ret) goto out_test; - btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); + btrfs_init_work(&ordered->work, btrfs_endio_write_helper, + finish_ordered_fn, NULL, NULL); btrfs_queue_work(root->fs_info->endio_write_workers, &ordered->work); out_test: @@ -8535,7 +8544,9 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, work->inode = inode; work->wait = wait; work->delay_iput = delay_iput; - btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); + WARN_ON_ONCE(!inode); + btrfs_init_work(&work->work, btrfs_flush_delalloc_helper, + btrfs_run_delalloc_work, NULL, NULL); return work; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 963895c1f801..ac734ec4cc20 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -615,6 +615,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) spin_unlock(&root->ordered_extent_lock); btrfs_init_work(&ordered->flush_work, + btrfs_flush_delalloc_helper, btrfs_run_ordered_extent_work, NULL, NULL); list_add_tail(&ordered->work_list, &works); btrfs_queue_work(root->fs_info->flush_workers, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 8abe45524de9..ded5c601d916 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2720,6 +2720,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, memset(&fs_info->qgroup_rescan_work, 0, sizeof(fs_info->qgroup_rescan_work)); btrfs_init_work(&fs_info->qgroup_rescan_work, + btrfs_qgroup_rescan_helper, btrfs_qgroup_rescan_worker, NULL, NULL); if (ret) { diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 4a88f073fdd7..0a6b6e4bcbb9 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1416,7 +1416,8 @@ cleanup: static void async_rmw_stripe(struct btrfs_raid_bio *rbio) { - btrfs_init_work(&rbio->work, rmw_work, NULL, NULL); + btrfs_init_work(&rbio->work, btrfs_rmw_helper, + rmw_work, NULL, NULL); btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); @@ -1424,7 +1425,8 @@ static void async_rmw_stripe(struct btrfs_raid_bio *rbio) static void async_read_rebuild(struct btrfs_raid_bio *rbio) { - btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL); + btrfs_init_work(&rbio->work, btrfs_rmw_helper, + read_rebuild_work, NULL, NULL); btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); @@ -1665,7 +1667,8 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) plug = container_of(cb, struct btrfs_plug_cb, cb); if (from_schedule) { - btrfs_init_work(&plug->work, unplug_work, NULL, NULL); + btrfs_init_work(&plug->work, btrfs_rmw_helper, + unplug_work, NULL, NULL); btrfs_queue_work(plug->info->rmw_workers, &plug->work); return; diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 09230cf3a244..20408c6b665a 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -798,7 +798,8 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info) /* FIXME we cannot handle this properly right now */ BUG(); } - btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL); + btrfs_init_work(&rmw->work, btrfs_readahead_helper, + reada_start_machine_worker, NULL, NULL); rmw->fs_info = fs_info; btrfs_queue_work(fs_info->readahead_workers, &rmw->work); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 23d3f6e6a482..f4a41f37be22 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -428,8 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sbio->index = i; sbio->sctx = sctx; sbio->page_count = 0; - btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, - NULL, NULL); + btrfs_init_work(&sbio->work, btrfs_scrub_helper, + scrub_bio_end_io_worker, NULL, NULL); if (i != SCRUB_BIOS_PER_SCTX - 1) sctx->bios[i]->next_free = i + 1; @@ -999,8 +999,8 @@ nodatasum_case: fixup_nodatasum->root = fs_info->extent_root; fixup_nodatasum->mirror_num = failed_mirror_index + 1; scrub_pending_trans_workers_inc(sctx); - btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum, - NULL, NULL); + btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper, + scrub_fixup_nodatasum, NULL, NULL); btrfs_queue_work(fs_info->scrub_workers, &fixup_nodatasum->work); goto out; @@ -1616,7 +1616,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err) sbio->err = err; sbio->bio = bio; - btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); + btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper, + scrub_wr_bio_end_io_worker, NULL, NULL); btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); } @@ -3214,7 +3215,8 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, nocow_ctx->len = len; nocow_ctx->mirror_num = mirror_num; nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; - btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL); + btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper, + copy_nocow_pages_worker, NULL, NULL); INIT_LIST_HEAD(&nocow_ctx->inodes); btrfs_queue_work(fs_info->scrub_nocow_workers, &nocow_ctx->work); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9d4ce53d7569..340a92d08e84 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5854,7 +5854,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, else generate_random_uuid(dev->uuid); - btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL); + btrfs_init_work(&dev->work, btrfs_submit_helper, + pending_bios_fn, NULL, NULL); return dev; } -- cgit v1.2.3 From d856f32a86b2b015ab180ab7a55e455ed8d3ccc5 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Sun, 24 Aug 2014 13:14:05 -0400 Subject: aio: fix reqs_available handling As reported by Dan Aloni, commit f8567a3845ac ("aio: fix aio request leak when events are reaped by userspace") introduces a regression when user code attempts to perform io_submit() with more events than are available in the ring buffer. Reverting that commit would reintroduce a regression when user space event reaping is used. Fixing this bug is a bit more involved than the previous attempts to fix this regression. Since we do not have a single point at which we can count events as being reaped by user space and io_getevents(), we have to track event completion by looking at the number of events left in the event ring. So long as there are as many events in the ring buffer as there have been completion events generate, we cannot call put_reqs_available(). The code to check for this is now placed in refill_reqs_available(). A test program from Dan and modified by me for verifying this bug is available at http://www.kvack.org/~bcrl/20140824-aio_bug.c . Reported-by: Dan Aloni Signed-off-by: Benjamin LaHaise Acked-by: Dan Aloni Cc: Kent Overstreet Cc: Mateusz Guzik Cc: Petr Matousek Cc: stable@vger.kernel.org # v3.16 and anything that f8567a3845ac was backported to Signed-off-by: Linus Torvalds --- fs/aio.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 73 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index ae635872affb..97bc62cbe2da 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -141,6 +141,7 @@ struct kioctx { struct { unsigned tail; + unsigned completed_events; spinlock_t completion_lock; } ____cacheline_aligned_in_smp; @@ -857,6 +858,68 @@ out: return ret; } +/* refill_reqs_available + * Updates the reqs_available reference counts used for tracking the + * number of free slots in the completion ring. This can be called + * from aio_complete() (to optimistically update reqs_available) or + * from aio_get_req() (the we're out of events case). It must be + * called holding ctx->completion_lock. + */ +static void refill_reqs_available(struct kioctx *ctx, unsigned head, + unsigned tail) +{ + unsigned events_in_ring, completed; + + /* Clamp head since userland can write to it. */ + head %= ctx->nr_events; + if (head <= tail) + events_in_ring = tail - head; + else + events_in_ring = ctx->nr_events - (head - tail); + + completed = ctx->completed_events; + if (events_in_ring < completed) + completed -= events_in_ring; + else + completed = 0; + + if (!completed) + return; + + ctx->completed_events -= completed; + put_reqs_available(ctx, completed); +} + +/* user_refill_reqs_available + * Called to refill reqs_available when aio_get_req() encounters an + * out of space in the completion ring. + */ +static void user_refill_reqs_available(struct kioctx *ctx) +{ + spin_lock_irq(&ctx->completion_lock); + if (ctx->completed_events) { + struct aio_ring *ring; + unsigned head; + + /* Access of ring->head may race with aio_read_events_ring() + * here, but that's okay since whether we read the old version + * or the new version, and either will be valid. The important + * part is that head cannot pass tail since we prevent + * aio_complete() from updating tail by holding + * ctx->completion_lock. Even if head is invalid, the check + * against ctx->completed_events below will make sure we do the + * safe/right thing. + */ + ring = kmap_atomic(ctx->ring_pages[0]); + head = ring->head; + kunmap_atomic(ring); + + refill_reqs_available(ctx, head, ctx->tail); + } + + spin_unlock_irq(&ctx->completion_lock); +} + /* aio_get_req * Allocate a slot for an aio request. * Returns NULL if no requests are free. @@ -865,8 +928,11 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) { struct kiocb *req; - if (!get_reqs_available(ctx)) - return NULL; + if (!get_reqs_available(ctx)) { + user_refill_reqs_available(ctx); + if (!get_reqs_available(ctx)) + return NULL; + } req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); if (unlikely(!req)) @@ -925,8 +991,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2) struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; + unsigned tail, pos, head; unsigned long flags; - unsigned tail, pos; /* * Special case handling for sync iocbs: @@ -987,10 +1053,14 @@ void aio_complete(struct kiocb *iocb, long res, long res2) ctx->tail = tail; ring = kmap_atomic(ctx->ring_pages[0]); + head = ring->head; ring->tail = tail; kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); + ctx->completed_events++; + if (ctx->completed_events > 1) + refill_reqs_available(ctx, head, tail); spin_unlock_irqrestore(&ctx->completion_lock, flags); pr_debug("added to ring %p at [%u]\n", iocb, tail); @@ -1005,7 +1075,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) /* everything turned out well, dispose of the aiocb. */ kiocb_free(iocb); - put_reqs_available(ctx, 1); /* * We have to order our ring_info tail store above and test -- cgit v1.2.3 From f736906a7669a77cf8cabdcbcf1dc8cb694e12ef Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Tue, 26 Aug 2014 19:04:44 +0400 Subject: CIFS: Fix wrong restart readdir for SMB1 The existing code calls server->ops->close() that is not right. This causes XFS test generic/310 to fail. Fix this by using server->ops->closedir() function. Cc: # v3.7+ Signed-off-by: Dan Carpenter Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/readdir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 798c80a41c88..b334a89d6a66 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -596,8 +596,8 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, if (server->ops->dir_needs_close(cfile)) { cfile->invalidHandle = true; spin_unlock(&cifs_file_list_lock); - if (server->ops->close) - server->ops->close(xid, tcon, &cfile->fid); + if (server->ops->close_dir) + server->ops->close_dir(xid, tcon, &cfile->fid); } else spin_unlock(&cifs_file_list_lock); if (cfile->srch_inf.ntwrk_buf_start) { -- cgit v1.2.3 From 1bbe4997b13de903c421c1cc78440e544b5f9064 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 22 Aug 2014 13:32:11 +0400 Subject: CIFS: Fix wrong filename length for SMB2 The existing code uses the old MAX_NAME constant. This causes XFS test generic/013 to fail. Fix it by replacing MAX_NAME with PATH_MAX that SMB1 uses. Also remove an unused MAX_NAME constant definition. Cc: # v3.7+ Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 5 ----- fs/cifs/smb2file.c | 2 +- fs/cifs/smb2inode.c | 2 +- fs/cifs/smb2ops.c | 2 +- fs/cifs/smb2pdu.c | 2 +- 5 files changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index dfc731b02aa9..25b8392bfdd2 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -70,11 +70,6 @@ #define SERVER_NAME_LENGTH 40 #define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1) -/* used to define string lengths for reversing unicode strings */ -/* (256+1)*2 = 514 */ -/* (max path length + 1 for null) * 2 for unicode */ -#define MAX_NAME 514 - /* SMB echo "timeout" -- FIXME: tunable? */ #define SMB_ECHO_INTERVAL (60 * HZ) diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 3f17b4550831..45992944e238 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -50,7 +50,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, goto out; } - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, + smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, GFP_KERNEL); if (smb2_data == NULL) { rc = -ENOMEM; diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 0150182a4494..899bbc86f73e 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -131,7 +131,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, *adjust_tz = false; *symlink = false; - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, + smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, GFP_KERNEL); if (smb2_data == NULL) return -ENOMEM; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 4e4eecdec4f9..f522193b7184 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -389,7 +389,7 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, int rc; struct smb2_file_all_info *smb2_data; - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, + smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, GFP_KERNEL); if (smb2_data == NULL) return -ENOMEM; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index cb39c51cd3e0..74b3a6684383 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1532,7 +1532,7 @@ SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, { return query_info(xid, tcon, persistent_fid, volatile_fid, FILE_ALL_INFORMATION, - sizeof(struct smb2_file_all_info) + MAX_NAME * 2, + sizeof(struct smb2_file_all_info) + PATH_MAX * 2, sizeof(struct smb2_file_all_info), data); } -- cgit v1.2.3 From c2e69583a4787b252f6be9a9daea4662eebc26f8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 25 Aug 2014 14:45:59 -0700 Subject: f2fs: truncate stale block for inline_data This verifies to truncate any allocated blocks, offset[0], by inline_data. Not figured out, but for making sure. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 044395c20ee9..45378196e19a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -823,22 +823,26 @@ int truncate_xattr_node(struct inode *inode, struct page *page) */ void remove_inode_page(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - struct page *page; - nid_t ino = inode->i_ino; struct dnode_of_data dn; - page = get_node_page(sbi, ino); - if (IS_ERR(page)) + set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); + if (get_dnode_of_data(&dn, 0, LOOKUP_NODE)) return; - if (truncate_xattr_node(inode, page)) { - f2fs_put_page(page, 1); + if (truncate_xattr_node(inode, dn.inode_page)) { + f2fs_put_dnode(&dn); return; } + + /* remove potential inline_data blocks */ + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) + truncate_data_blocks_range(&dn, 1); + /* 0 is possible, after f2fs_new_inode() has failed */ f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); - set_new_dnode(&dn, inode, page, page, ino); + + /* will put inode & node pages */ truncate_node(&dn); } -- cgit v1.2.3 From ca5d13fc33cc56f7405004574d18172ddbdea2ef Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 22 Aug 2014 04:40:46 -0500 Subject: Clarify Kconfig help text for CIFS and SMB2/SMB3 Clarify descriptions of SMB2 and SMB3 support in Kconfig Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/Kconfig | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 603f18a65c12..a2172f3f69e3 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -22,6 +22,11 @@ config CIFS support for OS/2 and Windows ME and similar servers is provided as well. + The module also provides optional support for the followon + protocols for CIFS including SMB3, which enables + useful performance and security features (see the description + of CONFIG_CIFS_SMB2). + The cifs module provides an advanced network file system client for mounting to CIFS compliant servers. It includes support for DFS (hierarchical name space), secure per-user @@ -121,7 +126,8 @@ config CIFS_ACL depends on CIFS_XATTR && KEYS help Allows fetching CIFS/NTFS ACL from the server. The DACL blob - is handed over to the application/caller. + is handed over to the application/caller. See the man + page for getcifsacl for more information. config CIFS_DEBUG bool "Enable CIFS debugging routines" @@ -162,7 +168,7 @@ config CIFS_NFSD_EXPORT Allows NFS server to export a CIFS mounted share (nfsd over cifs) config CIFS_SMB2 - bool "SMB2 network file system support" + bool "SMB2 and SMB3 network file system support" depends on CIFS && INET select NLS select KEYS @@ -170,16 +176,21 @@ config CIFS_SMB2 select DNS_RESOLVER help - This enables experimental support for the SMB2 (Server Message Block - version 2) protocol. The SMB2 protocol is the successor to the - popular CIFS and SMB network file sharing protocols. SMB2 is the - native file sharing mechanism for recent versions of Windows - operating systems (since Vista). SMB2 enablement will eventually - allow users better performance, security and features, than would be - possible with cifs. Note that smb2 mount options also are simpler - (compared to cifs) due to protocol improvements. - - Unless you are a developer or tester, say N. + This enables support for the Server Message Block version 2 + family of protocols, including SMB3. SMB3 support is + enabled on mount by specifying "vers=3.0" in the mount + options. These protocols are the successors to the popular + CIFS and SMB network file sharing protocols. SMB3 is the + native file sharing mechanism for the more recent + versions of Windows (Windows 8 and Windows 2012 and + later) and Samba server and many others support SMB3 well. + In general SMB3 enables better performance, security + and features, than would be possible with CIFS (Note that + when mounting to Samba, due to the CIFS POSIX extensions, + CIFS mounts can provide slightly better POSIX compatibility + than SMB3 mounts do though). Note that SMB2/SMB3 mount + options are also slightly simpler (compared to CIFS) due + to protocol improvements. config CIFS_FSCACHE bool "Provide CIFS client caching support" -- cgit v1.2.3 From aee7af356e151494d5014f57b33460b162f181b5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 25 Aug 2014 22:33:12 -0400 Subject: NFSv4: Fix problems with close in the presence of a delegation In the presence of delegations, we can no longer assume that the state->n_rdwr, state->n_rdonly, state->n_wronly reflect the open stateid share mode, and so we need to calculate the initial value for calldata->arg.fmode using the state->flags. Reported-by: James Drews Fixes: 88069f77e1ac5 (NFSv41: Fix a potential state leakage when...) Cc: stable@vger.kernel.org # 2.6.33+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 75ae8d22f067..ff94a2f36051 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2601,6 +2601,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) struct nfs4_closedata *calldata = data; struct nfs4_state *state = calldata->state; struct inode *inode = calldata->inode; + bool is_rdonly, is_wronly, is_rdwr; int call_close = 0; dprintk("%s: begin!\n", __func__); @@ -2608,18 +2609,24 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) goto out_wait; task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; - calldata->arg.fmode = FMODE_READ|FMODE_WRITE; spin_lock(&state->owner->so_lock); + is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); + is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); + is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); + /* Calculate the current open share mode */ + calldata->arg.fmode = 0; + if (is_rdonly || is_rdwr) + calldata->arg.fmode |= FMODE_READ; + if (is_wronly || is_rdwr) + calldata->arg.fmode |= FMODE_WRITE; /* Calculate the change in open mode */ if (state->n_rdwr == 0) { if (state->n_rdonly == 0) { - call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags); - call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + call_close |= is_rdonly || is_rdwr; calldata->arg.fmode &= ~FMODE_READ; } if (state->n_wronly == 0) { - call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags); - call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); + call_close |= is_wronly || is_rdwr; calldata->arg.fmode &= ~FMODE_WRITE; } } -- cgit v1.2.3 From 412f6c4c26fb1eba8844290663837561ac53fa6e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 25 Aug 2014 22:09:08 -0400 Subject: NFSv4: Don't clear the open state when we just did an OPEN_DOWNGRADE If we did an OPEN_DOWNGRADE, then the right thing to do on success, is to apply the new open mode to the struct nfs4_state. Instead, we were unconditionally clearing the state, making it appear to our state machinery as if we had just performed a CLOSE. Fixes: 226056c5c312b (NFSv4: Use correct locking when updating nfs4_state...) Cc: stable@vger.kernel.org # 3.15+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ff94a2f36051..7dd8aca31c29 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2560,6 +2560,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) struct nfs4_closedata *calldata = data; struct nfs4_state *state = calldata->state; struct nfs_server *server = NFS_SERVER(calldata->inode); + nfs4_stateid *res_stateid = NULL; dprintk("%s: begin!\n", __func__); if (!nfs4_sequence_done(task, &calldata->res.seq_res)) @@ -2570,12 +2571,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data) */ switch (task->tk_status) { case 0: - if (calldata->roc) + res_stateid = &calldata->res.stateid; + if (calldata->arg.fmode == 0 && calldata->roc) pnfs_roc_set_barrier(state->inode, calldata->roc_barrier); - nfs_clear_open_stateid(state, &calldata->res.stateid, 0); renew_lease(server, calldata->timestamp); - goto out_release; + break; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_OLD_STATEID: @@ -2589,7 +2590,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) goto out_release; } } - nfs_clear_open_stateid(state, NULL, calldata->arg.fmode); + nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode); out_release: nfs_release_seqid(calldata->arg.seqid); nfs_refresh_inode(calldata->inode, calldata->res.fattr); -- cgit v1.2.3 From f87d928f6d98644d39809a013a22f981d39017cf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 24 Aug 2014 14:46:48 -0400 Subject: NFSv3: Fix another acl regression When creating a new object on the NFS server, we should not be sending posix setacl requests unless the preceding posix_acl_create returned a non-trivial acl. Doing so, causes Solaris servers in particular to return an EINVAL. Fixes: 013cdf1088d72 (nfs: use generic posix ACL infrastructure,,,) Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1132786 Cc: stable@vger.kernel.org # 3.14+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs3acl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index d0fec260132a..24c6898159cc 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -129,7 +129,10 @@ static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, .rpc_argp = &args, .rpc_resp = &fattr, }; - int status; + int status = 0; + + if (acl == NULL && (!S_ISDIR(inode->i_mode) || dfacl == NULL)) + goto out; status = -EOPNOTSUPP; if (!nfs_server_capable(inode, NFS_CAP_ACLS)) -- cgit v1.2.3 From 9776de96e51565286da25c74d6f631abc50c63ef Mon Sep 17 00:00:00 2001 From: Milosz Tanski Date: Wed, 13 Aug 2014 12:58:16 -0400 Subject: FS-Cache: Timeout for releasepage() This is meant to avoid a recusive hang caused by underlying filesystem trying to grab a free page and causing a write-out. INFO: task kworker/u30:7:28375 blocked for more than 120 seconds. Not tainted 3.15.0-virtual #74 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. kworker/u30:7 D 0000000000000000 0 28375 2 0x00000000 Workqueue: fscache_operation fscache_op_work_func [fscache] ffff88000b147148 0000000000000046 0000000000000000 ffff88000b1471c8 ffff8807aa031820 0000000000014040 ffff88000b147fd8 0000000000014040 ffff880f0c50c860 ffff8807aa031820 ffff88000b147158 ffff88007be59cd0 Call Trace: [] schedule+0x29/0x70 [] __fscache_wait_on_page_write+0x55/0x90 [fscache] [] ? __wake_up_sync+0x20/0x20 [] __fscache_maybe_release_page+0x65/0x1e0 [fscache] [] ceph_releasepage+0x83/0x100 [ceph] [] ? anon_vma_fork+0x130/0x130 [] try_to_release_page+0x32/0x50 [] shrink_page_list+0x7e6/0x9d0 [] ? isolate_lru_pages.isra.73+0x78/0x1e0 [] shrink_inactive_list+0x252/0x4c0 [] shrink_lruvec+0x3e1/0x670 [] shrink_zone+0x3f/0x110 [] do_try_to_free_pages+0x1d6/0x450 [] ? zone_statistics+0x99/0xc0 [] try_to_free_pages+0xc4/0x180 [] __alloc_pages_nodemask+0x6b2/0xa60 [] ? __find_get_block+0xbe/0x250 [] ? wake_up_bit+0x2e/0x40 [] alloc_pages_current+0xb3/0x180 [] __page_cache_alloc+0xb7/0xd0 [] grab_cache_page_write_begin+0x7c/0xe0 [] ? ext4_mark_inode_dirty+0x82/0x220 [] ext4_da_write_begin+0x89/0x2d0 [] generic_perform_write+0xbe/0x1d0 [] ? update_time+0x81/0xc0 [] ? mnt_clone_write+0x12/0x30 [] __generic_file_aio_write+0x1ce/0x3f0 [] generic_file_aio_write+0x5e/0xe0 [] ext4_file_write+0x9f/0x410 [] ? ext4_file_open+0x66/0x180 [] do_sync_write+0x5a/0x90 [] cachefiles_write_page+0x149/0x430 [cachefiles] [] ? radix_tree_gang_lookup_tag+0x89/0xd0 [] fscache_write_op+0x222/0x3b0 [fscache] [] fscache_op_work_func+0x3a/0x100 [fscache] [] process_one_work+0x179/0x4a0 [] worker_thread+0x11b/0x370 [] ? manage_workers.isra.21+0x2e0/0x2e0 [] kthread+0xc9/0xe0 [] ? ftrace_raw_event_xen_mmu_release_ptpage+0x70/0x90 [] ? flush_kthread_worker+0xb0/0xb0 [] ret_from_fork+0x7c/0xb0 [] ? flush_kthread_worker+0xb0/0xb0 Signed-off-by: Milosz Tanski Signed-off-by: David Howells --- fs/fscache/page.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 85332b9d19d1..781ac7b0b53e 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -43,6 +43,19 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa } EXPORT_SYMBOL(__fscache_wait_on_page_write); +/* + * wait for a page to finish being written to the cache. Put a timeout here + * since we might be called recursively via parent fs. + */ +static +bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page) +{ + wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0); + + return wait_event_timeout(*wq, !__fscache_check_page_write(cookie, page), + HZ); +} + /* * decide whether a page can be released, possibly by cancelling a store to it * - we're allowed to sleep if __GFP_WAIT is flagged @@ -115,7 +128,10 @@ page_busy: } fscache_stat(&fscache_n_store_vmscan_wait); - __fscache_wait_on_page_write(cookie, page); + if (!release_page_wait_timeout(cookie, page)) + _debug("fscache writeout timeout page: %p{%lx}", + page, page->index); + gfp &= ~__GFP_WAIT; goto try_again; } -- cgit v1.2.3 From 920bce20d74817bdd8bfcbc28ecb1179c9e01081 Mon Sep 17 00:00:00 2001 From: Milosz Tanski Date: Wed, 13 Aug 2014 12:58:21 -0400 Subject: FS-Cache: Reduce cookie ref count if submit fails. I've been seeing issues with disposing cookies under vma pressure. The symptom is that the refcount gets out of sync. In this case we fail to decrement the refcount if submit fails. I found this while auditing the error in and around cookie operations. Signed-off-by: Milosz Tanski Signed-off-by: David Howells --- fs/fscache/object.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/fscache/object.c b/fs/fscache/object.c index d3b4539f1651..da032daf0e0d 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -982,6 +982,7 @@ nomem: submit_op_failed: clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); spin_unlock(&cookie->lock); + fscache_unuse_cookie(object); kfree(op); _leave(" [EIO]"); return transit_to(KILL_OBJECT); -- cgit v1.2.3 From e9512d72e8e61c750c90efacd720abe3c4569822 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 26 Aug 2014 13:55:54 -0700 Subject: Btrfs: fix autodefrag with compression The autodefrag code skips defrag when two extents are adjacent. But one big advantage for autodefrag is cutting down on the number of small extents, even when they are adjacent. This commit changes it to defrag all small extents. Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fce6fd0e3f50..a018ea484d39 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1019,8 +1019,10 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) return false; next = defrag_lookup_extent(inode, em->start + em->len); - if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE || - (em->block_start + em->block_len == next->block_start)) + if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) + ret = false; + else if ((em->block_start + em->block_len == next->block_start) && + (em->block_len > 128 * 1024 && next->block_len > 128 * 1024)) ret = false; free_extent_map(next); @@ -1055,7 +1057,6 @@ static int should_defrag_range(struct inode *inode, u64 start, int thresh, } next_mergeable = defrag_check_next_extent(inode, em); - /* * we hit a real extent, if it is big or the next extent is not a * real extent, don't bother defragging it -- cgit v1.2.3 From 69dc9536405213c1d545fcace1fc15c481d00aae Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 27 Aug 2014 18:33:49 -0400 Subject: ext4: fix incorect journal credits reservation in ext4_zero_range Currently we reserve only 4 blocks but in worst case scenario ext4_zero_partial_blocks() may want to zeroout and convert two non adjacent blocks. Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f0e6934291dd..0e9de2328bd2 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4731,6 +4731,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, loff_t new_size = 0; int ret = 0; int flags; + int credits; int partial; loff_t start, end; ext4_lblk_t lblk; @@ -4830,8 +4831,14 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (ret) goto out_dio; } - - handle = ext4_journal_start(inode, EXT4_HT_MISC, 4); + /* + * In worst case we have to writeout two nonadjacent unwritten + * blocks and update the inode + */ + credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1; + if (ext4_should_journal_data(inode)) + credits += 2; + handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(inode->i_sb, ret); -- cgit v1.2.3 From c174e6d6979a04b7b77b93f244396be4b81f8bfb Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 27 Aug 2014 18:40:00 -0400 Subject: ext4: fix transaction issues for ext4_fallocate and ext_zero_range After commit f282ac19d86f we use different transactions for preallocation and i_disksize update which result in complain from fsck after power-failure. spotted by generic/019. IMHO this is regression because fs becomes inconsistent, even more 'e2fsck -p' will no longer works (which drives admins go crazy) Same transaction requirement applies ctime,mtime updates testcase: xfstest generic/019 Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 68 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0e9de2328bd2..74292a71b384 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4665,7 +4665,8 @@ retry: } static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, - ext4_lblk_t len, int flags, int mode) + ext4_lblk_t len, loff_t new_size, + int flags, int mode) { struct inode *inode = file_inode(file); handle_t *handle; @@ -4674,8 +4675,10 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, int retries = 0; struct ext4_map_blocks map; unsigned int credits; + loff_t epos; map.m_lblk = offset; + map.m_len = len; /* * Don't normalize the request if it can fit in one extent so * that it doesn't get unnecessarily split into multiple @@ -4690,9 +4693,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, credits = ext4_chunk_trans_blocks(inode, len); retry: - while (ret >= 0 && ret < len) { - map.m_lblk = map.m_lblk + ret; - map.m_len = len = len - ret; + while (ret >= 0 && len) { handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { @@ -4709,6 +4710,21 @@ retry: ret2 = ext4_journal_stop(handle); break; } + map.m_lblk += ret; + map.m_len = len = len - ret; + epos = (loff_t)map.m_lblk << inode->i_blkbits; + inode->i_ctime = ext4_current_time(inode); + if (new_size) { + if (epos > new_size) + epos = new_size; + if (ext4_update_inode_size(inode, epos) & 0x1) + inode->i_mtime = inode->i_ctime; + } else { + if (epos > inode->i_size) + ext4_set_inode_flag(inode, + EXT4_INODE_EOFBLOCKS); + } + ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); if (ret2) break; @@ -4732,7 +4748,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, int ret = 0; int flags; int credits; - int partial; + int partial_begin, partial_end; loff_t start, end; ext4_lblk_t lblk; struct address_space *mapping = inode->i_mapping; @@ -4772,7 +4788,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (start < offset || end > offset + len) return -EINVAL; - partial = (offset + len) & ((1 << blkbits) - 1); + partial_begin = offset & ((1 << blkbits) - 1); + partial_end = (offset + len) & ((1 << blkbits) - 1); lblk = start >> blkbits; max_blocks = (end >> blkbits); @@ -4806,7 +4823,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, * If we have a partial block after EOF we have to allocate * the entire block. */ - if (partial) + if (partial_end) max_blocks += 1; } @@ -4814,6 +4831,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Now release the pages and zero block aligned part of pages*/ truncate_pagecache_range(inode, start, end - 1); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); /* Wait all existing dio workers, newcomers will block on i_mutex */ ext4_inode_block_unlocked_dio(inode); @@ -4826,11 +4844,14 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (ret) goto out_dio; - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, - mode); + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, + flags, mode); if (ret) goto out_dio; } + if (!partial_begin && !partial_end) + goto out_dio; + /* * In worst case we have to writeout two nonadjacent unwritten * blocks and update the inode @@ -4856,7 +4877,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, if ((offset + len) > i_size_read(inode)) ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); } - ext4_mark_inode_dirty(handle, inode); /* Zero out partial block at the edges of the range */ @@ -4883,7 +4903,6 @@ out_mutex: long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - handle_t *handle; loff_t new_size = 0; unsigned int max_blocks; int ret = 0; @@ -4939,32 +4958,15 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto out; } - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode); + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, + flags, mode); if (ret) goto out; - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) - goto out; - - inode->i_ctime = ext4_current_time(inode); - - if (new_size) { - if (ext4_update_inode_size(inode, new_size) & 0x1) - inode->i_mtime = inode->i_ctime; - } else { - /* - * Mark that we allocate beyond EOF so the subsequent truncate - * can proceed even if the new size is the same as i_size. - */ - if ((offset + len) > i_size_read(inode)) - ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { + ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal, + EXT4_I(inode)->i_sync_tid); } - ext4_mark_inode_dirty(handle, inode); - if (file->f_flags & O_SYNC) - ext4_handle_sync(handle); - - ext4_journal_stop(handle); out: mutex_unlock(&inode->i_mutex); trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); -- cgit v1.2.3 From 922cedbd00b68e506eb6f37cbe07cdf399a37b27 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 28 Aug 2014 16:13:21 +0300 Subject: f2fs: simplify by using a literal We can make the code a bit simpler because we know that "!retry" is zero. Signed-off-by: Dan Carpenter Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0f61f5aa587c..41bdf511003d 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1133,7 +1133,7 @@ free_sbi: /* give only one another chance */ if (retry) { - retry = !retry; + retry = 0; shrink_dcache_sb(sb); goto try_onemore; } -- cgit v1.2.3 From 6603120e96eae9a5d6228681ae55c7fdc998d1bb Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 27 Aug 2014 18:40:03 -0400 Subject: ext4: update i_disksize coherently with block allocation on error path In case of delalloc block i_disksize may be less than i_size. So we have to update i_disksize each time we allocated and submitted some blocks beyond i_disksize. We weren't doing this on the error paths, so fix this. testcase: xfstest generic/019 Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/inode.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b1ddd9352644..3aa26e9117c4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2077,6 +2077,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, struct ext4_map_blocks *map = &mpd->map; int err; loff_t disksize; + int progress = 0; mpd->io_submit.io_end->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; @@ -2093,8 +2094,11 @@ static int mpage_map_and_submit_extent(handle_t *handle, * is non-zero, a commit should free up blocks. */ if ((err == -ENOMEM) || - (err == -ENOSPC && ext4_count_free_clusters(sb))) + (err == -ENOSPC && ext4_count_free_clusters(sb))) { + if (progress) + goto update_disksize; return err; + } ext4_msg(sb, KERN_CRIT, "Delayed block allocation failed for " "inode %lu at logical offset %llu with" @@ -2111,15 +2115,17 @@ static int mpage_map_and_submit_extent(handle_t *handle, *give_up_on_write = true; return err; } + progress = 1; /* * Update buffer state, submit mapped pages, and get us new * extent to map */ err = mpage_map_and_submit_buffers(mpd); if (err < 0) - return err; + goto update_disksize; } while (map->m_len); +update_disksize: /* * Update on-disk size after IO is submitted. Races with * truncate are avoided by checking i_size under i_data_sem. -- cgit v1.2.3 From 022eaa7517017efe4f6538750c2b59a804dc7df7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 27 Aug 2014 18:40:05 -0400 Subject: jbd2: fix infinite loop when recovering corrupt journal blocks When recovering the journal, don't fall into an infinite loop if we encounter a corrupt journal block. Instead, just skip the block and return an error, which fails the mount and thus forces the user to run a full filesystem fsck. Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/jbd2/recovery.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 3b6bb19d60b1..00e9703d7dc6 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -426,6 +426,7 @@ static int do_one_pass(journal_t *journal, int tag_bytes = journal_tag_bytes(journal); __u32 crc32_sum = ~0; /* Transactional Checksums */ int descr_csum_size = 0; + int block_error = 0; /* * First thing is to establish what we expect to find in the log @@ -598,7 +599,8 @@ static int do_one_pass(journal_t *journal, "checksum recovering " "block %llu in log\n", blocknr); - continue; + block_error = 1; + goto skip_write; } /* Find a buffer for the new @@ -797,7 +799,8 @@ static int do_one_pass(journal_t *journal, success = -EIO; } } - + if (block_error && success == 0) + success = -EIO; return success; failed: -- cgit v1.2.3 From db9ee220361de03ee86388f9ea5e529eaad5323c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 27 Aug 2014 18:40:07 -0400 Subject: jbd2: fix descriptor block size handling errors with journal_csum It turns out that there are some serious problems with the on-disk format of journal checksum v2. The foremost is that the function to calculate descriptor tag size returns sizes that are too big. This causes alignment issues on some architectures and is compounded by the fact that some parts of jbd2 use the structure size (incorrectly) to determine the presence of a 64bit journal instead of checking the feature flags. Therefore, introduce journal checksum v3, which enlarges the descriptor block tag format to allow for full 32-bit checksums of journal blocks, fix the journal tag function to return the correct sizes, and fix the jbd2 recovery code to use feature flags to determine 64bitness. Add a few function helpers so we don't have to open-code quite so many pieces. Switching to a 16-byte block size was found to increase journal size overhead by a maximum of 0.1%, to convert a 32-bit journal with no checksumming to a 32-bit journal with checksum v3 enabled. Signed-off-by: Darrick J. Wong Reported-by: TR Reardon Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/super.c | 5 +++-- fs/jbd2/commit.c | 21 +++++++++++--------- fs/jbd2/journal.c | 56 ++++++++++++++++++++++++++++++++++------------------ fs/jbd2/recovery.c | 26 +++++++++++++----------- fs/jbd2/revoke.c | 6 +++--- include/linux/jbd2.h | 30 +++++++++++++++++++++++----- 6 files changed, 95 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 32b43ad154b9..0b28b36e7915 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3181,9 +3181,9 @@ static int set_journal_csum_feature_set(struct super_block *sb) if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { - /* journal checksum v2 */ + /* journal checksum v3 */ compat = 0; - incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2; + incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; } else { /* journal checksum v1 */ compat = JBD2_FEATURE_COMPAT_CHECKSUM; @@ -3205,6 +3205,7 @@ static int set_journal_csum_feature_set(struct super_block *sb) jbd2_journal_clear_features(sbi->s_journal, JBD2_FEATURE_COMPAT_CHECKSUM, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | + JBD2_FEATURE_INCOMPAT_CSUM_V3 | JBD2_FEATURE_INCOMPAT_CSUM_V2); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6fac74349856..b73e0215baa7 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -97,7 +97,7 @@ static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh) struct commit_header *h; __u32 csum; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return; h = (struct commit_header *)(bh->b_data); @@ -313,11 +313,11 @@ static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) return checksum; } -static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, +static void write_tag_block(journal_t *j, journal_block_tag_t *tag, unsigned long long block) { tag->t_blocknr = cpu_to_be32(block & (u32)~0); - if (tag_bytes > JBD2_TAG_SIZE32) + if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT)) tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); } @@ -327,7 +327,7 @@ static void jbd2_descr_block_csum_set(journal_t *j, struct jbd2_journal_block_tail *tail; __u32 csum; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return; tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - @@ -340,12 +340,13 @@ static void jbd2_descr_block_csum_set(journal_t *j, static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, struct buffer_head *bh, __u32 sequence) { + journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; struct page *page = bh->b_page; __u8 *addr; __u32 csum32; __be32 seq; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return; seq = cpu_to_be32(sequence); @@ -355,8 +356,10 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, bh->b_size); kunmap_atomic(addr); - /* We only have space to store the lower 16 bits of the crc32c. */ - tag->t_checksum = cpu_to_be16(csum32); + if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3)) + tag3->t_checksum = cpu_to_be32(csum32); + else + tag->t_checksum = cpu_to_be16(csum32); } /* * jbd2_journal_commit_transaction @@ -396,7 +399,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) LIST_HEAD(io_bufs); LIST_HEAD(log_bufs); - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (jbd2_journal_has_csum_v2or3(journal)) csum_size = sizeof(struct jbd2_journal_block_tail); /* @@ -690,7 +693,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) tag_flag |= JBD2_FLAG_SAME_UUID; tag = (journal_block_tag_t *) tagp; - write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); + write_tag_block(journal, tag, jh2bh(jh)->b_blocknr); tag->t_flags = cpu_to_be16(tag_flag); jbd2_block_tag_csum_set(journal, tag, wbuf[bufs], commit_transaction->t_tid); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 67b8e303946c..19d74d86d99c 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -124,7 +124,7 @@ EXPORT_SYMBOL(__jbd2_debug); /* Checksumming functions */ static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) { - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return 1; return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; @@ -145,7 +145,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) { - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return 1; return sb->s_checksum == jbd2_superblock_csum(j, sb); @@ -153,7 +153,7 @@ static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) { - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return; sb->s_checksum = jbd2_superblock_csum(j, sb); @@ -1522,21 +1522,29 @@ static int journal_get_superblock(journal_t *journal) goto out; } - if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) && - JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { + if (jbd2_journal_has_csum_v2or3(journal) && + JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { /* Can't have checksum v1 and v2 on at the same time! */ printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 " "at the same time!\n"); goto out; } + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) && + JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) { + /* Can't have checksum v2 and v3 at the same time! */ + printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 " + "at the same time!\n"); + goto out; + } + if (!jbd2_verify_csum_type(journal, sb)) { printk(KERN_ERR "JBD2: Unknown checksum type\n"); goto out; } /* Load the checksum driver */ - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { + if (jbd2_journal_has_csum_v2or3(journal)) { journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(journal->j_chksum_driver)) { printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); @@ -1553,7 +1561,7 @@ static int journal_get_superblock(journal_t *journal) } /* Precompute checksum seed for all metadata */ - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (jbd2_journal_has_csum_v2or3(journal)) journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, sizeof(sb->s_uuid)); @@ -1813,8 +1821,14 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, if (!jbd2_journal_check_available_features(journal, compat, ro, incompat)) return 0; - /* Asking for checksumming v2 and v1? Only give them v2. */ - if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2 && + /* If enabling v2 checksums, turn on v3 instead */ + if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) { + incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2; + incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3; + } + + /* Asking for checksumming v3 and v1? Only give them v3. */ + if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 && compat & JBD2_FEATURE_COMPAT_CHECKSUM) compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM; @@ -1823,8 +1837,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, sb = journal->j_superblock; - /* If enabling v2 checksums, update superblock */ - if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V2)) { + /* If enabling v3 checksums, update superblock */ + if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { sb->s_checksum_type = JBD2_CRC32C_CHKSUM; sb->s_feature_compat &= ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); @@ -1842,8 +1856,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, } /* Precompute checksum seed for all metadata */ - if (JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (jbd2_journal_has_csum_v2or3(journal)) journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, sizeof(sb->s_uuid)); @@ -1852,7 +1865,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, /* If enabling v1 checksums, downgrade superblock */ if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM)) sb->s_feature_incompat &= - ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2); + ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 | + JBD2_FEATURE_INCOMPAT_CSUM_V3); sb->s_feature_compat |= cpu_to_be32(compat); sb->s_feature_ro_compat |= cpu_to_be32(ro); @@ -2165,16 +2179,20 @@ int jbd2_journal_blocks_per_page(struct inode *inode) */ size_t journal_tag_bytes(journal_t *journal) { - journal_block_tag_t tag; - size_t x = 0; + size_t sz; + + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) + return sizeof(journal_block_tag3_t); + + sz = sizeof(journal_block_tag_t); if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) - x += sizeof(tag.t_checksum); + sz += sizeof(__u16); if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) - return x + JBD2_TAG_SIZE64; + return sz; else - return x + JBD2_TAG_SIZE32; + return sz - sizeof(__u32); } /* diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 00e9703d7dc6..9b329b55ffe3 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -181,7 +181,7 @@ static int jbd2_descr_block_csum_verify(journal_t *j, __be32 provided; __u32 calculated; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return 1; tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize - @@ -205,7 +205,7 @@ static int count_tags(journal_t *journal, struct buffer_head *bh) int nr = 0, size = journal->j_blocksize; int tag_bytes = journal_tag_bytes(journal); - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (jbd2_journal_has_csum_v2or3(journal)) size -= sizeof(struct jbd2_journal_block_tail); tagp = &bh->b_data[sizeof(journal_header_t)]; @@ -338,10 +338,11 @@ int jbd2_journal_skip_recovery(journal_t *journal) return err; } -static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag) +static inline unsigned long long read_tag_block(journal_t *journal, + journal_block_tag_t *tag) { unsigned long long block = be32_to_cpu(tag->t_blocknr); - if (tag_bytes > JBD2_TAG_SIZE32) + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32; return block; } @@ -384,7 +385,7 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) __be32 provided; __u32 calculated; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return 1; h = buf; @@ -399,17 +400,21 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, void *buf, __u32 sequence) { + journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; __u32 csum32; __be32 seq; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return 1; seq = cpu_to_be32(sequence); csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); - return tag->t_checksum == cpu_to_be16(csum32); + if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3)) + return tag3->t_checksum == cpu_to_be32(csum32); + else + return tag->t_checksum == cpu_to_be16(csum32); } static int do_one_pass(journal_t *journal, @@ -513,8 +518,7 @@ static int do_one_pass(journal_t *journal, switch(blocktype) { case JBD2_DESCRIPTOR_BLOCK: /* Verify checksum first */ - if (JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (jbd2_journal_has_csum_v2or3(journal)) descr_csum_size = sizeof(struct jbd2_journal_block_tail); if (descr_csum_size > 0 && @@ -575,7 +579,7 @@ static int do_one_pass(journal_t *journal, unsigned long long blocknr; J_ASSERT(obh != NULL); - blocknr = read_tag_block(tag_bytes, + blocknr = read_tag_block(journal, tag); /* If the block has been @@ -814,7 +818,7 @@ static int jbd2_revoke_block_csum_verify(journal_t *j, __be32 provided; __u32 calculated; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return 1; tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize - diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 198c9c10276d..d5e95a175c92 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -91,8 +91,8 @@ #include #include #include -#endif #include +#endif static struct kmem_cache *jbd2_revoke_record_cache; static struct kmem_cache *jbd2_revoke_table_cache; @@ -597,7 +597,7 @@ static void write_one_revoke_record(journal_t *journal, offset = *offsetp; /* Do we need to leave space at the end for a checksum? */ - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (jbd2_journal_has_csum_v2or3(journal)) csum_size = sizeof(struct jbd2_journal_revoke_tail); /* Make sure we have a descriptor with space left for the record */ @@ -644,7 +644,7 @@ static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh) struct jbd2_journal_revoke_tail *tail; __u32 csum; - if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (!jbd2_journal_has_csum_v2or3(j)) return; tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize - diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d5b50a19463c..0dae71e9971c 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -159,7 +159,11 @@ typedef struct journal_header_s * journal_block_tag (in the descriptor). The other h_chksum* fields are * not used. * - * Checksum v1 and v2 are mutually exclusive features. + * If FEATURE_INCOMPAT_CSUM_V3 is set, the descriptor block uses + * journal_block_tag3_t to store a full 32-bit checksum. Everything else + * is the same as v2. + * + * Checksum v1, v2, and v3 are mutually exclusive features. */ struct commit_header { __be32 h_magic; @@ -179,6 +183,14 @@ struct commit_header { * raw struct shouldn't be used for pointer math or sizeof() - use * journal_tag_bytes(journal) instead to compute this. */ +typedef struct journal_block_tag3_s +{ + __be32 t_blocknr; /* The on-disk block number */ + __be32 t_flags; /* See below */ + __be32 t_blocknr_high; /* most-significant high 32bits. */ + __be32 t_checksum; /* crc32c(uuid+seq+block) */ +} journal_block_tag3_t; + typedef struct journal_block_tag_s { __be32 t_blocknr; /* The on-disk block number */ @@ -187,9 +199,6 @@ typedef struct journal_block_tag_s __be32 t_blocknr_high; /* most-significant high 32bits. */ } journal_block_tag_t; -#define JBD2_TAG_SIZE32 (offsetof(journal_block_tag_t, t_blocknr_high)) -#define JBD2_TAG_SIZE64 (sizeof(journal_block_tag_t)) - /* Tail of descriptor block, for checksumming */ struct jbd2_journal_block_tail { __be32 t_checksum; /* crc32c(uuid+descr_block) */ @@ -284,6 +293,7 @@ typedef struct journal_superblock_s #define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002 #define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004 #define JBD2_FEATURE_INCOMPAT_CSUM_V2 0x00000008 +#define JBD2_FEATURE_INCOMPAT_CSUM_V3 0x00000010 /* Features known to this kernel version: */ #define JBD2_KNOWN_COMPAT_FEATURES JBD2_FEATURE_COMPAT_CHECKSUM @@ -291,7 +301,8 @@ typedef struct journal_superblock_s #define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \ JBD2_FEATURE_INCOMPAT_64BIT | \ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \ - JBD2_FEATURE_INCOMPAT_CSUM_V2) + JBD2_FEATURE_INCOMPAT_CSUM_V2 | \ + JBD2_FEATURE_INCOMPAT_CSUM_V3) #ifdef __KERNEL__ @@ -1296,6 +1307,15 @@ static inline int tid_geq(tid_t x, tid_t y) extern int jbd2_journal_blocks_per_page(struct inode *inode); extern size_t journal_tag_bytes(journal_t *journal); +static inline int jbd2_journal_has_csum_v2or3(journal_t *journal) +{ + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) || + JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) + return 1; + + return 0; +} + /* * We reserve t_outstanding_credits >> JBD2_CONTROL_BLOCKS_SHIFT for * transaction control blocks. -- cgit v1.2.3 From d80d448c6c5bdd32605b78a60fe8081d82d4da0f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 27 Aug 2014 18:40:09 -0400 Subject: ext4: fix same-dir rename when inline data directory overflows When performing a same-directory rename, it's possible that adding or setting the new directory entry will cause the directory to overflow the inline data area, which causes the directory to be converted to an extent-based directory. Under this circumstance it is necessary to re-read the directory when deleting the old dirent because the "old directory" context still points to i_block in the inode table, which is now an extent tree root! The delete fails with an FS error, and the subsequent fsck complains about incorrect link counts and hardlinked directories. Test case (originally found with flat_dir_test in the metadata_csum test program): # mkfs.ext4 -O inline_data /dev/sda # mount /dev/sda /mnt # mkdir /mnt/x # touch /mnt/x/changelog.gz /mnt/x/copyright /mnt/x/README.Debian # sync # for i in /mnt/x/*; do mv $i $i.longer; done # ls -la /mnt/x/ total 0 -rw-r--r-- 1 root root 0 Aug 25 12:03 changelog.gz.longer -rw-r--r-- 1 root root 0 Aug 25 12:03 copyright -rw-r--r-- 1 root root 0 Aug 25 12:03 copyright.longer -rw-r--r-- 1 root root 0 Aug 25 12:03 README.Debian.longer (Hey! Why are there four files now??) Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/namei.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index ae7088b446d1..90a3cdca3f88 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3147,7 +3147,8 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, return retval; } -static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent) +static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent, + int force_reread) { int retval; /* @@ -3159,7 +3160,8 @@ static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent) if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino || ent->de->name_len != ent->dentry->d_name.len || strncmp(ent->de->name, ent->dentry->d_name.name, - ent->de->name_len)) { + ent->de->name_len) || + force_reread) { retval = ext4_find_delete_entry(handle, ent->dir, &ent->dentry->d_name); } else { @@ -3210,6 +3212,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, .dentry = new_dentry, .inode = new_dentry->d_inode, }; + int force_reread; int retval; dquot_initialize(old.dir); @@ -3271,6 +3274,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (retval) goto end_rename; } + /* + * If we're renaming a file within an inline_data dir and adding or + * setting the new dirent causes a conversion from inline_data to + * extents/blockmap, we need to force the dirent delete code to + * re-read the directory, or else we end up trying to delete a dirent + * from what is now the extent tree root (or a block map). + */ + force_reread = (new.dir->i_ino == old.dir->i_ino && + ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA)); if (!new.bh) { retval = ext4_add_entry(handle, new.dentry, old.inode); if (retval) @@ -3281,6 +3293,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (retval) goto end_rename; } + if (force_reread) + force_reread = !ext4_test_inode_flag(new.dir, + EXT4_INODE_INLINE_DATA); /* * Like most other Unix systems, set the ctime for inodes on a @@ -3292,7 +3307,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, /* * ok, that's it */ - ext4_rename_delete(handle, &old); + ext4_rename_delete(handle, &old, force_reread); if (new.inode) { ext4_dec_count(handle, new.inode); -- cgit v1.2.3 From 3304b56401c4509ffaa74705b49edc9e13cee195 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 29 Aug 2014 00:26:50 -0700 Subject: f2fs: fix wrong casting for dentry name The dentry name type is unsigned char *. If we don't match this type, some character codes can be changed by signed bit. Signed-off-by: Jaegeuk Kim --- fs/f2fs/hash.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 948d17bf7281..a844fcfb9a8d 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -42,7 +42,8 @@ static void TEA_transform(unsigned int buf[4], unsigned int const in[]) buf[1] += b1; } -static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num) +static void str2hashbuf(const unsigned char *msg, size_t len, + unsigned int *buf, int num) { unsigned pad, val; int i; @@ -73,9 +74,9 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) { __u32 hash; f2fs_hash_t f2fs_hash; - const char *p; + const unsigned char *p; __u32 in[8], buf[4]; - const char *name = name_info->name; + const unsigned char *name = name_info->name; size_t len = name_info->len; if ((len <= 2) && (name[0] == '.') && -- cgit v1.2.3 From 2b462638e41ea62230297c21c4da9955937b7a3c Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 29 Aug 2014 15:18:58 -0700 Subject: ocfs2: do not write error flag to user structure we cannot copy from/to If we failed to copy from the structure, writing back the flags leaks 31 bits of kernel memory (the rest of the ir_flags field). In any case, if we cannot copy from/to the structure, why should we expect putting just the flags to work? Also make sure ocfs2_info_handle_freeinode() returns the right error code if the copy_to_user() fails. Fixes: ddee5cdb70e6 ('Ocfs2: Add new OCFS2_IOC_INFO ioctl for ocfs2 v8.') Signed-off-by: Ben Hutchings Cc: Joel Becker Acked-by: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/ioctl.c | 129 +++++++++++++++++++------------------------------------ 1 file changed, 43 insertions(+), 86 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 6f66b3751ace..53e6c40ed4c6 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -35,9 +35,8 @@ copy_to_user((typeof(a) __user *)b, &(a), sizeof(a)) /* - * This call is void because we are already reporting an error that may - * be -EFAULT. The error will be returned from the ioctl(2) call. It's - * just a best-effort to tell userspace that this request caused the error. + * This is just a best-effort to tell userspace that this request + * caused the error. */ static inline void o2info_set_request_error(struct ocfs2_info_request *kreq, struct ocfs2_info_request __user *req) @@ -146,136 +145,105 @@ bail: static int ocfs2_info_handle_blocksize(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_blocksize oib; if (o2info_from_user(oib, req)) - goto bail; + return -EFAULT; oib.ib_blocksize = inode->i_sb->s_blocksize; o2info_set_request_filled(&oib.ib_req); if (o2info_to_user(oib, req)) - goto bail; - - status = 0; -bail: - if (status) - o2info_set_request_error(&oib.ib_req, req); + return -EFAULT; - return status; + return 0; } static int ocfs2_info_handle_clustersize(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_clustersize oic; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oic, req)) - goto bail; + return -EFAULT; oic.ic_clustersize = osb->s_clustersize; o2info_set_request_filled(&oic.ic_req); if (o2info_to_user(oic, req)) - goto bail; - - status = 0; -bail: - if (status) - o2info_set_request_error(&oic.ic_req, req); + return -EFAULT; - return status; + return 0; } static int ocfs2_info_handle_maxslots(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_maxslots oim; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oim, req)) - goto bail; + return -EFAULT; oim.im_max_slots = osb->max_slots; o2info_set_request_filled(&oim.im_req); if (o2info_to_user(oim, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oim.im_req, req); - - return status; + return 0; } static int ocfs2_info_handle_label(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_label oil; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oil, req)) - goto bail; + return -EFAULT; memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); o2info_set_request_filled(&oil.il_req); if (o2info_to_user(oil, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oil.il_req, req); - - return status; + return 0; } static int ocfs2_info_handle_uuid(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_uuid oiu; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oiu, req)) - goto bail; + return -EFAULT; memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); o2info_set_request_filled(&oiu.iu_req); if (o2info_to_user(oiu, req)) - goto bail; - - status = 0; -bail: - if (status) - o2info_set_request_error(&oiu.iu_req, req); + return -EFAULT; - return status; + return 0; } static int ocfs2_info_handle_fs_features(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_fs_features oif; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oif, req)) - goto bail; + return -EFAULT; oif.if_compat_features = osb->s_feature_compat; oif.if_incompat_features = osb->s_feature_incompat; @@ -284,39 +252,28 @@ static int ocfs2_info_handle_fs_features(struct inode *inode, o2info_set_request_filled(&oif.if_req); if (o2info_to_user(oif, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oif.if_req, req); - - return status; + return 0; } static int ocfs2_info_handle_journal_size(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_journal_size oij; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oij, req)) - goto bail; + return -EFAULT; oij.ij_journal_size = i_size_read(osb->journal->j_inode); o2info_set_request_filled(&oij.ij_req); if (o2info_to_user(oij, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oij.ij_req, req); - - return status; + return 0; } static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, @@ -373,7 +330,7 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, u32 i; u64 blkno = -1; char namebuf[40]; - int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE; + int status, type = INODE_ALLOC_SYSTEM_INODE; struct ocfs2_info_freeinode *oifi = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *inode_alloc = NULL; @@ -385,8 +342,10 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, goto out_err; } - if (o2info_from_user(*oifi, req)) - goto bail; + if (o2info_from_user(*oifi, req)) { + status = -EFAULT; + goto out_free; + } oifi->ifi_slotnum = osb->max_slots; @@ -424,14 +383,16 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, o2info_set_request_filled(&oifi->ifi_req); - if (o2info_to_user(*oifi, req)) - goto bail; + if (o2info_to_user(*oifi, req)) { + status = -EFAULT; + goto out_free; + } status = 0; bail: if (status) o2info_set_request_error(&oifi->ifi_req, req); - +out_free: kfree(oifi); out_err: return status; @@ -658,7 +619,7 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, { u64 blkno = -1; char namebuf[40]; - int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE; + int status, type = GLOBAL_BITMAP_SYSTEM_INODE; struct ocfs2_info_freefrag *oiff; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -671,8 +632,10 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, goto out_err; } - if (o2info_from_user(*oiff, req)) - goto bail; + if (o2info_from_user(*oiff, req)) { + status = -EFAULT; + goto out_free; + } /* * chunksize from userspace should be power of 2. */ @@ -711,14 +674,14 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, if (o2info_to_user(*oiff, req)) { status = -EFAULT; - goto bail; + goto out_free; } status = 0; bail: if (status) o2info_set_request_error(&oiff->iff_req, req); - +out_free: kfree(oiff); out_err: return status; @@ -727,23 +690,17 @@ out_err: static int ocfs2_info_handle_unknown(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_request oir; if (o2info_from_user(oir, req)) - goto bail; + return -EFAULT; o2info_clear_request_filled(&oir); if (o2info_to_user(oir, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oir, req); - - return status; + return 0; } /* -- cgit v1.2.3 From c43c363def04cdaed0d9e26dae846081f55714e7 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Fri, 29 Aug 2014 15:19:00 -0700 Subject: ocfs2: o2net: don't shutdown connection when idle timeout This patch series is to fix a possible message lost bug in ocfs2 when network go bad. This bug will cause ocfs2 hung forever even network become good again. The messages may lost in this case. After the tcp connection is established between two nodes, an idle timer will be set to check its state periodically, if no messages are received during this time, idle timer will timeout, it will shutdown the connection and try to reconnect, so pending messages in tcp queues will be lost. This messages may be from dlm. Dlm may get hung in this case. This may cause the whole ocfs2 cluster hung. This is very possible to happen when network state goes bad. Do the reconnect is useless, it will fail if network state is still bad. Just waiting there for network recovering may be a good idea, it will not lost messages and some node will be fenced until cluster goes into split-brain state, for this case, Tcp user timeout is used to override the tcp retransmit timeout. It will timeout after 25 days, user should have notice this through the provided log and fix the network, if they don't, ocfs2 will fall back to original reconnect way. This patch (of 3): Some messages in the tcp queue maybe lost if we shutdown the connection and reconnect when idle timeout. If packets lost and reconnect success, then the ocfs2 cluster maybe hung. To fix this, we can leave the connection there and do the fence decision when idle timeout, if network recover before fence dicision is made, the connection survive without lost any messages. This bug can be saw when network state go bad. It may cause ocfs2 hung forever if some packets lost. With this fix, ocfs2 will recover from hung if network becomes good again. Signed-off-by: Junxiao Bi Reviewed-by: Srinivas Eeda Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 681691bc233a..2334bfc966c1 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1536,16 +1536,20 @@ static void o2net_idle_timer(unsigned long data) #endif printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been " - "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc), - msecs / 1000, msecs % 1000); + "idle for %lu.%lu secs.\n", + SC_NODEF_ARGS(sc), msecs / 1000, msecs % 1000); - /* - * Initialize the nn_timeout so that the next connection attempt - * will continue in o2net_start_connect. + /* idle timerout happen, don't shutdown the connection, but + * make fence decision. Maybe the connection can recover before + * the decision is made. */ atomic_set(&nn->nn_timeout, 1); + o2quo_conn_err(o2net_num_from_nn(nn)); + queue_delayed_work(o2net_wq, &nn->nn_still_up, + msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); + + o2net_sc_reset_idle_timer(sc); - o2net_sc_queue_work(sc, &sc->sc_shutdown_work); } static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) @@ -1560,6 +1564,15 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) { + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + + /* clear fence decision since the connection recover from timeout*/ + if (atomic_read(&nn->nn_timeout)) { + o2quo_conn_up(o2net_num_from_nn(nn)); + cancel_delayed_work(&nn->nn_still_up); + atomic_set(&nn->nn_timeout, 0); + } + /* Only push out an existing timer */ if (timer_pending(&sc->sc_idle_timeout)) o2net_sc_reset_idle_timer(sc); -- cgit v1.2.3 From 8e9801dfe37c9e68cdbfcd15988df2187191864e Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Fri, 29 Aug 2014 15:19:02 -0700 Subject: ocfs2: o2net: set tcp user timeout to max value When tcp retransmit timeout(15mins), the connection will be closed. Pending messages may be lost during this time. So we set tcp user timeout to override the retransmit timeout to the max value. This is OK for ocfs2 since we have disk heartbeat, if peer crash, the disk heartbeat will timeout and it will be evicted, if disk heartbeat not timeout and connection idle for a long time, then this means the cluster enters split-brain state, since fence can't happen, we'd better keep the connection and wait network recover. Signed-off-by: Junxiao Bi Reviewed-by: Srinivas Eeda Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 20 ++++++++++++++++++++ fs/ocfs2/cluster/tcp.h | 1 + 2 files changed, 21 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 2334bfc966c1..ea34952f9496 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1480,6 +1480,14 @@ static int o2net_set_nodelay(struct socket *sock) return ret; } +static int o2net_set_usertimeout(struct socket *sock) +{ + int user_timeout = O2NET_TCP_USER_TIMEOUT; + + return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, + (char *)&user_timeout, sizeof(user_timeout)); +} + static void o2net_initialize_handshake(void) { o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( @@ -1663,6 +1671,12 @@ static void o2net_start_connect(struct work_struct *work) goto out; } + ret = o2net_set_usertimeout(sock); + if (ret) { + mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); + goto out; + } + o2net_register_callbacks(sc->sc_sock->sk, sc); spin_lock(&nn->nn_lock); @@ -1844,6 +1858,12 @@ static int o2net_accept_one(struct socket *sock, int *more) goto out; } + ret = o2net_set_usertimeout(new_sock); + if (ret) { + mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); + goto out; + } + slen = sizeof(sin); ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, &slen, 1); diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index 5bada2a69b50..c571e849fda4 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -63,6 +63,7 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data, #define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000 #define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000 +#define O2NET_TCP_USER_TIMEOUT 0x7fffffff /* TODO: figure this out.... */ static inline int o2net_link_down(int err, struct socket *sock) -- cgit v1.2.3 From 8c7b638cece146234b0c0d5f6ba84d1cf6f81e83 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Fri, 29 Aug 2014 15:19:04 -0700 Subject: ocfs2: quorum: add a log for node not fenced For debug use, we can see from the log whether the fence decision is made and why it is not fenced. Signed-off-by: Junxiao Bi Reviewed-by: Srinivas Eeda Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/quorum.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 1ec141e758d7..62e8ec619b4c 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -160,9 +160,18 @@ static void o2quo_make_decision(struct work_struct *work) } out: - spin_unlock(&qs->qs_lock); - if (fence) + if (fence) { + spin_unlock(&qs->qs_lock); o2quo_fence_self(); + } else { + mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, " + "connected: %d, lowest: %d (%sreachable)\n", + qs->qs_heartbeating, qs->qs_connected, lowest_hb, + lowest_reachable ? "" : "un"); + spin_unlock(&qs->qs_lock); + + } + } static void o2quo_set_hold(struct o2quo_state *qs, u8 node) -- cgit v1.2.3 From 88b368f27a094277143d8ecd5a056116f6a41520 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 18 Aug 2014 15:09:26 -0400 Subject: get rid of propagate_umount() mistakenly treating slaves as busy. The check in __propagate_umount() ("has somebody explicitly mounted something on that slave?") is done *before* taking the already doomed victims out of the child lists. Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/namespace.c | 4 +++- fs/pnode.c | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index a01c7730e9af..3273177873f0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1253,6 +1253,9 @@ void umount_tree(struct mount *mnt, int how) hlist_add_head(&p->mnt_hash, &tmp_list); } + hlist_for_each_entry(p, &tmp_list, mnt_hash) + list_del_init(&p->mnt_child); + if (how) propagate_umount(&tmp_list); @@ -1263,7 +1266,6 @@ void umount_tree(struct mount *mnt, int how) p->mnt_ns = NULL; if (how < 2) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; - list_del_init(&p->mnt_child); if (mnt_has_parent(p)) { put_mountpoint(p->mnt_mp); /* move the reference to mountpoint into ->mnt_ex_mountpoint */ diff --git a/fs/pnode.c b/fs/pnode.c index 302bf22c4a30..aae331a5d03b 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -381,6 +381,7 @@ static void __propagate_umount(struct mount *mnt) * other children */ if (child && list_empty(&child->mnt_mounts)) { + list_del_init(&child->mnt_child); hlist_del_init_rcu(&child->mnt_hash); hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash); } -- cgit v1.2.3 From 81b6b06197606b4bef4e427a197aeb808e8d89e1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 30 Aug 2014 18:32:05 -0400 Subject: fix EBUSY on umount() from MNT_SHRINKABLE We need the parents of victims alive until namespace_unlock() gets to dput() of the (ex-)mountpoints. However, that screws up the "is it busy" checks in case when we have shrinkable mounts that need to be killed. Solution: go ahead and decrement refcounts of parents right in umount_tree(), increment them again just before dropping rwsem in namespace_unlock() (and let the loop in the end of namespace_unlock() finally drop those references for good, as we do now). Parents can't get freed until we drop rwsem - at least one reference is kept until then, both in case when parent is among the victims and when it is not. So they'll still be around when we get to namespace_unlock(). Cc: stable@vger.kernel.org # 3.12+ Signed-off-by: Al Viro --- fs/namespace.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 3273177873f0..ef42d9bee212 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1217,6 +1217,11 @@ static void namespace_unlock(void) head.first->pprev = &head.first; INIT_HLIST_HEAD(&unmounted); + /* undo decrements we'd done in umount_tree() */ + hlist_for_each_entry(mnt, &head, mnt_hash) + if (mnt->mnt_ex_mountpoint.mnt) + mntget(mnt->mnt_ex_mountpoint.mnt); + up_write(&namespace_sem); synchronize_rcu(); @@ -1268,6 +1273,7 @@ void umount_tree(struct mount *mnt, int how) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; if (mnt_has_parent(p)) { put_mountpoint(p->mnt_mp); + mnt_add_count(p->mnt_parent, -1); /* move the reference to mountpoint into ->mnt_ex_mountpoint */ p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt; -- cgit v1.2.3 From 22e757a49cf010703fcb9c9b4ef793248c39b0c2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 2 Sep 2014 12:12:51 +1000 Subject: xfs: don't dirty buffers beyond EOF generic/263 is failing fsx at this point with a page spanning EOF that cannot be invalidated. The operations are: 1190 mapwrite 0x52c00 thru 0x5e569 (0xb96a bytes) 1191 mapread 0x5c000 thru 0x5d636 (0x1637 bytes) 1192 write 0x5b600 thru 0x771ff (0x1bc00 bytes) where 1190 extents EOF from 0x54000 to 0x5e569. When the direct IO write attempts to invalidate the cached page over this range, it fails with -EBUSY and so any attempt to do page invalidation fails. The real question is this: Why can't that page be invalidated after it has been written to disk and cleaned? Well, there's data on the first two buffers in the page (1k block size, 4k page), but the third buffer on the page (i.e. beyond EOF) is failing drop_buffers because it's bh->b_state == 0x3, which is BH_Uptodate | BH_Dirty. IOWs, there's dirty buffers beyond EOF. Say what? OK, set_buffer_dirty() is called on all buffers from __set_page_buffers_dirty(), regardless of whether the buffer is beyond EOF or not, which means that when we get to ->writepage, we have buffers marked dirty beyond EOF that we need to clean. So, we need to implement our own .set_page_dirty method that doesn't dirty buffers beyond EOF. This is messy because the buffer code is not meant to be shared and it has interesting locking issues on the buffer dirty bits. So just copy and paste it and then modify it to suit what we need. Note: the solutions the other filesystems and generic block code use of marking the buffers clean in ->writepage does not work for XFS. It still leaves dirty buffers beyond EOF and invalidations still fail. Hence rather than play whack-a-mole, this patch simply prevents those buffers from being dirtied in the first place. cc: Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 11e9b4caa54f..b984647c24db 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1753,11 +1753,72 @@ xfs_vm_readpages( return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); } +/* + * This is basically a copy of __set_page_dirty_buffers() with one + * small tweak: buffers beyond EOF do not get marked dirty. If we mark them + * dirty, we'll never be able to clean them because we don't write buffers + * beyond EOF, and that means we can't invalidate pages that span EOF + * that have been marked dirty. Further, the dirty state can leak into + * the file interior if the file is extended, resulting in all sorts of + * bad things happening as the state does not match the underlying data. + * + * XXX: this really indicates that bufferheads in XFS need to die. Warts like + * this only exist because of bufferheads and how the generic code manages them. + */ +STATIC int +xfs_vm_set_page_dirty( + struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + loff_t end_offset; + loff_t offset; + int newly_dirty; + + if (unlikely(!mapping)) + return !TestSetPageDirty(page); + + end_offset = i_size_read(inode); + offset = page_offset(page); + + spin_lock(&mapping->private_lock); + if (page_has_buffers(page)) { + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh = head; + + do { + if (offset < end_offset) + set_buffer_dirty(bh); + bh = bh->b_this_page; + offset += 1 << inode->i_blkbits; + } while (bh != head); + } + newly_dirty = !TestSetPageDirty(page); + spin_unlock(&mapping->private_lock); + + if (newly_dirty) { + /* sigh - __set_page_dirty() is static, so copy it here, too */ + unsigned long flags; + + spin_lock_irqsave(&mapping->tree_lock, flags); + if (page->mapping) { /* Race with truncate? */ + WARN_ON_ONCE(!PageUptodate(page)); + account_page_dirtied(page, mapping); + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + } + spin_unlock_irqrestore(&mapping->tree_lock, flags); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + } + return newly_dirty; +} + const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, .readpages = xfs_vm_readpages, .writepage = xfs_vm_writepage, .writepages = xfs_vm_writepages, + .set_page_dirty = xfs_vm_set_page_dirty, .releasepage = xfs_vm_releasepage, .invalidatepage = xfs_vm_invalidatepage, .write_begin = xfs_vm_write_begin, -- cgit v1.2.3 From 85e584da3212140ee80fd047f9058bbee0bc00d5 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 2 Sep 2014 12:12:52 +1000 Subject: xfs: don't zero partial page cache pages during O_DIRECT writes xfs is using truncate_pagecache_range to invalidate the page cache during DIO reads. This is different from the other filesystems who only invalidate pages during DIO writes. truncate_pagecache_range is meant to be used when we are freeing the underlying data structs from disk, so it will zero any partial ranges in the page. This means a DIO read can zero out part of the page cache page, and it is possible the page will stay in cache. buffered reads will find an up to date page with zeros instead of the data actually on disk. This patch fixes things by using invalidate_inode_pages2_range instead. It preserves the page cache invalidation, but won't zero any pages. [dchinner: catch error and warn if it fails. Comment.] cc: stable@vger.kernel.org Signed-off-by: Chris Mason Reviewed-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 076b1708d134..827cfb2451b1 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -296,7 +296,16 @@ xfs_file_read_iter( xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } - truncate_pagecache_range(VFS_I(ip), pos, -1); + + /* + * Invalidate whole pages. This can return an error if + * we fail to invalidate a page, but this should never + * happen on XFS. Warn if it does fail. + */ + ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, + pos >> PAGE_CACHE_SHIFT, -1); + WARN_ON_ONCE(ret); + ret = 0; } xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); } -- cgit v1.2.3 From 834ffca6f7e345a79f6f2e2d131b0dfba8a4b67a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 2 Sep 2014 12:12:52 +1000 Subject: xfs: don't zero partial page cache pages during O_DIRECT writes Similar to direct IO reads, direct IO writes are using truncate_pagecache_range to invalidate the page cache. This is incorrect due to the sub-block zeroing in the page cache that truncate_pagecache_range() triggers. This patch fixes things by using invalidate_inode_pages2_range instead. It preserves the page cache invalidation, but won't zero any pages. cc: stable@vger.kernel.org Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 827cfb2451b1..19917faac2d2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -644,7 +644,15 @@ xfs_file_dio_aio_write( pos, -1); if (ret) goto out; - truncate_pagecache_range(VFS_I(ip), pos, -1); + /* + * Invalidate whole pages. This can return an error if + * we fail to invalidate a page, but this should never + * happen on XFS. Warn if it does fail. + */ + ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, + pos >> PAGE_CACHE_SHIFT, -1); + WARN_ON_ONCE(ret); + ret = 0; } /* -- cgit v1.2.3 From 7d4ea3ce63a6bc532abb334c469c18481798af8c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 2 Sep 2014 12:12:53 +1000 Subject: xfs: use ranged writeback and invalidation for direct IO Now we are not doing silly things with dirtying buffers beyond EOF and using invalidation correctly, we can finally reduce the ranges of writeback and invalidation used by direct IO to match that of the IO being issued. Bring the writeback and invalidation ranges back to match the generic direct IO code - this will greatly reduce the perturbation of cached data when direct IO and buffered IO are mixed, but still provide the same buffered vs direct IO coherency behaviour we currently have. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 19917faac2d2..de5368c803f9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -291,7 +291,7 @@ xfs_file_read_iter( if (inode->i_mapping->nrpages) { ret = filemap_write_and_wait_range( VFS_I(ip)->i_mapping, - pos, -1); + pos, pos + size - 1); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; @@ -303,7 +303,8 @@ xfs_file_read_iter( * happen on XFS. Warn if it does fail. */ ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, - pos >> PAGE_CACHE_SHIFT, -1); + pos >> PAGE_CACHE_SHIFT, + (pos + size - 1) >> PAGE_CACHE_SHIFT); WARN_ON_ONCE(ret); ret = 0; } @@ -641,7 +642,7 @@ xfs_file_dio_aio_write( if (mapping->nrpages) { ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - pos, -1); + pos, pos + count - 1); if (ret) goto out; /* @@ -650,7 +651,8 @@ xfs_file_dio_aio_write( * happen on XFS. Warn if it does fail. */ ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, - pos >> PAGE_CACHE_SHIFT, -1); + pos >> PAGE_CACHE_SHIFT, + (pos + count - 1) >> PAGE_CACHE_SHIFT); WARN_ON_ONCE(ret); ret = 0; } -- cgit v1.2.3 From ca446d880c399bb31301e7d8eefbd7fe3c504c4e Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 2 Sep 2014 12:12:53 +1000 Subject: xfs: don't log inode unless extent shift makes extent modifications The file collapse mechanism uses xfs_bmap_shift_extents() to collapse all subsequent extents down into the specified, previously punched out, region. This function performs some validation, such as whether a sufficient hole exists in the target region of the collapse, then shifts the remaining exents downward. The exit path of the function currently logs the inode unconditionally. While we must log the inode (and abort) if an error occurs and the transaction is dirty, the initial validation paths can generate errors before the transaction has been dirtied. This creates an unnecessary filesystem shutdown scenario, as the caller will cancel a transaction that has been marked dirty. Modify xfs_bmap_shift_extents() to OR the logflags bits as modifications are made to the inode bmap. Only log the inode in the exit path if logflags has been set. This ensures we only have to cancel a dirty transaction if modifications have been made and prevents an unnecessary filesystem shutdown otherwise. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index de2d26d32844..86df952d3e24 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5424,7 +5424,7 @@ xfs_bmap_shift_extents( struct xfs_bmap_free *flist, int num_exts) { - struct xfs_btree_cur *cur; + struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_rec_host *gotp; struct xfs_bmbt_irec got; struct xfs_bmbt_irec left; @@ -5435,7 +5435,7 @@ xfs_bmap_shift_extents( int error = 0; int i; int whichfork = XFS_DATA_FORK; - int logflags; + int logflags = 0; xfs_filblks_t blockcount = 0; int total_extents; @@ -5478,16 +5478,11 @@ xfs_bmap_shift_extents( } } - /* We are going to change core inode */ - logflags = XFS_ILOG_CORE; if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; cur->bc_private.b.flist = flist; cur->bc_private.b.flags = 0; - } else { - cur = NULL; - logflags |= XFS_ILOG_DEXT; } /* @@ -5545,11 +5540,14 @@ xfs_bmap_shift_extents( blockcount = left.br_blockcount + got.br_blockcount; xfs_iext_remove(ip, *current_ext, 1, 0); + logflags |= XFS_ILOG_CORE; if (cur) { error = xfs_btree_delete(cur, &i); if (error) goto del_cursor; XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); + } else { + logflags |= XFS_ILOG_DEXT; } XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); @@ -5575,6 +5573,7 @@ xfs_bmap_shift_extents( got.br_startoff = startoff; } + logflags |= XFS_ILOG_CORE; if (cur) { error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, @@ -5582,6 +5581,8 @@ xfs_bmap_shift_extents( got.br_state); if (error) goto del_cursor; + } else { + logflags |= XFS_ILOG_DEXT; } (*current_ext)++; @@ -5597,6 +5598,7 @@ del_cursor: xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); - xfs_trans_log_inode(tp, ip, logflags); + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); return error; } -- cgit v1.2.3 From 1669a8ca2105968f660cf7d84ba38fd18075cd99 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 2 Sep 2014 12:12:53 +1000 Subject: xfs: xfs_file_collapse_range is delalloc challenged If we have delalloc extents on a file before we run a collapse range opertaion, we sync the range that we are going to collapse to convert delalloc extents in that region to real extents to simplify the shift operation. However, the shift operation then assumes that the extent list is not going to change as it iterates over the extent list moving things about. Unfortunately, this isn't true because we can't hold the ILOCK over all the operations. We can prevent new IO from modifying the extent list by holding the IOLOCK, but that doesn't prevent writeback from running.... And when writeback runs, it can convert delalloc extents is the range of the file prior to the region being collapsed, and this changes the indexes of all the extents in the file. That causes the collapse range operation to Go Bad. The right fix is to rewrite the extent shift operation not to be dependent on the extent list not changing across the entire operation, but this is a fairly significant piece of work to do. Hence, as a short-term workaround for the problem, sync the entire file before starting a collapse operation to remove all delalloc ranges from the file and so avoid the problem of concurrent writeback changing the extent list. Diagnosed-and-Reported-by: Brian Foster Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 2f1e30d39a35..76b6a2910e40 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1470,6 +1470,19 @@ xfs_collapse_file_space( start_fsb = XFS_B_TO_FSB(mp, offset + len); shift_fsb = XFS_B_TO_FSB(mp, len); + /* + * writeback the entire file to prevent concurrent writeback of ranges + * outside the collapsing region from changing the extent list. + * + * XXX: This is a temporary fix until the extent shift loop below is + * converted to use offsets and lookups within the ILOCK rather than + * carrying around the index into the extent list for the next + * iteration. + */ + error = filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (error) + return error; + error = xfs_free_file_space(ip, offset, len); if (error) return error; -- cgit v1.2.3 From 41b9d7263ea1e270019c5d04fa0ab15db50b9725 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 2 Sep 2014 12:12:53 +1000 Subject: xfs: trim eofblocks before collapse range xfs_collapse_file_space() currently writes back the entire file undergoing collapse range to settle things down for the extent shift algorithm. While this prevents changes to the extent list during the collapse operation, the writeback itself is not enough to prevent unnecessary collapse failures. The current shift algorithm uses the extent index to iterate the in-core extent list. If a post-eof delalloc extent persists after the writeback (e.g., a prior zero range op where the end of the range aligns with eof can separate the post-eof blocks such that they are not written back and converted), xfs_bmap_shift_extents() becomes confused over the encoded br_startblock value and fails the collapse. As with the full writeback, this is a temporary fix until the algorithm is improved to cope with a volatile extent list and avoid attempts to shift post-eof extents. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 76b6a2910e40..1707980f9a4b 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1471,8 +1471,10 @@ xfs_collapse_file_space( shift_fsb = XFS_B_TO_FSB(mp, len); /* - * writeback the entire file to prevent concurrent writeback of ranges - * outside the collapsing region from changing the extent list. + * Writeback the entire file and force remove any post-eof blocks. The + * writeback prevents changes to the extent list via concurrent + * writeback and the eofblocks trim prevents the extent shift algorithm + * from running into a post-eof delalloc extent. * * XXX: This is a temporary fix until the extent shift loop below is * converted to use offsets and lookups within the ILOCK rather than @@ -1482,6 +1484,11 @@ xfs_collapse_file_space( error = filemap_write_and_wait(VFS_I(ip)->i_mapping); if (error) return error; + if (xfs_can_free_eofblocks(ip, true)) { + error = xfs_free_eofblocks(mp, ip, false); + if (error) + return error; + } error = xfs_free_file_space(ip, offset, len); if (error) -- cgit v1.2.3 From b73e52824c8920a5ff754e3c8ff68466a7dd61f9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 30 Aug 2014 09:52:34 +0800 Subject: f2fs: reposition unlock_new_inode to prevent accessing invalid inode As the race condition on the inode cache, following scenario can appear: [Thread a] [Thread b] ->f2fs_mkdir ->f2fs_add_link ->__f2fs_add_link ->init_inode_metadata failed here ->gc_thread_func ->f2fs_gc ->do_garbage_collect ->gc_data_segment ->f2fs_iget ->iget_locked ->wait_on_inode ->unlock_new_inode ->move_data_page ->make_bad_inode ->iput When we fail in create/symlink/mkdir/mknod/tmpfile, the new allocated inode should be set as bad to avoid being accessed by other thread. But in above scenario, it allows f2fs to access the invalid inode before this inode was set as bad. This patch fix the potential problem, and this issue was found by code review. change log from v1: o Add condition judgment in gc_data_segment() suggested by Changman Lee. o use iget_failed to simplify code. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 2 +- fs/f2fs/namei.c | 20 +++++--------------- 2 files changed, 6 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e8507b1c8759..943a31db7cc3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -593,7 +593,7 @@ next_step: if (phase == 2) { inode = f2fs_iget(sb, dni.ino); - if (IS_ERR(inode)) + if (IS_ERR(inode) || is_bad_inode(inode)) continue; start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 6b53ce924d95..ee103fd7283c 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -134,9 +134,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, return 0; out: clear_nlink(inode); - unlock_new_inode(inode); - make_bad_inode(inode); - iput(inode); + iget_failed(inode); alloc_nid_failed(sbi, ino); return err; } @@ -267,9 +265,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, return err; out: clear_nlink(inode); - unlock_new_inode(inode); - make_bad_inode(inode); - iput(inode); + iget_failed(inode); alloc_nid_failed(sbi, inode->i_ino); return err; } @@ -308,9 +304,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) out_fail: clear_inode_flag(F2FS_I(inode), FI_INC_LINK); clear_nlink(inode); - unlock_new_inode(inode); - make_bad_inode(inode); - iput(inode); + iget_failed(inode); alloc_nid_failed(sbi, inode->i_ino); return err; } @@ -354,9 +348,7 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, return 0; out: clear_nlink(inode); - unlock_new_inode(inode); - make_bad_inode(inode); - iput(inode); + iget_failed(inode); alloc_nid_failed(sbi, inode->i_ino); return err; } @@ -688,9 +680,7 @@ release_out: out: f2fs_unlock_op(sbi); clear_nlink(inode); - unlock_new_inode(inode); - make_bad_inode(inode); - iput(inode); + iget_failed(inode); alloc_nid_failed(sbi, inode->i_ino); return err; } -- cgit v1.2.3 From 2ff396be602f10b5eab8e73b24f20348fa2de159 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Tue, 2 Sep 2014 13:17:00 -0400 Subject: aio: add missing smp_rmb() in read_events_ring We ran into a case on ppc64 running mariadb where io_getevents would return zeroed out I/O events. After adding instrumentation, it became clear that there was some missing synchronization between reading the tail pointer and the events themselves. This small patch fixes the problem in testing. Thanks to Zach for helping to look into this, and suggesting the fix. Signed-off-by: Jeff Moyer Signed-off-by: Benjamin LaHaise Cc: stable@vger.kernel.org --- fs/aio.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 97bc62cbe2da..5f2e9c6c328e 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1111,6 +1111,12 @@ static long aio_read_events_ring(struct kioctx *ctx, tail = ring->tail; kunmap_atomic(ring); + /* + * Ensure that once we've read the current tail pointer, that + * we also see the events that were stored up to the tail. + */ + smp_rmb(); + pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events); if (head == tail) -- cgit v1.2.3 From d9f85963e3f7f5582552fdae54a2b89d6c62daf5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 25 Aug 2014 10:43:00 +0100 Subject: Btrfs: fix corruption after write/fsync failure + fsync + log recovery While writing to a file, in inode.c:cow_file_range() (and same applies to submit_compressed_extents()), after reserving an extent for the file data, we create a new extent map for the written range and insert it into the extent map cache. After that, we create an ordered operation, but if it fails (due to a transient/temporary-ENOMEM), we return without dropping that extent map, which points to a reserved extent that is freed when we return. A subsequent incremental fsync (when the btrfs inode doesn't have the flag BTRFS_INODE_NEEDS_FULL_SYNC) considers this extent map valid and logs a file extent item based on that extent map, which points to a disk extent that doesn't contain valid data - it was freed by us earlier, at this point it might contain any random/garbage data. Therefore, if we reach an error condition when cowing a file range after we added the new extent map to the cache, drop it from the cache before returning. Some sequence of steps that lead to this: $ mkfs.btrfs -f /dev/sdd $ mount -o commit=9999 /dev/sdd /mnt $ cd /mnt $ xfs_io -f -c "pwrite -S 0x01 -b 4096 0 4096" -c "fsync" foo $ xfs_io -c "pwrite -S 0x02 -b 4096 4096 4096" $ sync $ od -t x1 foo 0000000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 * 0010000 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 * 0020000 $ xfs_io -c "pwrite -S 0xa1 -b 4096 0 4096" foo # Now this write + fsync fail with -ENOMEM, which was returned by # btrfs_add_ordered_extent() in inode.c:cow_file_range(). $ xfs_io -c "pwrite -S 0xff -b 4096 4096 4096" foo $ xfs_io -c "fsync" foo fsync: Cannot allocate memory # Now do a new write + fsync, which will succeed. Our previous # -ENOMEM was a transient/temporary error. $ xfs_io -c "pwrite -S 0xee -b 4096 16384 4096" foo $ xfs_io -c "fsync" foo # Our file content (in page cache) is now: $ od -t x1 foo 0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 * 0010000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff * 0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee * 0050000 # Now reboot the machine, and mount the fs, so that fsync log replay # takes place. # The file content is now weird, in particular the first 8Kb, which # do not match our data before nor after the sync command above. $ od -t x1 foo 0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee * 0010000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 * 0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee * 0050000 # In fact these first 4Kb are a duplicate of the last 4kb block. # The last write got an extent map/file extent item that points to # the same disk extent that we got in the write+fsync that failed # with the -ENOMEM error. btrfs-debug-tree and btrfsck allow us to # verify that: $ btrfs-debug-tree /dev/sdd (...) item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53 extent data disk byte 12582912 nr 8192 extent data offset 0 nr 8192 ram 8192 item 7 key (257 EXTENT_DATA 8192) itemoff 15766 itemsize 53 extent data disk byte 0 nr 0 extent data offset 0 nr 8192 ram 8192 item 8 key (257 EXTENT_DATA 16384) itemoff 15713 itemsize 53 extent data disk byte 12582912 nr 4096 extent data offset 0 nr 4096 ram 4096 $ umount /dev/sdd $ btrfsck /dev/sdd Checking filesystem on /dev/sdd UUID: db5e60e1-050d-41e6-8c7f-3d742dea5d8f checking extents extent item 12582912 has multiple extent items ref mismatch on [12582912 4096] extent item 1, found 2 Backref bytes do not match extent backref, bytenr=12582912, ref bytes=4096, backref bytes=8192 backpointer mismatch on [12582912 4096] Errors found in extent allocation tree or chunk allocation checking free space cache checking fs roots root 5 inode 257 errors 1000, some csum missing found 131074 bytes used err is 1 total csum bytes: 4 total tree bytes: 131072 total fs tree bytes: 32768 total extent tree bytes: 16384 btree space waste bytes: 123404 file data blocks allocated: 274432 referenced 274432 Btrfs v3.14.1-96-gcc7fd5a-dirty Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3d020d6d9ace..7313571e1860 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -778,8 +778,12 @@ retry: ins.offset, BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); - if (ret) + if (ret) { + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); goto out_free_reserve; + } /* * clear dirty, set writeback and unlock the pages. @@ -971,14 +975,14 @@ static noinline int cow_file_range(struct inode *inode, ret = btrfs_add_ordered_extent(inode, start, ins.objectid, ram_size, cur_alloc_size, 0); if (ret) - goto out_reserve; + goto out_drop_extent_cache; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); if (ret) - goto out_reserve; + goto out_drop_extent_cache; } if (disk_num_bytes < cur_alloc_size) @@ -1006,6 +1010,8 @@ static noinline int cow_file_range(struct inode *inode, out: return ret; +out_drop_extent_cache: + btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); out_reserve: btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_unlock: -- cgit v1.2.3 From dac5705cad20070a70bb028ca52e1f0bc157b42d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 29 Aug 2014 20:54:26 +0100 Subject: Btrfs: fix crash while doing a ranged fsync While doing a ranged fsync, that is, one whose range doesn't cover the whole possible file range (0 to LLONG_MAX), we can crash under certain circumstances with a trace like the following: [41074.641913] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC (...) [41074.642692] CPU: 0 PID: 24580 Comm: fsx Not tainted 3.16.0-fdm-btrfs-next-45+ #1 (...) [41074.643886] RIP: 0010:[] [] btrfs_ordered_update_i_size+0x279/0x2b0 [btrfs] (...) [41074.644919] Stack: (...) [41074.644919] Call Trace: [41074.644919] [] btrfs_truncate_inode_items+0x3f1/0xa10 [btrfs] [41074.644919] [] ? btrfs_get_logged_extents+0x4f/0x80 [btrfs] [41074.644919] [] btrfs_log_inode+0x2f9/0x970 [btrfs] [41074.644919] [] ? sched_clock_local+0x25/0xa0 [41074.644919] [] ? mutex_unlock+0xe/0x10 [41074.644919] [] ? trace_hardirqs_on+0xd/0x10 [41074.644919] [] btrfs_log_inode_parent+0x1ef/0x560 [btrfs] [41074.644919] [] ? dget_parent+0x5/0x180 [41074.644919] [] btrfs_log_dentry_safe+0x51/0x80 [btrfs] [41074.644919] [] btrfs_sync_file+0x1ba/0x3e0 [btrfs] [41074.644919] [] vfs_fsync_range+0x1b/0x30 (...) The necessary conditions that lead to such crash are: * an incremental fsync (when the inode doesn't have the BTRFS_INODE_NEEDS_FULL_SYNC flag set) happened for our file and it logged a file extent item ending at offset X; * the file got the flag BTRFS_INODE_NEEDS_FULL_SYNC set in its inode, due to a file truncate operation that reduces the file to a size smaller than X; * a ranged fsync call happens (via an msync for example), with a range that doesn't cover the whole file and the end of this range, lets call it Y, is smaller than X; * btrfs_log_inode, sees the flag BTRFS_INODE_NEEDS_FULL_SYNC set and calls btrfs_truncate_inode_items() to remove all items from the log tree that are associated with our file; * btrfs_truncate_inode_items() removes all of the inode's items, and the lowest file extent item it removed is the one ending at offset X, where X > 0 and X > Y - before returning, it calls btrfs_ordered_update_i_size() with an offset parameter set to X; * btrfs_ordered_update_i_size() sees that X is greater then the current ordered size (btrfs_inode's disk_i_size) and then it assumes there can't be any ongoing ordered operation with a range covering the offset X, calling a BUG_ON() if such ordered operation exists. This assumption is made because the disk_i_size is only increased after the corresponding file extent item is added to the btree (btrfs_finish_ordered_io); * But because our fsync covers only a limited range, such an ordered extent might exist, and our fsync callback (btrfs_sync_file) doesn't wait for such ordered extent to finish when calling btrfs_wait_ordered_range(); And then by the time btrfs_ordered_update_i_size() is called, via: btrfs_sync_file() -> btrfs_log_dentry_safe() -> btrfs_log_inode_parent() -> btrfs_log_inode() -> btrfs_truncate_inode_items() -> btrfs_ordered_update_i_size() We hit the BUG_ON(), which could never happen if the fsync range covered the whole possible file range (0 to LLONG_MAX), as we would wait for all ordered extents to finish before calling btrfs_truncate_inode_items(). So just don't call btrfs_ordered_update_i_size() if we're removing the inode's items from a log tree, which isn't supposed to change the in memory inode's disk_i_size. Issue found while running xfstests/generic/127 (happens very rarely for me), more specifically via the fsx calls that use memory mapped IO (and issue msync calls). Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7313571e1860..88823f4ca451 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4248,7 +4248,8 @@ out: btrfs_abort_transaction(trans, root, ret); } error: - if (last_size != (u64)-1) + if (last_size != (u64)-1 && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) btrfs_ordered_update_i_size(inode, last_size, NULL); btrfs_free_path(path); return err; -- cgit v1.2.3 From a9cfcd63e8d206ce4235c355d857c4fbdf0f4587 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 3 Sep 2014 09:33:00 -0400 Subject: ext4: avoid trying to kfree an ERR_PTR pointer Thanks to Dan Carpenter for extending smatch to find bugs like this. (This was found using a development version of smatch.) Fixes: 36de928641ee48b2078d3fe9514242aaa2f92013 Reported-by: Dan Carpenter Cc: stable@vger.kernel.org --- fs/ext4/namei.c | 2 ++ fs/ext4/resize.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 90a3cdca3f88..603e4ebbd0ac 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3240,6 +3240,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, &new.de, &new.inlined); if (IS_ERR(new.bh)) { retval = PTR_ERR(new.bh); + new.bh = NULL; goto end_rename; } if (new.bh) { @@ -3386,6 +3387,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, &new.de, &new.inlined); if (IS_ERR(new.bh)) { retval = PTR_ERR(new.bh); + new.bh = NULL; goto end_rename; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index bb0e80f03e2e..1e43b905ff98 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -575,6 +575,7 @@ handle_bb: bh = bclean(handle, sb, block); if (IS_ERR(bh)) { err = PTR_ERR(bh); + bh = NULL; goto out; } overhead = ext4_group_overhead_blocks(sb, group); @@ -603,6 +604,7 @@ handle_ib: bh = bclean(handle, sb, block); if (IS_ERR(bh)) { err = PTR_ERR(bh); + bh = NULL; goto out; } -- cgit v1.2.3 From 8a70ee3307908c46f952df91be72a18d5f5ad0a3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 4 Sep 2014 11:47:51 +0200 Subject: udf: Avoid dir link count to go negative If we are writing back inode of unlinked directory, its link count ends up being (u16)-1. Although the inode is deleted, udf_iget() can load the inode when NFS uses stale file handle and get confused. Signed-off-by: Jan Kara --- fs/udf/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 236cd48184c2..e86f9b67aa16 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1664,7 +1664,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) FE_PERM_U_DELETE | FE_PERM_U_CHATTR)); fe->permissions = cpu_to_le32(udfperms); - if (S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0) fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1); else fe->fileLinkCount = cpu_to_le16(inode->i_nlink); -- cgit v1.2.3 From bb7720a0b4a8ca3269fd86fbb45a78d2e0d3deaf Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 4 Sep 2014 13:32:50 +0200 Subject: udf: Fold udf_fill_inode() into __udf_read_inode() There's no good reason to separate these since udf_fill_inode() is called only from __udf_read_inode() and both do part of the same thing. Signed-off-by: Jan Kara --- fs/udf/inode.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index e86f9b67aa16..68cc7b144c26 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -51,7 +51,6 @@ MODULE_LICENSE("GPL"); static umode_t udf_convert_permissions(struct fileEntry *); static int udf_update_inode(struct inode *, int); -static void udf_fill_inode(struct inode *, struct buffer_head *); static int udf_sync_inode(struct inode *inode); static int udf_alloc_i_data(struct inode *inode, size_t size); static sector_t inode_getblk(struct inode *, sector_t, int *, int *); @@ -1275,8 +1274,11 @@ static void __udf_read_inode(struct inode *inode) { struct buffer_head *bh = NULL; struct fileEntry *fe; + struct extendedFileEntry *efe; uint16_t ident; struct udf_inode_info *iinfo = UDF_I(inode); + struct udf_sb_info *sbi = UDF_SB(inode->i_sb); + unsigned int link_count; /* * Set defaults, but the inode is still incomplete! @@ -1307,6 +1309,7 @@ static void __udf_read_inode(struct inode *inode) } fe = (struct fileEntry *)bh->b_data; + efe = (struct extendedFileEntry *)bh->b_data; if (fe->icbTag.strategyType == cpu_to_le16(4096)) { struct buffer_head *ibh; @@ -1346,22 +1349,6 @@ static void __udf_read_inode(struct inode *inode) make_bad_inode(inode); return; } - udf_fill_inode(inode, bh); - - brelse(bh); -} - -static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) -{ - struct fileEntry *fe; - struct extendedFileEntry *efe; - struct udf_sb_info *sbi = UDF_SB(inode->i_sb); - struct udf_inode_info *iinfo = UDF_I(inode); - unsigned int link_count; - - fe = (struct fileEntry *)bh->b_data; - efe = (struct extendedFileEntry *)bh->b_data; - if (fe->icbTag.strategyType == cpu_to_le16(4)) iinfo->i_strat4096 = 0; else /* if (fe->icbTag.strategyType == cpu_to_le16(4096)) */ @@ -1551,6 +1538,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) } else make_bad_inode(inode); } + brelse(bh); } static int udf_alloc_i_data(struct inode *inode, size_t size) -- cgit v1.2.3 From c03aa9f6e1f938618e6db2e23afef0574efeeb65 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 4 Sep 2014 14:06:55 +0200 Subject: udf: Avoid infinite loop when processing indirect ICBs We did not implement any bound on number of indirect ICBs we follow when loading inode. Thus corrupted medium could cause kernel to go into an infinite loop, possibly causing a stack overflow. Fix the possible stack overflow by removing recursion from __udf_read_inode() and limit number of indirect ICBs we follow to avoid infinite loops. Signed-off-by: Jan Kara --- fs/udf/inode.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 68cc7b144c26..a6a40536ebf1 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1270,6 +1270,13 @@ update_time: return 0; } +/* + * Maximum length of linked list formed by ICB hierarchy. The chosen number is + * arbitrary - just that we hopefully don't limit any real use of rewritten + * inode on write-once media but avoid looping for too long on corrupted media. + */ +#define UDF_MAX_ICB_NESTING 1024 + static void __udf_read_inode(struct inode *inode) { struct buffer_head *bh = NULL; @@ -1279,7 +1286,9 @@ static void __udf_read_inode(struct inode *inode) struct udf_inode_info *iinfo = UDF_I(inode); struct udf_sb_info *sbi = UDF_SB(inode->i_sb); unsigned int link_count; + unsigned int indirections = 0; +reread: /* * Set defaults, but the inode is still incomplete! * Note: get_new_inode() sets the following on a new inode: @@ -1317,28 +1326,26 @@ static void __udf_read_inode(struct inode *inode) ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1, &ident); if (ident == TAG_IDENT_IE && ibh) { - struct buffer_head *nbh = NULL; struct kernel_lb_addr loc; struct indirectEntry *ie; ie = (struct indirectEntry *)ibh->b_data; loc = lelb_to_cpu(ie->indirectICB.extLocation); - if (ie->indirectICB.extLength && - (nbh = udf_read_ptagged(inode->i_sb, &loc, 0, - &ident))) { - if (ident == TAG_IDENT_FE || - ident == TAG_IDENT_EFE) { - memcpy(&iinfo->i_location, - &loc, - sizeof(struct kernel_lb_addr)); - brelse(bh); - brelse(ibh); - brelse(nbh); - __udf_read_inode(inode); + if (ie->indirectICB.extLength) { + brelse(bh); + brelse(ibh); + memcpy(&iinfo->i_location, &loc, + sizeof(struct kernel_lb_addr)); + if (++indirections > UDF_MAX_ICB_NESTING) { + udf_err(inode->i_sb, + "too many ICBs in ICB hierarchy" + " (max %d supported)\n", + UDF_MAX_ICB_NESTING); + make_bad_inode(inode); return; } - brelse(nbh); + goto reread; } } brelse(ibh); -- cgit v1.2.3 From 6d3d5e860a114ae606b1af2ba7f64cb19fbeb414 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 4 Sep 2014 16:15:51 +0200 Subject: udf: Make udf_read_inode() and udf_iget() return error Currently __udf_read_inode() wasn't returning anything and we found out whether we succeeded reading inode by checking whether inode is bad or not. udf_iget() returned NULL on failure and inode pointer otherwise. Make these two functions properly propagate errors up the call stack and use the return value in callers. Signed-off-by: Jan Kara --- fs/udf/inode.c | 99 ++++++++++++++++++++++++++------------------------------ fs/udf/namei.c | 22 ++++++------- fs/udf/super.c | 69 +++++++++++++++++++++++---------------- fs/udf/udfdecl.h | 1 - 4 files changed, 96 insertions(+), 95 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index a6a40536ebf1..788fc58ea78e 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1277,7 +1277,7 @@ update_time: */ #define UDF_MAX_ICB_NESTING 1024 -static void __udf_read_inode(struct inode *inode) +static int udf_read_inode(struct inode *inode) { struct buffer_head *bh = NULL; struct fileEntry *fe; @@ -1285,10 +1285,19 @@ static void __udf_read_inode(struct inode *inode) uint16_t ident; struct udf_inode_info *iinfo = UDF_I(inode); struct udf_sb_info *sbi = UDF_SB(inode->i_sb); + struct kernel_lb_addr *iloc = &iinfo->i_location; unsigned int link_count; unsigned int indirections = 0; + int ret = -EIO; reread: + if (iloc->logicalBlockNum >= + sbi->s_partmaps[iloc->partitionReferenceNum].s_partition_len) { + udf_debug("block=%d, partition=%d out of range\n", + iloc->logicalBlockNum, iloc->partitionReferenceNum); + return -EIO; + } + /* * Set defaults, but the inode is still incomplete! * Note: get_new_inode() sets the following on a new inode: @@ -1301,20 +1310,17 @@ reread: * i_nlink = 1 * i_op = NULL; */ - bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident); + bh = udf_read_ptagged(inode->i_sb, iloc, 0, &ident); if (!bh) { udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino); - make_bad_inode(inode); - return; + return -EIO; } if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && ident != TAG_IDENT_USE) { udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n", inode->i_ino, ident); - brelse(bh); - make_bad_inode(inode); - return; + goto out; } fe = (struct fileEntry *)bh->b_data; @@ -1323,8 +1329,7 @@ reread: if (fe->icbTag.strategyType == cpu_to_le16(4096)) { struct buffer_head *ibh; - ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1, - &ident); + ibh = udf_read_ptagged(inode->i_sb, iloc, 1, &ident); if (ident == TAG_IDENT_IE && ibh) { struct kernel_lb_addr loc; struct indirectEntry *ie; @@ -1333,7 +1338,6 @@ reread: loc = lelb_to_cpu(ie->indirectICB.extLocation); if (ie->indirectICB.extLength) { - brelse(bh); brelse(ibh); memcpy(&iinfo->i_location, &loc, sizeof(struct kernel_lb_addr)); @@ -1342,9 +1346,9 @@ reread: "too many ICBs in ICB hierarchy" " (max %d supported)\n", UDF_MAX_ICB_NESTING); - make_bad_inode(inode); - return; + goto out; } + brelse(bh); goto reread; } } @@ -1352,9 +1356,7 @@ reread: } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { udf_err(inode->i_sb, "unsupported strategy type: %d\n", le16_to_cpu(fe->icbTag.strategyType)); - brelse(bh); - make_bad_inode(inode); - return; + goto out; } if (fe->icbTag.strategyType == cpu_to_le16(4)) iinfo->i_strat4096 = 0; @@ -1372,11 +1374,10 @@ reread: if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) { iinfo->i_efe = 1; iinfo->i_use = 0; - if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - - sizeof(struct extendedFileEntry))) { - make_bad_inode(inode); - return; - } + ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize - + sizeof(struct extendedFileEntry)); + if (ret) + goto out; memcpy(iinfo->i_ext.i_data, bh->b_data + sizeof(struct extendedFileEntry), inode->i_sb->s_blocksize - @@ -1384,11 +1385,10 @@ reread: } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) { iinfo->i_efe = 0; iinfo->i_use = 0; - if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - - sizeof(struct fileEntry))) { - make_bad_inode(inode); - return; - } + ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize - + sizeof(struct fileEntry)); + if (ret) + goto out; memcpy(iinfo->i_ext.i_data, bh->b_data + sizeof(struct fileEntry), inode->i_sb->s_blocksize - sizeof(struct fileEntry)); @@ -1398,18 +1398,18 @@ reread: iinfo->i_lenAlloc = le32_to_cpu( ((struct unallocSpaceEntry *)bh->b_data)-> lengthAllocDescs); - if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - - sizeof(struct unallocSpaceEntry))) { - make_bad_inode(inode); - return; - } + ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize - + sizeof(struct unallocSpaceEntry)); + if (ret) + goto out; memcpy(iinfo->i_ext.i_data, bh->b_data + sizeof(struct unallocSpaceEntry), inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)); - return; + return 0; } + ret = -EIO; read_lock(&sbi->s_cred_lock); i_uid_write(inode, le32_to_cpu(fe->uid)); if (!uid_valid(inode->i_uid) || @@ -1531,8 +1531,7 @@ reread: default: udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n", inode->i_ino, fe->icbTag.fileType); - make_bad_inode(inode); - return; + goto out; } if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { struct deviceSpec *dsea = @@ -1543,9 +1542,12 @@ reread: le32_to_cpu(dsea->minorDeviceIdent))); /* Developer ID ??? */ } else - make_bad_inode(inode); + goto out; } + ret = 0; +out: brelse(bh); + return ret; } static int udf_alloc_i_data(struct inode *inode, size_t size) @@ -1825,32 +1827,23 @@ struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino) { unsigned long block = udf_get_lb_pblock(sb, ino, 0); struct inode *inode = iget_locked(sb, block); + int err; if (!inode) - return NULL; - - if (inode->i_state & I_NEW) { - memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); - __udf_read_inode(inode); - unlock_new_inode(inode); - } + return ERR_PTR(-ENOMEM); - if (is_bad_inode(inode)) - goto out_iput; + if (!(inode->i_state & I_NEW)) + return inode; - if (ino->logicalBlockNum >= UDF_SB(sb)-> - s_partmaps[ino->partitionReferenceNum].s_partition_len) { - udf_debug("block=%d, partition=%d out of range\n", - ino->logicalBlockNum, ino->partitionReferenceNum); - make_bad_inode(inode); - goto out_iput; + memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); + err = udf_read_inode(inode); + if (err < 0) { + iget_failed(inode); + return ERR_PTR(err); } + unlock_new_inode(inode); return inode; - - out_iput: - iput(inode); - return NULL; } int udf_add_aext(struct inode *inode, struct extent_position *epos, diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 83a06001742b..e041fd9c6816 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -270,9 +270,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, NULL, 0), }; inode = udf_iget(dir->i_sb, lb); - if (!inode) { - return ERR_PTR(-EACCES); - } + if (IS_ERR(inode)) + return inode; } else #endif /* UDF_RECOVERY */ @@ -285,9 +284,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, loc = lelb_to_cpu(cfi.icb.extLocation); inode = udf_iget(dir->i_sb, &loc); - if (!inode) { - return ERR_PTR(-EACCES); - } + if (IS_ERR(inode)) + return ERR_CAST(inode); } return d_splice_alias(inode, dentry); @@ -1222,7 +1220,7 @@ static struct dentry *udf_get_parent(struct dentry *child) struct udf_fileident_bh fibh; if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) - goto out_unlock; + return ERR_PTR(-EACCES); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); @@ -1230,12 +1228,10 @@ static struct dentry *udf_get_parent(struct dentry *child) tloc = lelb_to_cpu(cfi.icb.extLocation); inode = udf_iget(child->d_inode->i_sb, &tloc); - if (!inode) - goto out_unlock; + if (IS_ERR(inode)) + return ERR_CAST(inode); return d_obtain_alias(inode); -out_unlock: - return ERR_PTR(-EACCES); } @@ -1252,8 +1248,8 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block, loc.partitionReferenceNum = partref; inode = udf_iget(sb, &loc); - if (inode == NULL) - return ERR_PTR(-ENOMEM); + if (IS_ERR(inode)) + return ERR_CAST(inode); if (generation && inode->i_generation != generation) { iput(inode); diff --git a/fs/udf/super.c b/fs/udf/super.c index 813da94d447b..5401fc33f5cc 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -961,12 +961,14 @@ struct inode *udf_find_metadata_inode_efe(struct super_block *sb, metadata_fe = udf_iget(sb, &addr); - if (metadata_fe == NULL) + if (IS_ERR(metadata_fe)) { udf_warn(sb, "metadata inode efe not found\n"); - else if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) { + return metadata_fe; + } + if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) { udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n"); iput(metadata_fe); - metadata_fe = NULL; + return ERR_PTR(-EIO); } return metadata_fe; @@ -978,6 +980,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) struct udf_part_map *map; struct udf_meta_data *mdata; struct kernel_lb_addr addr; + struct inode *fe; map = &sbi->s_partmaps[partition]; mdata = &map->s_type_specific.s_metadata; @@ -986,22 +989,24 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) udf_debug("Metadata file location: block = %d part = %d\n", mdata->s_meta_file_loc, map->s_partition_num); - mdata->s_metadata_fe = udf_find_metadata_inode_efe(sb, - mdata->s_meta_file_loc, map->s_partition_num); - - if (mdata->s_metadata_fe == NULL) { + fe = udf_find_metadata_inode_efe(sb, mdata->s_meta_file_loc, + map->s_partition_num); + if (IS_ERR(fe)) { /* mirror file entry */ udf_debug("Mirror metadata file location: block = %d part = %d\n", mdata->s_mirror_file_loc, map->s_partition_num); - mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb, - mdata->s_mirror_file_loc, map->s_partition_num); + fe = udf_find_metadata_inode_efe(sb, mdata->s_mirror_file_loc, + map->s_partition_num); - if (mdata->s_mirror_fe == NULL) { + if (IS_ERR(fe)) { udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n"); - return -EIO; + return PTR_ERR(fe); } - } + mdata->s_mirror_fe = fe; + } else + mdata->s_metadata_fe = fe; + /* * bitmap file entry @@ -1015,15 +1020,16 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) udf_debug("Bitmap file location: block = %d part = %d\n", addr.logicalBlockNum, addr.partitionReferenceNum); - mdata->s_bitmap_fe = udf_iget(sb, &addr); - if (mdata->s_bitmap_fe == NULL) { + fe = udf_iget(sb, &addr); + if (IS_ERR(fe)) { if (sb->s_flags & MS_RDONLY) udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n"); else { udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n"); - return -EIO; + return PTR_ERR(fe); } - } + } else + mdata->s_bitmap_fe = fe; } udf_debug("udf_load_metadata_files Ok\n"); @@ -1111,13 +1117,15 @@ static int udf_fill_partdesc_info(struct super_block *sb, phd->unallocSpaceTable.extPosition), .partitionReferenceNum = p_index, }; + struct inode *inode; - map->s_uspace.s_table = udf_iget(sb, &loc); - if (!map->s_uspace.s_table) { + inode = udf_iget(sb, &loc); + if (IS_ERR(inode)) { udf_debug("cannot load unallocSpaceTable (part %d)\n", p_index); - return -EIO; + return PTR_ERR(inode); } + map->s_uspace.s_table = inode; map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; udf_debug("unallocSpaceTable (part %d) @ %ld\n", p_index, map->s_uspace.s_table->i_ino); @@ -1144,14 +1152,15 @@ static int udf_fill_partdesc_info(struct super_block *sb, phd->freedSpaceTable.extPosition), .partitionReferenceNum = p_index, }; + struct inode *inode; - map->s_fspace.s_table = udf_iget(sb, &loc); - if (!map->s_fspace.s_table) { + inode = udf_iget(sb, &loc); + if (IS_ERR(inode)) { udf_debug("cannot load freedSpaceTable (part %d)\n", p_index); - return -EIO; + return PTR_ERR(inode); } - + map->s_fspace.s_table = inode; map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; udf_debug("freedSpaceTable (part %d) @ %ld\n", p_index, map->s_fspace.s_table->i_ino); @@ -1178,6 +1187,7 @@ static void udf_find_vat_block(struct super_block *sb, int p_index, struct udf_part_map *map = &sbi->s_partmaps[p_index]; sector_t vat_block; struct kernel_lb_addr ino; + struct inode *inode; /* * VAT file entry is in the last recorded block. Some broken disks have @@ -1186,10 +1196,13 @@ static void udf_find_vat_block(struct super_block *sb, int p_index, ino.partitionReferenceNum = type1_index; for (vat_block = start_block; vat_block >= map->s_partition_root && - vat_block >= start_block - 3 && - !sbi->s_vat_inode; vat_block--) { + vat_block >= start_block - 3; vat_block--) { ino.logicalBlockNum = vat_block - map->s_partition_root; - sbi->s_vat_inode = udf_iget(sb, &ino); + inode = udf_iget(sb, &ino); + if (!IS_ERR(inode)) { + sbi->s_vat_inode = inode; + break; + } } } @@ -2205,10 +2218,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) /* assign inodes by physical block number */ /* perhaps it's not extensible enough, but for now ... */ inode = udf_iget(sb, &rootdir); - if (!inode) { + if (IS_ERR(inode)) { udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n", rootdir.logicalBlockNum, rootdir.partitionReferenceNum); - ret = -EIO; + ret = PTR_ERR(inode); goto error_out; } diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index be7dabbbcb49..41a8115c9345 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -143,7 +143,6 @@ extern int udf_expand_file_adinicb(struct inode *); extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); extern struct buffer_head *udf_bread(struct inode *, int, int, int *); extern int udf_setsize(struct inode *, loff_t); -extern void udf_read_inode(struct inode *); extern void udf_evict_inode(struct inode *); extern int udf_write_inode(struct inode *, struct writeback_control *wbc); extern long udf_block_map(struct inode *, sector_t); -- cgit v1.2.3 From 4071b913622316970d0e1919f7d82b4403fec5f2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 4 Sep 2014 16:19:47 +0200 Subject: udf: Properly detect stale inodes NFS can easily ask for inodes that are already deleted. Currently UDF happily returns such inodes which is a bug. Return -ESTALE if udf_read_inode() is asked to read deleted inode. Signed-off-by: Jan Kara --- fs/udf/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 788fc58ea78e..3a44d9187aad 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1435,8 +1435,10 @@ reread: read_unlock(&sbi->s_cred_lock); link_count = le16_to_cpu(fe->fileLinkCount); - if (!link_count) - link_count = 1; + if (!link_count) { + ret = -ESTALE; + goto out; + } set_nlink(inode, link_count); inode->i_size = le64_to_cpu(fe->informationLength); -- cgit v1.2.3 From 470cca56c366428d4d5785a0a5a291619c332c7f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 4 Sep 2014 16:26:19 +0200 Subject: udf: Set i_generation field Currently UDF doesn't initialize i_generation in any way and thus NFS can easily get reallocated inodes from stale file handles. Luckily UDF already has a unique object identifier associated with each inode - i_unique. Use that for initialization of i_generation. Signed-off-by: Jan Kara --- fs/udf/ialloc.c | 1 + fs/udf/inode.c | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 6eaf5edf1ea1..647370d70175 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -95,6 +95,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) lvidiu = udf_sb_lvidiu(sb); if (lvidiu) { iinfo->i_unique = lvid_get_unique_id(sb); + inode->i_generation = iinfo->i_unique; mutex_lock(&sbi->s_alloc_mutex); if (S_ISDIR(mode)) le32_add_cpu(&lvidiu->numDirs, 1); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 3a44d9187aad..08598843288f 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1484,6 +1484,7 @@ reread: iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs); iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint); } + inode->i_generation = iinfo->i_unique; switch (fe->icbTag.fileType) { case ICBTAG_FILE_TYPE_DIRECTORY: -- cgit v1.2.3 From d2be51cb34dc501791f3b8c01a99a3f2064bd8d1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 4 Sep 2014 09:34:14 -0400 Subject: udf: merge the pieces inserting a new non-directory object into directory boilerplate code in udf_{create,mknod,symlink} taken to new helper symlink case converted to unique id calculated by udf_new_inode() - no point finding a new one. Signed-off-by: Al Viro Signed-off-by: Jan Kara --- fs/udf/namei.c | 98 +++++++++++++++++----------------------------------------- 1 file changed, 29 insertions(+), 69 deletions(-) (limited to 'fs') diff --git a/fs/udf/namei.c b/fs/udf/namei.c index e041fd9c6816..abec86466735 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -548,31 +548,16 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi, return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL); } -static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int udf_add_nondir(struct dentry *dentry, struct inode *inode) { + struct udf_inode_info *iinfo = UDF_I(inode); + struct inode *dir = dentry->d_parent->d_inode; struct udf_fileident_bh fibh; - struct inode *inode; struct fileIdentDesc cfi, *fi; int err; - struct udf_inode_info *iinfo; - - inode = udf_new_inode(dir, mode, &err); - if (!inode) { - return err; - } - - iinfo = UDF_I(inode); - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - inode->i_data.a_ops = &udf_adinicb_aops; - else - inode->i_data.a_ops = &udf_aops; - inode->i_op = &udf_file_inode_operations; - inode->i_fop = &udf_file_operations; - mark_inode_dirty(inode); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); - if (!fi) { + if (unlikely(!fi)) { inode_dec_link_count(inode); iput(inode); return err; @@ -592,6 +577,28 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, return 0; } +static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + struct inode *inode; + int err; + + inode = udf_new_inode(dir, mode, &err); + if (!inode) { + return err; + } + + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + inode->i_data.a_ops = &udf_adinicb_aops; + else + inode->i_data.a_ops = &udf_aops; + inode->i_op = &udf_file_inode_operations; + inode->i_fop = &udf_file_operations; + mark_inode_dirty(inode); + + return udf_add_nondir(dentry, inode); +} + static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -619,10 +626,7 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; - struct udf_fileident_bh fibh; - struct fileIdentDesc cfi, *fi; int err; - struct udf_inode_info *iinfo; if (!old_valid_dev(rdev)) return -EINVAL; @@ -630,33 +634,10 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, err = -EIO; inode = udf_new_inode(dir, mode, &err); if (!inode) - goto out; - - iinfo = UDF_I(inode); - init_special_inode(inode, mode, rdev); - fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); - if (!fi) { - inode_dec_link_count(inode); - iput(inode); return err; - } - cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); - cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); - *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = - cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); - udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); - if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - mark_inode_dirty(dir); - mark_inode_dirty(inode); - - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - d_instantiate(dentry, inode); - err = 0; -out: - return err; + init_special_inode(inode, mode, rdev); + return udf_add_nondir(dentry, inode); } static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) @@ -877,11 +858,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, struct inode *inode; struct pathComponent *pc; const char *compstart; - struct udf_fileident_bh fibh; struct extent_position epos = {}; int eoffset, elen = 0; - struct fileIdentDesc *fi; - struct fileIdentDesc cfi; uint8_t *ea; int err; int block; @@ -1010,31 +988,13 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, mark_inode_dirty(inode); up_write(&iinfo->i_data_sem); - fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); - if (!fi) - goto out_fail; - cfi.icb.extLength = cpu_to_le32(sb->s_blocksize); - cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); - if (UDF_SB(inode->i_sb)->s_lvid_bh) { - *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = - cpu_to_le32(lvid_get_unique_id(sb)); - } - udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); - if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - mark_inode_dirty(dir); - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - d_instantiate(dentry, inode); - err = 0; - + err = udf_add_nondir(dentry, inode); out: kfree(name); return err; out_no_entry: up_write(&iinfo->i_data_sem); -out_fail: inode_dec_link_count(inode); iput(inode); goto out; -- cgit v1.2.3 From b231509616feb911c2a7a8814d58c0014ef5b17f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 4 Sep 2014 09:38:11 -0400 Subject: udf: fix the udf_iget() vs. udf_new_inode() races Currently udf_iget() (triggered by NFS) can race with udf_new_inode() leading to two inode structures with the same inode number: nfsd: iget_locked() creates inode nfsd: try to read from disk, block on that. udf_new_inode(): allocate inode with that inumber udf_new_inode(): insert it into icache, set it up and dirty udf_write_inode(): write inode into buffer cache nfsd: get CPU again, look into buffer cache, see nice and sane on-disk inode, set the in-core inode from it Fix the problem by putting inode into icache in locked state (I_NEW set) and unlocking it only after it's fully set up. Signed-off-by: Al Viro Signed-off-by: Jan Kara --- fs/udf/ialloc.c | 7 ++++++- fs/udf/namei.c | 7 +++++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 647370d70175..598f33bdcd26 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -124,7 +124,12 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; inode->i_mtime = inode->i_atime = inode->i_ctime = iinfo->i_crtime = current_fs_time(inode->i_sb); - insert_inode_hash(inode); + if (unlikely(insert_inode_locked(inode) < 0)) { + make_bad_inode(inode); + iput(inode); + *err = -EIO; + return NULL; + } mark_inode_dirty(inode); *err = 0; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index abec86466735..d106fdd1bef7 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -559,6 +559,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode) fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (unlikely(!fi)) { inode_dec_link_count(inode); + unlock_new_inode(inode); iput(inode); return err; } @@ -572,6 +573,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode) if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); + unlock_new_inode(inode); d_instantiate(dentry, inode); return 0; @@ -619,6 +621,7 @@ static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) mark_inode_dirty(inode); d_tmpfile(dentry, inode); + unlock_new_inode(inode); return 0; } @@ -660,6 +663,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err); if (!fi) { inode_dec_link_count(inode); + unlock_new_inode(inode); iput(inode); goto out; } @@ -678,6 +682,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (!fi) { clear_nlink(inode); mark_inode_dirty(inode); + unlock_new_inode(inode); iput(inode); goto out; } @@ -689,6 +694,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); inc_nlink(dir); mark_inode_dirty(dir); + unlock_new_inode(inode); d_instantiate(dentry, inode); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); @@ -996,6 +1002,7 @@ out: out_no_entry: up_write(&iinfo->i_data_sem); inode_dec_link_count(inode); + unlock_new_inode(inode); iput(inode); goto out; } -- cgit v1.2.3 From 0b93a92be4cb48f22b78c95dea84089a1e72f860 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 4 Sep 2014 09:47:41 -0400 Subject: udf: saner calling conventions for udf_new_inode() Signed-off-by: Al Viro Signed-off-by: Jan Kara --- fs/udf/ialloc.c | 24 ++++++++++-------------- fs/udf/namei.c | 44 ++++++++++++++++---------------------------- fs/udf/udfdecl.h | 2 +- 3 files changed, 27 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 598f33bdcd26..e77db621ec89 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -45,7 +45,7 @@ void udf_free_inode(struct inode *inode) udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1); } -struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) +struct inode *udf_new_inode(struct inode *dir, umode_t mode) { struct super_block *sb = dir->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); @@ -55,14 +55,12 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) struct udf_inode_info *iinfo; struct udf_inode_info *dinfo = UDF_I(dir); struct logicalVolIntegrityDescImpUse *lvidiu; + int err; inode = new_inode(sb); - if (!inode) { - *err = -ENOMEM; - return NULL; - } - *err = -ENOSPC; + if (!inode) + return ERR_PTR(-ENOMEM); iinfo = UDF_I(inode); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) { @@ -80,16 +78,16 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) } if (!iinfo->i_ext.i_data) { iput(inode); - *err = -ENOMEM; - return NULL; + return ERR_PTR(-ENOMEM); } + err = -ENOSPC; block = udf_new_block(dir->i_sb, NULL, dinfo->i_location.partitionReferenceNum, - start, err); - if (*err) { + start, &err); + if (err) { iput(inode); - return NULL; + return ERR_PTR(err); } lvidiu = udf_sb_lvidiu(sb); @@ -127,11 +125,9 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) if (unlikely(insert_inode_locked(inode) < 0)) { make_bad_inode(inode); iput(inode); - *err = -EIO; - return NULL; + return ERR_PTR(-EIO); } mark_inode_dirty(inode); - *err = 0; return inode; } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index d106fdd1bef7..c12e260fd6c4 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -582,13 +582,10 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode) static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - struct inode *inode; - int err; + struct inode *inode = udf_new_inode(dir, mode); - inode = udf_new_inode(dir, mode, &err); - if (!inode) { - return err; - } + if (IS_ERR(inode)) + return PTR_ERR(inode); if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) inode->i_data.a_ops = &udf_adinicb_aops; @@ -603,23 +600,18 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { - struct inode *inode; - struct udf_inode_info *iinfo; - int err; + struct inode *inode = udf_new_inode(dir, mode); - inode = udf_new_inode(dir, mode, &err); - if (!inode) - return err; + if (IS_ERR(inode)) + return PTR_ERR(inode); - iinfo = UDF_I(inode); - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) inode->i_data.a_ops = &udf_adinicb_aops; else inode->i_data.a_ops = &udf_aops; inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; mark_inode_dirty(inode); - d_tmpfile(dentry, inode); unlock_new_inode(inode); return 0; @@ -629,15 +621,13 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; - int err; if (!old_valid_dev(rdev)) return -EINVAL; - err = -EIO; - inode = udf_new_inode(dir, mode, &err); - if (!inode) - return err; + inode = udf_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); init_special_inode(inode, mode, rdev); return udf_add_nondir(dentry, inode); @@ -652,10 +642,9 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct udf_inode_info *dinfo = UDF_I(dir); struct udf_inode_info *iinfo; - err = -EIO; - inode = udf_new_inode(dir, S_IFDIR | mode, &err); - if (!inode) - goto out; + inode = udf_new_inode(dir, S_IFDIR | mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); iinfo = UDF_I(inode); inode->i_op = &udf_dir_inode_operations; @@ -861,7 +850,7 @@ out: static int udf_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct inode *inode; + struct inode *inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO); struct pathComponent *pc; const char *compstart; struct extent_position epos = {}; @@ -874,9 +863,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, struct udf_inode_info *iinfo; struct super_block *sb = dir->i_sb; - inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); - if (!inode) - goto out; + if (IS_ERR(inode)) + return PTR_ERR(inode); iinfo = UDF_I(inode); down_write(&iinfo->i_data_sem); diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 41a8115c9345..742557be9936 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -208,7 +208,7 @@ extern int udf_CS0toUTF8(struct ustr *, const struct ustr *); /* ialloc.c */ extern void udf_free_inode(struct inode *); -extern struct inode *udf_new_inode(struct inode *, umode_t, int *); +extern struct inode *udf_new_inode(struct inode *, umode_t); /* truncate.c */ extern void udf_truncate_tail_extent(struct inode *); -- cgit v1.2.3 From 6098b45b32e6baeacc04790773ced9340601d511 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 3 Sep 2014 17:45:44 +0800 Subject: aio: block exit_aio() until all context requests are completed It seems that exit_aio() also needs to wait for all iocbs to complete (like io_destroy), but we missed the wait step in current implemention, so fix it in the same way as we did in io_destroy. Signed-off-by: Gu Zheng Signed-off-by: Benjamin LaHaise Cc: stable@vger.kernel.org --- fs/aio.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 5f2e9c6c328e..733750096b71 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -793,6 +793,8 @@ void exit_aio(struct mm_struct *mm) for (i = 0; i < table->nr; ++i) { struct kioctx *ctx = table->table[i]; + struct completion requests_done = + COMPLETION_INITIALIZER_ONSTACK(requests_done); if (!ctx) continue; @@ -804,7 +806,10 @@ void exit_aio(struct mm_struct *mm) * that it needs to unmap the area, just set it to 0. */ ctx->mmap_size = 0; - kill_ioctx(mm, ctx, NULL); + kill_ioctx(mm, ctx, &requests_done); + + /* Wait until all IO for the context are done. */ + wait_for_completion(&requests_done); } RCU_INIT_POINTER(mm->ioctx_table, NULL); -- cgit v1.2.3 From 10096fb1088e5c89b10772a1dfbe9682ecae5cea Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Thu, 21 Aug 2014 11:09:27 +0100 Subject: Export sync_filesystem() for modular ->remount_fs() use This patch changes sync_filesystem() to be EXPORT_SYMBOL(). The reason this is needed is that starting with 3.15 kernel, due to Theodore Ts'o's commit 02b9984d6408 ("fs: push sync_filesystem() down to the file system's remount_fs()"), all file systems that have dirty data to be written out need to call sync_filesystem() from their ->remount_fs() method when remounting read-only. As this is now a generically required function rather than an internal only function it should be EXPORT_SYMBOL() so that all file systems can call it. Signed-off-by: Anton Altaparmakov Acked-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sync.c b/fs/sync.c index b28d1dd10e8b..bdc729d80e5e 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -65,7 +65,7 @@ int sync_filesystem(struct super_block *sb) return ret; return __sync_filesystem(sb, 1); } -EXPORT_SYMBOL_GPL(sync_filesystem); +EXPORT_SYMBOL(sync_filesystem); static void sync_inodes_one_sb(struct super_block *sb, void *arg) { -- cgit v1.2.3 From 9ef7db7f38d0472dd9c444e42d5c5175ccbe5451 Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Tue, 2 Sep 2014 11:40:17 +0400 Subject: ufs: fix deadlocks introduced by sb mutex merge Commit 0244756edc4b ("ufs: sb mutex merge + mutex_destroy") introduces deadlocks in ufs_new_inode() and ufs_free_inode(). Most callers of that functions acqure the mutex by themselves and ufs_{new,free}_inode() do that via lock_ufs(), i.e we have an unavoidable double lock. The patch proposes to resolve the issue by making sure that ufs_{new,free}_inode() are not called with the mutex held. Found by Linux Driver Verification project (linuxtesting.org). Cc: stable@vger.kernel.org # 3.16 Signed-off-by: Alexey Khoroshilov Signed-off-by: Al Viro --- fs/ufs/inode.c | 7 ++----- fs/ufs/namei.c | 14 ++++++-------- 2 files changed, 8 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 7c580c97990e..be7d42c7d938 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -902,9 +902,6 @@ void ufs_evict_inode(struct inode * inode) invalidate_inode_buffers(inode); clear_inode(inode); - if (want_delete) { - lock_ufs(inode->i_sb); - ufs_free_inode (inode); - unlock_ufs(inode->i_sb); - } + if (want_delete) + ufs_free_inode(inode); } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 90d74b8f8eba..2df62a73f20c 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -126,12 +126,12 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, if (l > sb->s_blocksize) goto out_notlocked; - lock_ufs(dir->i_sb); inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); if (IS_ERR(inode)) - goto out; + goto out_notlocked; + lock_ufs(dir->i_sb); if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { /* slow symlink */ inode->i_op = &ufs_symlink_inode_operations; @@ -181,13 +181,9 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) struct inode * inode; int err; - lock_ufs(dir->i_sb); - inode_inc_link_count(dir); - inode = ufs_new_inode(dir, S_IFDIR|mode); - err = PTR_ERR(inode); if (IS_ERR(inode)) - goto out_dir; + return PTR_ERR(inode); inode->i_op = &ufs_dir_inode_operations; inode->i_fop = &ufs_dir_operations; @@ -195,6 +191,9 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) inode_inc_link_count(inode); + lock_ufs(dir->i_sb); + inode_inc_link_count(dir); + err = ufs_make_empty(inode, dir); if (err) goto out_fail; @@ -212,7 +211,6 @@ out_fail: inode_dec_link_count(inode); inode_dec_link_count(inode); iput (inode); -out_dir: inode_dec_link_count(dir); unlock_ufs(dir->i_sb); goto out; -- cgit v1.2.3 From aee3776441461c14ba6d8ed9e2149933e65abb6e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 20 Aug 2014 14:49:50 -0400 Subject: nfsd4: fix rd_dircount enforcement Commit 3b299709091b "nfsd4: enforce rd_dircount" totally misunderstood rd_dircount; it refers to total non-attribute bytes returned, not number of directory entries returned. Bring the code into agreement with RFC 3530 section 14.2.24. Cc: stable@vger.kernel.org Fixes: 3b299709091b "nfsd4: enforce rd_dircount" Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f9821ce6658a..e94457c33ad6 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2657,6 +2657,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, struct xdr_stream *xdr = cd->xdr; int start_offset = xdr->buf->len; int cookie_offset; + u32 name_and_cookie; int entry_bytes; __be32 nfserr = nfserr_toosmall; __be64 wire_offset; @@ -2718,7 +2719,14 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, cd->rd_maxcount -= entry_bytes; if (!cd->rd_dircount) goto fail; - cd->rd_dircount--; + /* + * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so + * let's always let through the first entry, at least: + */ + name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8; + if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) + goto fail; + cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); cd->cookie_offset = cookie_offset; skip_entry: cd->common.err = nfs_ok; @@ -3321,6 +3329,10 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 } maxcount = min_t(int, maxcount-16, bytes_left); + /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */ + if (!readdir->rd_dircount) + readdir->rd_dircount = INT_MAX; + readdir->xdr = xdr; readdir->rd_maxcount = maxcount; readdir->common.err = 0; -- cgit v1.2.3 From 7c17705e77b12b20fb8afb7c1b15dcdb126c0c12 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 29 Aug 2014 16:25:50 -0400 Subject: lockd: fix rpcbind crash on lockd startup failure Nikita Yuschenko reported that booting a kernel with init=/bin/sh and then nfs mounting without portmap or rpcbind running using a busybox mount resulted in: # mount -t nfs 10.30.130.21:/opt /mnt svc: failed to register lockdv1 RPC service (errno 111). lockd_up: makesock failed, error=-111 Unable to handle kernel paging request for data at address 0x00000030 Faulting instruction address: 0xc055e65c Oops: Kernel access of bad area, sig: 11 [#1] MPC85xx CDS Modules linked in: CPU: 0 PID: 1338 Comm: mount Not tainted 3.10.44.cge #117 task: cf29cea0 ti: cf35c000 task.ti: cf35c000 NIP: c055e65c LR: c0566490 CTR: c055e648 REGS: cf35dad0 TRAP: 0300 Not tainted (3.10.44.cge) MSR: 00029000 CR: 22442488 XER: 20000000 DEAR: 00000030, ESR: 00000000 GPR00: c05606f4 cf35db80 cf29cea0 cf0ded80 cf0dedb8 00000001 1dec3086 00000000 GPR08: 00000000 c07b1640 00000007 1dec3086 22442482 100b9758 00000000 10090ae8 GPR16: 00000000 000186a5 00000000 00000000 100c3018 bfa46edc 100b0000 bfa46ef0 GPR24: cf386ae0 c07834f0 00000000 c0565f88 00000001 cf0dedb8 00000000 cf0ded80 NIP [c055e65c] call_start+0x14/0x34 LR [c0566490] __rpc_execute+0x70/0x250 Call Trace: [cf35db80] [00000080] 0x80 (unreliable) [cf35dbb0] [c05606f4] rpc_run_task+0x9c/0xc4 [cf35dbc0] [c0560840] rpc_call_sync+0x50/0xb8 [cf35dbf0] [c056ee90] rpcb_register_call+0x54/0x84 [cf35dc10] [c056f24c] rpcb_register+0xf8/0x10c [cf35dc70] [c0569e18] svc_unregister.isra.23+0x100/0x108 [cf35dc90] [c0569e38] svc_rpcb_cleanup+0x18/0x30 [cf35dca0] [c0198c5c] lockd_up+0x1dc/0x2e0 [cf35dcd0] [c0195348] nlmclnt_init+0x2c/0xc8 [cf35dcf0] [c015bb5c] nfs_start_lockd+0x98/0xec [cf35dd20] [c015ce6c] nfs_create_server+0x1e8/0x3f4 [cf35dd90] [c0171590] nfs3_create_server+0x10/0x44 [cf35dda0] [c016528c] nfs_try_mount+0x158/0x1e4 [cf35de20] [c01670d0] nfs_fs_mount+0x434/0x8c8 [cf35de70] [c00cd3bc] mount_fs+0x20/0xbc [cf35de90] [c00e4f88] vfs_kern_mount+0x50/0x104 [cf35dec0] [c00e6e0c] do_mount+0x1d0/0x8e0 [cf35df10] [c00e75ac] SyS_mount+0x90/0xd0 [cf35df40] [c000ccf4] ret_from_syscall+0x0/0x3c The addition of svc_shutdown_net() resulted in two calls to svc_rpcb_cleanup(); the second is no longer necessary and crashes when it calls rpcb_register_call with clnt=NULL. Reported-by: Nikita Yushchenko Fixes: 679b033df484 "lockd: ensure we tear down any live sockets when socket creation fails during lockd_up" Cc: stable@vger.kernel.org Acked-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/lockd/svc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 8f27c93f8d2e..ec9e082f9ecd 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -253,13 +253,11 @@ static int lockd_up_net(struct svc_serv *serv, struct net *net) error = make_socks(serv, net); if (error < 0) - goto err_socks; + goto err_bind; set_grace_period(net); dprintk("lockd_up_net: per-net data created; net=%p\n", net); return 0; -err_socks: - svc_rpcb_cleanup(serv, net); err_bind: ln->nlmsvc_users--; return error; -- cgit v1.2.3 From c47ca32d3aadb234f73389a34c97574085bc9eda Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 4 Sep 2014 14:09:15 +0300 Subject: Btrfs: kfree()ing ERR_PTRs The "inherit" in btrfs_ioctl_snap_create_v2() and "vol_args" in btrfs_ioctl_rm_dev() are ERR_PTRs so we can't call kfree() on them. These kind of bugs are "One Err Bugs" where there is just one error label that does everything. I could set the "inherit = NULL" and keep the single out label but it ends up being more complicated that way. It makes the code simpler to re-order the unwind so it's in the mirror order of the allocation and introduce some new error labels. Signed-off-by: Dan Carpenter Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a018ea484d39..8a8e29878c34 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1703,7 +1703,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY | BTRFS_SUBVOL_QGROUP_INHERIT)) { ret = -EOPNOTSUPP; - goto out; + goto free_args; } if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) @@ -1713,27 +1713,31 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { if (vol_args->size > PAGE_CACHE_SIZE) { ret = -EINVAL; - goto out; + goto free_args; } inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size); if (IS_ERR(inherit)) { ret = PTR_ERR(inherit); - goto out; + goto free_args; } } ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, vol_args->fd, subvol, ptr, readonly, inherit); + if (ret) + goto free_inherit; - if (ret == 0 && ptr && - copy_to_user(arg + - offsetof(struct btrfs_ioctl_vol_args_v2, - transid), ptr, sizeof(*ptr))) + if (ptr && copy_to_user(arg + + offsetof(struct btrfs_ioctl_vol_args_v2, + transid), + ptr, sizeof(*ptr))) ret = -EFAULT; -out: - kfree(vol_args); + +free_inherit: kfree(inherit); +free_args: + kfree(vol_args); return ret; } @@ -2653,7 +2657,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); - goto out; + goto err_drop; } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; @@ -2671,6 +2675,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) out: kfree(vol_args); +err_drop: mnt_drop_write_file(file); return ret; } -- cgit v1.2.3 From 49dae1bc1c665817e434d01eefaa11967f618243 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 6 Sep 2014 22:34:39 +0100 Subject: Btrfs: fix fsync data loss after a ranged fsync While we're doing a full fsync (when the inode has the flag BTRFS_INODE_NEEDS_FULL_SYNC set) that is ranged too (covers only a portion of the file), we might have ordered operations that are started before or while we're logging the inode and that fall outside the fsync range. Therefore when a full ranged fsync finishes don't remove every extent map from the list of modified extent maps - as for some of them, that fall outside our fsync range, their respective ordered operation hasn't finished yet, meaning the corresponding file extent item wasn't inserted into the fs/subvol tree yet and therefore we didn't log it, and we must let the next fast fsync (one that checks only the modified list) see this extent map and log a matching file extent item to the log btree and wait for its ordered operation to finish (if it's still ongoing). A test case for xfstests follows. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/file.c | 2 +- fs/btrfs/tree-log.c | 77 ++++++++++++++++++++++++++++++++++++++++++----------- fs/btrfs/tree-log.h | 2 ++ 3 files changed, 64 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 36861b7a6757..ff1cc0399b9a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1966,7 +1966,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) btrfs_init_log_ctx(&ctx); - ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx); + ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ ret = 1; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7e0e6e3029dd..d296efe2d3e7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -94,8 +94,10 @@ #define LOG_WALK_REPLAY_ALL 3 static int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - int inode_only); + struct btrfs_root *root, struct inode *inode, + int inode_only, + const loff_t start, + const loff_t end); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -3858,8 +3860,10 @@ process: * This handles both files and directories. */ static int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - int inode_only) + struct btrfs_root *root, struct inode *inode, + int inode_only, + const loff_t start, + const loff_t end) { struct btrfs_path *path; struct btrfs_path *dst_path; @@ -3876,6 +3880,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, int ins_nr; bool fast_search = false; u64 ino = btrfs_ino(inode); + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; path = btrfs_alloc_path(); if (!path) @@ -4049,13 +4054,35 @@ log_extents: goto out_unlock; } } else if (inode_only == LOG_INODE_ALL) { - struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em, *n; - write_lock(&tree->lock); - list_for_each_entry_safe(em, n, &tree->modified_extents, list) - list_del_init(&em->list); - write_unlock(&tree->lock); + write_lock(&em_tree->lock); + /* + * We can't just remove every em if we're called for a ranged + * fsync - that is, one that doesn't cover the whole possible + * file range (0 to LLONG_MAX). This is because we can have + * em's that fall outside the range we're logging and therefore + * their ordered operations haven't completed yet + * (btrfs_finish_ordered_io() not invoked yet). This means we + * didn't get their respective file extent item in the fs/subvol + * tree yet, and need to let the next fast fsync (one which + * consults the list of modified extent maps) find the em so + * that it logs a matching file extent item and waits for the + * respective ordered operation to complete (if it's still + * running). + * + * Removing every em outside the range we're logging would make + * the next fast fsync not log their matching file extent items, + * therefore making us lose data after a log replay. + */ + list_for_each_entry_safe(em, n, &em_tree->modified_extents, + list) { + const u64 mod_end = em->mod_start + em->mod_len - 1; + + if (em->mod_start >= start && mod_end <= end) + list_del_init(&em->list); + } + write_unlock(&em_tree->lock); } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { @@ -4065,8 +4092,19 @@ log_extents: goto out_unlock; } } - BTRFS_I(inode)->logged_trans = trans->transid; - BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; + + write_lock(&em_tree->lock); + /* + * If we're doing a ranged fsync and there are still modified extents + * in the list, we must run on the next fsync call as it might cover + * those extents (a full fsync or an fsync for other range). + */ + if (list_empty(&em_tree->modified_extents)) { + BTRFS_I(inode)->logged_trans = trans->transid; + BTRFS_I(inode)->last_log_commit = + BTRFS_I(inode)->last_sub_trans; + } + write_unlock(&em_tree->lock); out_unlock: if (unlikely(err)) btrfs_put_logged_extents(&logged_list); @@ -4161,7 +4199,10 @@ out: */ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only, + struct dentry *parent, + const loff_t start, + const loff_t end, + int exists_only, struct btrfs_log_ctx *ctx) { int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; @@ -4207,7 +4248,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - ret = btrfs_log_inode(trans, root, inode, inode_only); + ret = btrfs_log_inode(trans, root, inode, inode_only, start, end); if (ret) goto end_trans; @@ -4235,7 +4276,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { - ret = btrfs_log_inode(trans, root, inode, inode_only); + ret = btrfs_log_inode(trans, root, inode, inode_only, + 0, LLONG_MAX); if (ret) goto end_trans; } @@ -4269,13 +4311,15 @@ end_no_trans: */ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry, + const loff_t start, + const loff_t end, struct btrfs_log_ctx *ctx) { struct dentry *parent = dget_parent(dentry); int ret; ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, - 0, ctx); + start, end, 0, ctx); dput(parent); return ret; @@ -4512,6 +4556,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, root->fs_info->last_trans_committed)) return 0; - return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL); + return btrfs_log_inode_parent(trans, root, inode, parent, 0, + LLONG_MAX, 1, NULL); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 7f5b41bd5373..e2e798ae7cd7 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -59,6 +59,8 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry, + const loff_t start, + const loff_t end, struct btrfs_log_ctx *ctx); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, -- cgit v1.2.3 From b0d5d10f41a0f1cd839408dd94427f2db3553bca Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 8 Sep 2014 13:08:51 -0700 Subject: Btrfs: use insert_inode_locked4 for inode creation Btrfs was inserting inodes into the hash table before we had fully set the inode up on disk. This leaves us open to rare races that allow two different inodes in memory for the same [root, inode] pair. This patch fixes things by using insert_inode_locked4 to insert an I_NEW inode and unlock_new_inode when we're ready for the rest of the kernel to use the inode. It also makes sure to init the operations pointers on the inode before going into the error handling paths. Signed-off-by: Chris Mason Reported-by: Al Viro --- fs/btrfs/inode.c | 176 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 109 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 88823f4ca451..214b936bdd3d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5634,6 +5634,17 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index) return ret; } +static int btrfs_insert_inode_locked(struct inode *inode) +{ + struct btrfs_iget_args args; + args.location = &BTRFS_I(inode)->location; + args.root = BTRFS_I(inode)->root; + + return insert_inode_locked4(inode, + btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root), + btrfs_find_actor, &args); +} + static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, @@ -5726,10 +5737,19 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, sizes[1] = name_len + sizeof(*ref); } + location = &BTRFS_I(inode)->location; + location->objectid = objectid; + location->offset = 0; + btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + + ret = btrfs_insert_inode_locked(inode); + if (ret < 0) + goto fail; + path->leave_spinning = 1; ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); if (ret != 0) - goto fail; + goto fail_unlock; inode_init_owner(inode, dir, mode); inode_set_bytes(inode, 0); @@ -5752,11 +5772,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - location = &BTRFS_I(inode)->location; - location->objectid = objectid; - location->offset = 0; - btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); - btrfs_inherit_iflags(inode, dir); if (S_ISREG(mode)) { @@ -5767,7 +5782,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, BTRFS_INODE_NODATASUM; } - btrfs_insert_inode_hash(inode); inode_tree_add(inode); trace_btrfs_inode_new(inode); @@ -5782,6 +5796,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_ino(inode), root->root_key.objectid, ret); return inode; + +fail_unlock: + unlock_new_inode(inode); fail: if (dir && name) BTRFS_I(dir)->index_cnt--; @@ -5916,28 +5933,28 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock; } - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) { - drop_inode = 1; - goto out_unlock; - } - /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see * if the filesystem supports xattrs by looking at the * ops vector. */ - inode->i_op = &btrfs_special_inode_operations; - err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + init_special_inode(inode, inode->i_mode, rdev); + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) - drop_inode = 1; - else { - init_special_inode(inode, inode->i_mode, rdev); + goto out_unlock_inode; + + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + if (err) { + goto out_unlock_inode; + } else { btrfs_update_inode(trans, root, inode); + unlock_new_inode(inode); d_instantiate(dentry, inode); } + out_unlock: btrfs_end_transaction(trans, root); btrfs_balance_delayed_items(root); @@ -5947,6 +5964,12 @@ out_unlock: iput(inode); } return err; + +out_unlock_inode: + drop_inode = 1; + unlock_new_inode(inode); + goto out_unlock; + } static int btrfs_create(struct inode *dir, struct dentry *dentry, @@ -5981,15 +6004,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, goto out_unlock; } drop_inode_on_err = 1; - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) - goto out_unlock; - - err = btrfs_update_inode(trans, root, inode); - if (err) - goto out_unlock; - /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see @@ -5998,14 +6012,23 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, */ inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) + goto out_unlock_inode; + + err = btrfs_update_inode(trans, root, inode); + if (err) + goto out_unlock_inode; err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) - goto out_unlock; + goto out_unlock_inode; - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + unlock_new_inode(inode); d_instantiate(dentry, inode); out_unlock: @@ -6017,6 +6040,11 @@ out_unlock: btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); return err; + +out_unlock_inode: + unlock_new_inode(inode); + goto out_unlock; + } static int btrfs_link(struct dentry *old_dentry, struct inode *dir, @@ -6124,25 +6152,30 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } drop_on_err = 1; + /* these must be set before we unlock the inode */ + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) - goto out_fail; - - inode->i_op = &btrfs_dir_inode_operations; - inode->i_fop = &btrfs_dir_file_operations; + goto out_fail_inode; btrfs_i_size_write(inode, 0); err = btrfs_update_inode(trans, root, inode); if (err) - goto out_fail; + goto out_fail_inode; err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, dentry->d_name.len, 0, index); if (err) - goto out_fail; + goto out_fail_inode; d_instantiate(dentry, inode); + /* + * mkdir is special. We're unlocking after we call d_instantiate + * to avoid a race with nfsd calling d_instantiate. + */ + unlock_new_inode(inode); drop_on_err = 0; out_fail: @@ -6152,6 +6185,10 @@ out_fail: btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); return err; + +out_fail_inode: + unlock_new_inode(inode); + goto out_fail; } /* helper for btfs_get_extent. Given an existing extent in the tree, @@ -8107,6 +8144,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, set_nlink(inode, 1); btrfs_i_size_write(inode, 0); + unlock_new_inode(inode); err = btrfs_subvol_inherit_props(trans, new_root, parent_root); if (err) @@ -8757,12 +8795,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, goto out_unlock; } - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) { - drop_inode = 1; - goto out_unlock; - } - /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see @@ -8771,23 +8803,22 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, */ inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) + goto out_unlock_inode; err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) - drop_inode = 1; - else { - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; - } - if (drop_inode) - goto out_unlock; + goto out_unlock_inode; path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; - drop_inode = 1; - goto out_unlock; + goto out_unlock_inode; } key.objectid = btrfs_ino(inode); key.offset = 0; @@ -8796,9 +8827,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, err = btrfs_insert_empty_item(trans, root, path, &key, datasize); if (err) { - drop_inode = 1; btrfs_free_path(path); - goto out_unlock; + goto out_unlock_inode; } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -8822,12 +8852,15 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode_set_bytes(inode, name_len); btrfs_i_size_write(inode, name_len); err = btrfs_update_inode(trans, root, inode); - if (err) + if (err) { drop_inode = 1; + goto out_unlock_inode; + } + + unlock_new_inode(inode); + d_instantiate(dentry, inode); out_unlock: - if (!err) - d_instantiate(dentry, inode); btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); @@ -8835,6 +8868,11 @@ out_unlock: } btrfs_btree_balance_dirty(root); return err; + +out_unlock_inode: + drop_inode = 1; + unlock_new_inode(inode); + goto out_unlock; } static int __btrfs_prealloc_file_range(struct inode *inode, int mode, @@ -9018,14 +9056,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) goto out; } - ret = btrfs_init_inode_security(trans, inode, dir, NULL); - if (ret) - goto out; - - ret = btrfs_update_inode(trans, root, inode); - if (ret) - goto out; - inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; @@ -9033,9 +9063,16 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + ret = btrfs_init_inode_security(trans, inode, dir, NULL); + if (ret) + goto out_inode; + + ret = btrfs_update_inode(trans, root, inode); + if (ret) + goto out_inode; ret = btrfs_orphan_add(trans, inode); if (ret) - goto out; + goto out_inode; /* * We set number of links to 0 in btrfs_new_inode(), and here we set @@ -9045,6 +9082,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() */ set_nlink(inode, 1); + unlock_new_inode(inode); d_tmpfile(dentry, inode); mark_inode_dirty(inode); @@ -9054,8 +9092,12 @@ out: iput(inode); btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); - return ret; + +out_inode: + unlock_new_inode(inode); + goto out; + } static const struct inode_operations btrfs_dir_inode_operations = { -- cgit v1.2.3 From 21e81002f9788a3af591416b6dec60d7b67f2fb2 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 8 Sep 2014 16:17:55 -0700 Subject: nfs: fix kernel warning when removing proc entry I saw the following kernel warning: [ 1852.321222] ------------[ cut here ]------------ [ 1852.326527] WARNING: CPU: 0 PID: 118 at fs/proc/generic.c:521 remove_proc_entry+0x154/0x16b() [ 1852.335630] remove_proc_entry: removing non-empty directory 'fs/nfsfs', leaking at least 'volumes' [ 1852.344084] CPU: 0 PID: 118 Comm: kworker/u8:2 Not tainted 3.16.0+ #540 [ 1852.350036] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 1852.354992] Workqueue: netns cleanup_net [ 1852.358701] 0000000000000000 ffff880116f2fbd0 ffffffff819c03e9 ffff880116f2fc18 [ 1852.366474] ffff880116f2fc08 ffffffff810744ee ffffffff811e0e6e ffff8800d4e96238 [ 1852.373507] ffffffff81dbe665 ffff8800d46a5948 0000000000000005 ffff880116f2fc68 [ 1852.380224] Call Trace: [ 1852.381976] [] dump_stack+0x4d/0x66 [ 1852.385495] [] warn_slowpath_common+0x7a/0x93 [ 1852.389869] [] ? remove_proc_entry+0x154/0x16b [ 1852.393987] [] warn_slowpath_fmt+0x4c/0x4e [ 1852.397999] [] remove_proc_entry+0x154/0x16b [ 1852.402034] [] nfs_fs_proc_net_exit+0x53/0x56 [ 1852.406136] [] nfs_net_exit+0x12/0x1d [ 1852.409774] [] ops_exit_list+0x44/0x55 [ 1852.413529] [] cleanup_net+0xee/0x182 [ 1852.417198] [] process_one_work+0x209/0x40d [ 1852.502320] [] ? process_one_work+0x162/0x40d [ 1852.587629] [] worker_thread+0x1f0/0x2c7 [ 1852.673291] [] ? process_scheduled_works+0x2f/0x2f [ 1852.759470] [] kthread+0xc9/0xd1 [ 1852.843099] [] ? finish_task_switch+0x3a/0xce [ 1852.926518] [] ? __kthread_parkme+0x61/0x61 [ 1853.008565] [] ret_from_fork+0x7c/0xb0 [ 1853.076477] [] ? __kthread_parkme+0x61/0x61 [ 1853.140653] ---[ end trace 69c4c6617f78e32d ]--- It looks wrong that we add "/proc/net/nfsfs" in nfs_fs_proc_net_init() while remove "/proc/fs/nfsfs" in nfs_fs_proc_net_exit(). Fixes: commit 65b38851a17 (NFS: Fix /proc/fs/nfsfs/servers and /proc/fs/nfsfs/volumes) Cc: Eric W. Biederman Cc: Trond Myklebust Cc: Dan Aloni Signed-off-by: Cong Wang [Trond: replace uses of remove_proc_entry() with remove_proc_subtree() as suggested by Al Viro] Cc: stable@vger.kernel.org # 3.4.x : 65b38851a17: NFS: Fix /proc/fs/nfsfs/servers Cc: stable@vger.kernel.org # 3.4.x Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 1c5ff6d58385..6a4f3666e273 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1412,24 +1412,18 @@ int nfs_fs_proc_net_init(struct net *net) p = proc_create("volumes", S_IFREG|S_IRUGO, nn->proc_nfsfs, &nfs_volume_list_fops); if (!p) - goto error_2; + goto error_1; return 0; -error_2: - remove_proc_entry("servers", nn->proc_nfsfs); error_1: - remove_proc_entry("fs/nfsfs", NULL); + remove_proc_subtree("nfsfs", net->proc_net); error_0: return -ENOMEM; } void nfs_fs_proc_net_exit(struct net *net) { - struct nfs_net *nn = net_generic(net, nfs_net_id); - - remove_proc_entry("volumes", nn->proc_nfsfs); - remove_proc_entry("servers", nn->proc_nfsfs); - remove_proc_entry("fs/nfsfs", NULL); + remove_proc_subtree("nfsfs", net->proc_net); } /* -- cgit v1.2.3 From 0c0e0d3c091ed5b823fb89e1d46dbb9abd5830a7 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 8 Sep 2014 08:26:01 -0400 Subject: nfs: revert "nfs4: queue free_lock_state job submission to nfsiod" This reverts commit 49a4bda22e186c4d0eb07f4a36b5b1a378f9398d. Christoph reported an oops due to the above commit: generic/089 242s ...[ 2187.041239] general protection fault: 0000 [#1] SMP [ 2187.042899] Modules linked in: [ 2187.044000] CPU: 0 PID: 11913 Comm: kworker/0:1 Not tainted 3.16.0-rc6+ #1151 [ 2187.044287] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007 [ 2187.044287] Workqueue: nfsiod free_lock_state_work [ 2187.044287] task: ffff880072b50cd0 ti: ffff88007a4ec000 task.ti: ffff88007a4ec000 [ 2187.044287] RIP: 0010:[] [] free_lock_state_work+0x16/0x30 [ 2187.044287] RSP: 0018:ffff88007a4efd58 EFLAGS: 00010296 [ 2187.044287] RAX: 6b6b6b6b6b6b6b6b RBX: ffff88007a947ac0 RCX: 8000000000000000 [ 2187.044287] RDX: ffffffff826af9e0 RSI: ffff88007b093c00 RDI: ffff88007b093db8 [ 2187.044287] RBP: ffff88007a4efd58 R08: ffffffff832d3e10 R09: 000001c40efc0000 [ 2187.044287] R10: 0000000000000000 R11: 0000000000059e30 R12: ffff88007fc13240 [ 2187.044287] R13: ffff88007fc18b00 R14: ffff88007b093db8 R15: 0000000000000000 [ 2187.044287] FS: 0000000000000000(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000 [ 2187.044287] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 2187.044287] CR2: 00007f93ec33fb80 CR3: 0000000079dc2000 CR4: 00000000000006f0 [ 2187.044287] Stack: [ 2187.044287] ffff88007a4efdd8 ffffffff810cc877 ffffffff810cc80d ffff88007fc13258 [ 2187.044287] 000000007a947af0 0000000000000000 ffffffff8353ccc8 ffffffff82b6f3d0 [ 2187.044287] 0000000000000000 ffffffff82267679 ffff88007a4efdd8 ffff88007fc13240 [ 2187.044287] Call Trace: [ 2187.044287] [] process_one_work+0x1c7/0x490 [ 2187.044287] [] ? process_one_work+0x15d/0x490 [ 2187.044287] [] worker_thread+0x119/0x4f0 [ 2187.044287] [] ? trace_hardirqs_on+0xd/0x10 [ 2187.044287] [] ? init_pwq+0x190/0x190 [ 2187.044287] [] kthread+0xdf/0x100 [ 2187.044287] [] ? __init_kthread_worker+0x70/0x70 [ 2187.044287] [] ret_from_fork+0x7c/0xb0 [ 2187.044287] [] ? __init_kthread_worker+0x70/0x70 [ 2187.044287] Code: 0f 1f 44 00 00 31 c0 5d c3 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 8d b7 48 fe ff ff 48 8b 87 58 fe ff ff 48 89 e5 48 8b 40 30 <48> 8b 00 48 8b 10 48 89 c7 48 8b 92 90 03 00 00 ff 52 28 5d c3 [ 2187.044287] RIP [] free_lock_state_work+0x16/0x30 [ 2187.044287] RSP [ 2187.103626] ---[ end trace 0f11326d28e5d8fa ]--- The original reason for this patch was because the fl_release_private operation couldn't sleep. With commit ed9814d85810 (locks: defer freeing locks in locks_delete_lock until after i_lock has been dropped), this is no longer a problem so we can revert this patch. Reported-by: Christoph Hellwig Signed-off-by: Jeff Layton Reviewed-by: Christoph Hellwig Tested-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 13 ++++++------- fs/nfs/nfs4state.c | 24 ++++++------------------ 2 files changed, 12 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 92193eddb41d..a8b855ab4e22 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -130,16 +130,15 @@ enum { */ struct nfs4_lock_state { - struct list_head ls_locks; /* Other lock stateids */ - struct nfs4_state * ls_state; /* Pointer to open state */ + struct list_head ls_locks; /* Other lock stateids */ + struct nfs4_state * ls_state; /* Pointer to open state */ #define NFS_LOCK_INITIALIZED 0 #define NFS_LOCK_LOST 1 - unsigned long ls_flags; + unsigned long ls_flags; struct nfs_seqid_counter ls_seqid; - nfs4_stateid ls_stateid; - atomic_t ls_count; - fl_owner_t ls_owner; - struct work_struct ls_release; + nfs4_stateid ls_stateid; + atomic_t ls_count; + fl_owner_t ls_owner; }; /* bits for nfs4_state->flags */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index a043f618cd5a..22fe35104c0c 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -799,18 +799,6 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) return NULL; } -static void -free_lock_state_work(struct work_struct *work) -{ - struct nfs4_lock_state *lsp = container_of(work, - struct nfs4_lock_state, ls_release); - struct nfs4_state *state = lsp->ls_state; - struct nfs_server *server = state->owner->so_server; - struct nfs_client *clp = server->nfs_client; - - clp->cl_mvops->free_lock_state(server, lsp); -} - /* * Return a compatible lock_state. If no initialized lock_state structure * exists, return an uninitialized one. @@ -832,7 +820,6 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f if (lsp->ls_seqid.owner_id < 0) goto out_free; INIT_LIST_HEAD(&lsp->ls_locks); - INIT_WORK(&lsp->ls_release, free_lock_state_work); return lsp; out_free: kfree(lsp); @@ -896,12 +883,13 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) if (list_empty(&state->lock_states)) clear_bit(LK_STATE_IN_USE, &state->flags); spin_unlock(&state->state_lock); - if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) - queue_work(nfsiod_workqueue, &lsp->ls_release); - else { - server = state->owner->so_server; + server = state->owner->so_server; + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { + struct nfs_client *clp = server->nfs_client; + + clp->cl_mvops->free_lock_state(server, lsp); + } else nfs4_free_lock_state(server, lsp); - } } static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) -- cgit v1.2.3 From 224ecbf5a674ec7da3a3b3ea21ca62e2853653fa Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Tue, 9 Sep 2014 17:51:47 -0400 Subject: pnfs: fix filelayout_retry_commit when idx > 0 filelayout_retry_commit was recently split out from alloc_ds_commits, but was done in such a way that the bucket pointer always starts at index 0 no matter what the @idx argument is set to. The intention of the @idx argument is to retry commits starting at bucket @idx. This is called when alloc_ds_commits fails for a bucket. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/filelayout/filelayout.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 1359c4a27393..90978075f730 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -1269,11 +1269,12 @@ filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page) static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx) { struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; - struct pnfs_commit_bucket *bucket = fl_cinfo->buckets; + struct pnfs_commit_bucket *bucket; struct pnfs_layout_segment *freeme; int i; - for (i = idx; i < fl_cinfo->nbuckets; i++, bucket++) { + for (i = idx; i < fl_cinfo->nbuckets; i++) { + bucket = &fl_cinfo->buckets[i]; if (list_empty(&bucket->committing)) continue; nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); -- cgit v1.2.3 From c680e41b3a2e944185c74bf60531e3d316d3ecc4 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Tue, 9 Sep 2014 14:50:51 -0700 Subject: eventpoll: fix uninitialized variable in epoll_ctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When calling epoll_ctl with operation EPOLL_CTL_DEL, structure epds is not initialized but ep_take_care_of_epollwakeup reads its event field. When this unintialized field has EPOLLWAKEUP bit set, a capability check is done for CAP_BLOCK_SUSPEND in ep_take_care_of_epollwakeup. This produces unexpected messages in the audit log, such as (on a system running SELinux): type=AVC msg=audit(1408212798.866:410): avc: denied { block_suspend } for pid=7754 comm="dbus-daemon" capability=36 scontext=unconfined_u:unconfined_r:unconfined_t tcontext=unconfined_u:unconfined_r:unconfined_t tclass=capability2 permissive=1 type=SYSCALL msg=audit(1408212798.866:410): arch=c000003e syscall=233 success=yes exit=0 a0=3 a1=2 a2=9 a3=7fffd4d66ec0 items=0 ppid=1 pid=7754 auid=1000 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=(none) ses=3 comm="dbus-daemon" exe="/usr/bin/dbus-daemon" subj=unconfined_u:unconfined_r:unconfined_t key=(null) ("arch=c000003e syscall=233 a1=2" means "epoll_ctl(op=EPOLL_CTL_DEL)") Remove use of epds in epoll_ctl when op == EPOLL_CTL_DEL. Fixes: 4d7e30d98939 ("epoll: Add a flag, EPOLLWAKEUP, to prevent suspend while epoll events are ready") Signed-off-by: Nicolas Iooss Cc: Alexander Viro Cc: Arve Hjønnevåg Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index b10b48c2a7af..7bcfff900f05 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1852,7 +1852,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, goto error_tgt_fput; /* Check if EPOLLWAKEUP is allowed */ - ep_take_care_of_epollwakeup(&epds); + if (ep_op_has_event(op)) + ep_take_care_of_epollwakeup(&epds); /* * We have to check that the file structure underneath the file descriptor -- cgit v1.2.3 From 1fc98d11cac6dd66342e5580cb2687e5b1e9a613 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Tue, 9 Sep 2014 14:51:04 -0700 Subject: fsnotify/fdinfo: use named constants instead of hardcoded values MAX_HANDLE_SZ is equal to 128, but currently the size of pad is only 64 bytes, so exportfs_encode_inode_fh can return an error. Signed-off-by: Andrey Vagin Acked-by: Cyrill Gorcunov Cc: Alexander Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fdinfo.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 238a5930cb3c..660d33bc1bef 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -42,7 +42,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode) { struct { struct file_handle handle; - u8 pad[64]; + u8 pad[MAX_HANDLE_SZ]; } f; int size, ret, i; @@ -50,7 +50,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode) size = f.handle.handle_bytes >> 2; ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0); - if ((ret == 255) || (ret == -ENOSPC)) { + if ((ret == FILEID_INVALID) || (ret == -ENOSPC)) { WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); return 0; } -- cgit v1.2.3 From 7e8824816bda16bb11ff5ff1e1212d642e57b0b3 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Tue, 9 Sep 2014 14:51:06 -0700 Subject: fs/notify: don't show f_handle if exportfs_encode_inode_fh failed Currently we handle only ENOSPC. In case of other errors the file_handle variable isn't filled properly and we will show a part of stack. Signed-off-by: Andrey Vagin Acked-by: Cyrill Gorcunov Cc: Alexander Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fdinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 660d33bc1bef..9d7e2b9659cb 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -50,7 +50,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode) size = f.handle.handle_bytes >> 2; ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0); - if ((ret == FILEID_INVALID) || (ret == -ENOSPC)) { + if ((ret == FILEID_INVALID) || (ret < 0)) { WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); return 0; } -- cgit v1.2.3 From 7b7a91152d3c2ddca95172cac1675625cc8dffaf Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 10 Sep 2014 14:09:20 -0400 Subject: GFS2: Hash the negative dentry during inode lookup Fix a regression introduced by: 6d4ade986f9c8df31e68 GFS2: Add atomic_open support where an early return misses d_splice_alias() which had been adding the negative dentry. Signed-off-by: Benjamin Coddington Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e62e59477884..9317ddc1b3c3 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -840,8 +840,10 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, int error; inode = gfs2_lookupi(dir, &dentry->d_name, 0); - if (!inode) + if (inode == NULL) { + d_add(dentry, NULL); return NULL; + } if (IS_ERR(inode)) return ERR_CAST(inode); -- cgit v1.2.3 From a937cca270c544c4319572b57b196b2251b07dd2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 10 Sep 2014 21:22:51 +0200 Subject: GFS2: Don't use MAXQUOTAS value MAXQUOTAS value defines maximum number of quota types VFS supports. This isn't necessarily the number of types gfs2 supports and with addition of project quotas these two numbers stop matching. So make gfs2 use its private definition. CC: cluster-devel@redhat.com Signed-off-by: Jan Kara Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 67d310c9ada3..39e7e9959b74 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -262,6 +262,9 @@ struct gfs2_holder { unsigned long gh_ip; }; +/* Number of quota types we support */ +#define GFS2_MAXQUOTAS 2 + /* Resource group multi-block reservation, in order of appearance: Step 1. Function prepares to write, allocates a mb, sets the size hint. @@ -282,8 +285,8 @@ struct gfs2_blkreserv { u64 rs_inum; /* Inode number for reservation */ /* ancillary quota stuff */ - struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS]; - struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS]; + struct gfs2_quota_data *rs_qa_qd[2 * GFS2_MAXQUOTAS]; + struct gfs2_holder rs_qa_qd_ghs[2 * GFS2_MAXQUOTAS]; unsigned int rs_qa_qd_num; }; -- cgit v1.2.3 From f39c01047994e66e7f3d89ddb4c6141f23349d8d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Sep 2014 16:19:37 +1000 Subject: NFS: remove BUG possibility in nfs4_open_and_get_state commit 4fa2c54b5198d09607a534e2fd436581064587ed NFS: nfs4_do_open should add negative results to the dcache. used "d_drop(); d_add();" to ensure that a dentry was hashed as a negative cached entry. This is not safe if the dentry has an non-NULL ->d_inode. It will trigger a BUG_ON in d_instantiate(). In that case, d_delete() is needed. Also, only d_add if the dentry is currently unhashed, it seems pointless removed and re-adding it unchanged. Reported-by: Christoph Hellwig Fixes: 4fa2c54b5198d09607a534e2fd436581064587ed Cc: Jeff Layton Link: http://lkml.kernel.org/r/20140908144525.GB19811@infradead.org Signed-off-by: NeilBrown Acked-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7dd8aca31c29..ac2dd953fc18 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2226,9 +2226,13 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, ret = _nfs4_proc_open(opendata); if (ret != 0) { if (ret == -ENOENT) { - d_drop(opendata->dentry); - d_add(opendata->dentry, NULL); - nfs_set_verifier(opendata->dentry, + dentry = opendata->dentry; + if (dentry->d_inode) + d_delete(dentry); + else if (d_unhashed(dentry)) + d_add(dentry, NULL); + + nfs_set_verifier(dentry, nfs_save_change_attribute(opendata->dir->d_inode)); } goto out; -- cgit v1.2.3 From cfb2f9d5c921e38b0f12bb26fed10b877664444d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 12 Sep 2014 20:56:04 +0100 Subject: GFS2: fix d_splice_alias() misuses Callers of d_splice_alias(dentry, inode) don't need iput(), neither on success nor on failure. Either the reference to inode is stored in a previously negative dentry, or it's dropped. In either case inode reference the caller used to hold is consumed. __gfs2_lookup() does iput() in case when d_splice_alias() has failed. Double iput() if we ever hit that. And gfs2_create_inode() ends up not only with double iput(), but with link count dropped to zero - on an inode it has just found in directory. Cc: stable@vger.kernel.org # v3.14+ Signed-off-by: Al Viro Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 9317ddc1b3c3..fc8ac2ee0667 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -626,8 +626,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (!IS_ERR(inode)) { d = d_splice_alias(inode, dentry); error = PTR_ERR(d); - if (IS_ERR(d)) + if (IS_ERR(d)) { + inode = ERR_CAST(d); goto fail_gunlock; + } error = 0; if (file) { if (S_ISREG(inode->i_mode)) { @@ -856,7 +858,6 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, d = d_splice_alias(inode, dentry); if (IS_ERR(d)) { - iput(inode); gfs2_glock_dq_uninit(&gh); return d; } -- cgit v1.2.3 From 99d263d4c5b2f541dfacb5391e22e8c91ea982a6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 13 Sep 2014 11:30:10 -0700 Subject: vfs: fix bad hashing of dentries Josef Bacik found a performance regression between 3.2 and 3.10 and narrowed it down to commit bfcfaa77bdf0 ("vfs: use 'unsigned long' accesses for dcache name comparison and hashing"). He reports: "The test case is essentially for (i = 0; i < 1000000; i++) mkdir("a$i"); On xfs on a fio card this goes at about 20k dir/sec with 3.2, and 12k dir/sec with 3.10. This is because we spend waaaaay more time in __d_lookup on 3.10 than in 3.2. The new hashing function for strings is suboptimal for < sizeof(unsigned long) string names (and hell even > sizeof(unsigned long) string names that I've tested). I broke out the old hashing function and the new one into a userspace helper to get real numbers and this is what I'm getting: Old hash table had 1000000 entries, 0 dupes, 0 max dupes New hash table had 12628 entries, 987372 dupes, 900 max dupes We had 11400 buckets with a p50 of 30 dupes, p90 of 240 dupes, p99 of 567 dupes for the new hash My test does the hash, and then does the d_hash into a integer pointer array the same size as the dentry hash table on my system, and then just increments the value at the address we got to see how many entries we overlap with. As you can see the old hash function ended up with all 1 million entries in their own bucket, whereas the new one they are only distributed among ~12.5k buckets, which is why we're using so much more CPU in __d_lookup". The reason for this hash regression is two-fold: - On 64-bit architectures the down-mixing of the original 64-bit word-at-a-time hash into the final 32-bit hash value is very simplistic and suboptimal, and just adds the two 32-bit parts together. In particular, because there is no bit shuffling and the mixing boundary is also a byte boundary, similar character patterns in the low and high word easily end up just canceling each other out. - the old byte-at-a-time hash mixed each byte into the final hash as it hashed the path component name, resulting in the low bits of the hash generally being a good source of hash data. That is not true for the word-at-a-time case, and the hash data is distributed among all the bits. The fix is the same in both cases: do a better job of mixing the bits up and using as much of the hash data as possible. We already have the "hash_32|64()" functions to do that. Reported-by: Josef Bacik Cc: Al Viro Cc: Christoph Hellwig Cc: Chris Mason Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/dcache.c | 3 +-- fs/namei.c | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index d30ce699ae4b..4023e77b800e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -106,8 +106,7 @@ static inline struct hlist_bl_head *d_hash(const struct dentry *parent, unsigned int hash) { hash += (unsigned long) parent / L1_CACHE_BYTES; - hash = hash + (hash >> d_hash_shift); - return dentry_hashtable + (hash & d_hash_mask); + return dentry_hashtable + hash_32(hash, d_hash_shift); } /* Statistics gathering. */ diff --git a/fs/namei.c b/fs/namei.c index a996bb48dfab..229235862e50 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include "internal.h" @@ -1634,8 +1635,7 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) static inline unsigned int fold_hash(unsigned long hash) { - hash += hash >> (8*sizeof(int)); - return hash; + return hash_64(hash, 32); } #else /* 32-bit case */ -- cgit v1.2.3 From 6f18493e541c690169c3b1479d47d95f624161cf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 11 Sep 2014 18:55:50 -0400 Subject: move the call of __d_drop(anon) into __d_materialise_unique(dentry, anon) and lock the right list there Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/dcache.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index d30ce699ae4b..5c6e71dc23f5 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2656,6 +2656,12 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) dentry->d_parent = dentry; list_del_init(&dentry->d_u.d_child); anon->d_parent = dparent; + if (likely(!d_unhashed(anon))) { + hlist_bl_lock(&anon->d_sb->s_anon); + __hlist_bl_del(&anon->d_hash); + anon->d_hash.pprev = NULL; + hlist_bl_unlock(&anon->d_sb->s_anon); + } list_move(&anon->d_u.d_child, &dparent->d_subdirs); write_seqcount_end(&dentry->d_seq); @@ -2714,7 +2720,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) write_seqlock(&rename_lock); __d_materialise_dentry(dentry, new); write_sequnlock(&rename_lock); - __d_drop(new); _d_rehash(new); spin_unlock(&new->d_lock); spin_unlock(&inode->i_lock); @@ -2778,7 +2783,6 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) * could splice into our tree? */ __d_materialise_dentry(dentry, alias); write_sequnlock(&rename_lock); - __d_drop(alias); goto found; } else { /* Nope, but we must(!) avoid directory -- cgit v1.2.3 From f5be3e29127aec8c87f883aadadff337f8c2cfd7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 Sep 2014 21:50:45 -0400 Subject: fix bogus read_seqretry() checks introduced in b37199e read_seqretry() returns true on mismatch, not on match... Cc: stable@vger.kernel.org # 3.15+ Signed-off-by: Al Viro --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index a996bb48dfab..3d1dc745f9d8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1137,7 +1137,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, */ *inode = path->dentry->d_inode; } - return read_seqretry(&mount_lock, nd->m_seq) && + return !read_seqretry(&mount_lock, nd->m_seq) && !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); } @@ -1174,7 +1174,7 @@ static int follow_dotdot_rcu(struct nameidata *nd) nd->path.mnt = &mounted->mnt; nd->path.dentry = mounted->mnt.mnt_root; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); - if (!read_seqretry(&mount_lock, nd->m_seq)) + if (read_seqretry(&mount_lock, nd->m_seq)) goto failed; } nd->inode = nd->path.dentry->d_inode; -- cgit v1.2.3 From 7bd88377d482e1eae3c5329b12e33cfd664fa6a9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 Sep 2014 21:55:46 -0400 Subject: don't bugger nd->seq on set_root_rcu() from follow_dotdot_rcu() return the value instead, and have path_init() do the assignment. Broken by "vfs: Fix absolute RCU path walk failures due to uninitialized seq number", which was Cc-stable with 2.6.38+ as destination. This one should go where it went. To avoid dummy value returned in case when root is already set (it would do no harm, actually, since the only caller that doesn't ignore the return value is guaranteed to have nd->root *not* set, but it's more obvious that way), lift the check into callers. And do the same to set_root(), to keep them in sync. Cc: stable@vger.kernel.org # 2.6.38+ Signed-off-by: Al Viro --- fs/namei.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 3d1dc745f9d8..fe47e6d8e85f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -643,24 +643,22 @@ static int complete_walk(struct nameidata *nd) static __always_inline void set_root(struct nameidata *nd) { - if (!nd->root.mnt) - get_fs_root(current->fs, &nd->root); + get_fs_root(current->fs, &nd->root); } static int link_path_walk(const char *, struct nameidata *); -static __always_inline void set_root_rcu(struct nameidata *nd) +static __always_inline unsigned set_root_rcu(struct nameidata *nd) { - if (!nd->root.mnt) { - struct fs_struct *fs = current->fs; - unsigned seq; + struct fs_struct *fs = current->fs; + unsigned seq, res; - do { - seq = read_seqcount_begin(&fs->seq); - nd->root = fs->root; - nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); - } + do { + seq = read_seqcount_begin(&fs->seq); + nd->root = fs->root; + res = __read_seqcount_begin(&nd->root.dentry->d_seq); + } while (read_seqcount_retry(&fs->seq, seq)); + return res; } static void path_put_conditional(struct path *path, struct nameidata *nd) @@ -860,7 +858,8 @@ follow_link(struct path *link, struct nameidata *nd, void **p) return PTR_ERR(s); } if (*s == '/') { - set_root(nd); + if (!nd->root.mnt) + set_root(nd); path_put(&nd->path); nd->path = nd->root; path_get(&nd->root); @@ -1143,7 +1142,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, static int follow_dotdot_rcu(struct nameidata *nd) { - set_root_rcu(nd); + if (!nd->root.mnt) + set_root_rcu(nd); while (1) { if (nd->path.dentry == nd->root.dentry && @@ -1256,7 +1256,8 @@ static void follow_mount(struct path *path) static void follow_dotdot(struct nameidata *nd) { - set_root(nd); + if (!nd->root.mnt) + set_root(nd); while(1) { struct dentry *old = nd->path.dentry; @@ -1852,7 +1853,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, if (*name=='/') { if (flags & LOOKUP_RCU) { rcu_read_lock(); - set_root_rcu(nd); + nd->seq = set_root_rcu(nd); } else { set_root(nd); path_get(&nd->root); -- cgit v1.2.3 From 4023bfc9f351a7994fb6a7d515476c320f94a574 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 Sep 2014 21:59:43 -0400 Subject: be careful with nd->inode in path_init() and follow_dotdot_rcu() in the former we simply check if dentry is still valid after picking its ->d_inode; in the latter we fetch ->d_inode in the same places where we fetch dentry and its ->d_seq, under the same checks. Cc: stable@vger.kernel.org # 2.6.38+ Signed-off-by: Al Viro --- fs/namei.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index fe47e6d8e85f..d07bc1b206c3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1142,6 +1142,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, static int follow_dotdot_rcu(struct nameidata *nd) { + struct inode *inode = nd->inode; if (!nd->root.mnt) set_root_rcu(nd); @@ -1155,6 +1156,7 @@ static int follow_dotdot_rcu(struct nameidata *nd) struct dentry *parent = old->d_parent; unsigned seq; + inode = parent->d_inode; seq = read_seqcount_begin(&parent->d_seq); if (read_seqcount_retry(&old->d_seq, nd->seq)) goto failed; @@ -1164,6 +1166,7 @@ static int follow_dotdot_rcu(struct nameidata *nd) } if (!follow_up_rcu(&nd->path)) break; + inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } while (d_mountpoint(nd->path.dentry)) { @@ -1173,11 +1176,12 @@ static int follow_dotdot_rcu(struct nameidata *nd) break; nd->path.mnt = &mounted->mnt; nd->path.dentry = mounted->mnt.mnt_root; + inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); if (read_seqretry(&mount_lock, nd->m_seq)) goto failed; } - nd->inode = nd->path.dentry->d_inode; + nd->inode = inode; return 0; failed: @@ -1904,7 +1908,14 @@ static int path_init(int dfd, const char *name, unsigned int flags, } nd->inode = nd->path.dentry->d_inode; - return 0; + if (!(flags & LOOKUP_RCU)) + return 0; + if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq))) + return 0; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + return -ECHILD; } static inline int lookup_last(struct nameidata *nd, struct path *path) -- cgit v1.2.3 From 2ae83bf93882d1ec0cd775c489bd1bee611f792e Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 14 Sep 2014 17:06:36 -0500 Subject: [CIFS] Fix setting time before epoch (negative time values) xfstest generic/258 sets the time on a file to a negative value (before 1970) which fails since do_div can not handle negative numbers. In addition 'normal' division of 64 bit values does not build on 32 bit arch so have to workaround this by special casing negative values in cifs_NTtimeToUnix Samba server also has a bug with this (see samba bugzilla 7771) but it works to Windows server. Signed-off-by: Steve French --- fs/cifs/netmisc.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 6834b9c3bec1..b333ff60781d 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -925,11 +925,23 @@ cifs_NTtimeToUnix(__le64 ntutc) /* BB what about the timezone? BB */ /* Subtract the NTFS time offset, then convert to 1s intervals. */ - u64 t; + s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET; + + /* + * Unfortunately can not use normal 64 bit division on 32 bit arch, but + * the alternative, do_div, does not work with negative numbers so have + * to special case them + */ + if (t < 0) { + t = -t; + ts.tv_nsec = (long)(do_div(t, 10000000) * 100); + ts.tv_nsec = -ts.tv_nsec; + ts.tv_sec = -t; + } else { + ts.tv_nsec = (long)do_div(t, 10000000) * 100; + ts.tv_sec = t; + } - t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET; - ts.tv_nsec = do_div(t, 10000000) * 100; - ts.tv_sec = t; return ts; } -- cgit v1.2.3 From 9226b5b440f2b4fbb3b797f3cb74a9a627220660 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 14 Sep 2014 17:28:32 -0700 Subject: vfs: avoid non-forwarding large load after small store in path lookup The performance regression that Josef Bacik reported in the pathname lookup (see commit 99d263d4c5b2 "vfs: fix bad hashing of dentries") made me look at performance stability of the dcache code, just to verify that the problem was actually fixed. That turned up a few other problems in this area. There are a few cases where we exit RCU lookup mode and go to the slow serializing case when we shouldn't, Al has fixed those and they'll come in with the next VFS pull. But my performance verification also shows that link_path_walk() turns out to have a very unfortunate 32-bit store of the length and hash of the name we look up, followed by a 64-bit read of the combined hash_len field. That screws up the processor store to load forwarding, causing an unnecessary hickup in this critical routine. It's caused by the ugly calling convention for the "hash_name()" function, and easily fixed by just making hash_name() fill in the whole 'struct qstr' rather than passing it a pointer to just the hash value. With that, the profile for this function looks much smoother. Signed-off-by: Linus Torvalds --- fs/namei.c | 19 ++++++++++--------- include/linux/dcache.h | 1 + 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 229235862e50..2be5120b81b3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1669,13 +1669,14 @@ EXPORT_SYMBOL(full_name_hash); /* * Calculate the length and hash of the path component, and - * return the length of the component; + * fill in the qstr. return the "len" as the result. */ -static inline unsigned long hash_name(const char *name, unsigned int *hashp) +static inline unsigned long hash_name(const char *name, struct qstr *res) { unsigned long a, b, adata, bdata, mask, hash, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; + res->name = name; hash = a = 0; len = -sizeof(unsigned long); do { @@ -1691,9 +1692,10 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp) mask = create_zero_mask(adata | bdata); hash += a & zero_bytemask(mask); - *hashp = fold_hash(hash); + len += find_zero(mask); + res->hash_len = hashlen_create(fold_hash(hash), len); - return len + find_zero(mask); + return len; } #else @@ -1711,18 +1713,19 @@ EXPORT_SYMBOL(full_name_hash); * We know there's a real path component here of at least * one character. */ -static inline unsigned long hash_name(const char *name, unsigned int *hashp) +static inline long hash_name(const char *name, struct qstr *res) { unsigned long hash = init_name_hash(); unsigned long len = 0, c; + res->name = name; c = (unsigned char)*name; do { len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } while (c && c != '/'); - *hashp = end_name_hash(hash); + res->hash_len = hashlen_create(end_name_hash(hash), len); return len; } @@ -1756,9 +1759,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (err) break; - len = hash_name(name, &this.hash); - this.name = name; - this.len = len; + len = hash_name(name, &this); type = LAST_NORM; if (name[0] == '.') switch (len) { diff --git a/include/linux/dcache.h b/include/linux/dcache.h index e4ae2ad48d07..75a227cc7ce2 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -55,6 +55,7 @@ struct qstr { #define QSTR_INIT(n,l) { { { .len = l } }, .name = n } #define hashlen_hash(hashlen) ((u32) (hashlen)) #define hashlen_len(hashlen) ((u32)((hashlen) >> 32)) +#define hashlen_create(hash,len) (((u64)(len)<<32)|(u32)(hash)) struct dentry_stat_t { long nr_dentry; -- cgit v1.2.3 From da80659d4aa758dc6935b10ec64513f0b67bc969 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 14 Sep 2014 23:27:09 -0500 Subject: [SMB3] Fix oops when creating symlinks on smb3 We were not checking for symlink support properly for SMB2/SMB3 mounts so could oops when mounted with mfsymlinks when try to create symlink when mfsymlinks on smb2/smb3 mounts Signed-off-by: Steve French Cc: # 3.14+ CC: Sachin Prabhu --- fs/cifs/link.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 68559fd557fb..a5c2812ead68 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -213,8 +213,12 @@ create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon, if (rc) goto out; - rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, cifs_sb, - fromName, buf, &bytes_written); + if (tcon->ses->server->ops->create_mf_symlink) + rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, + cifs_sb, fromName, buf, &bytes_written); + else + rc = -EOPNOTSUPP; + if (rc) goto out; -- cgit v1.2.3 From d6bb3e9075bbcf758d6084bb581f797bb6ea24c6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 15 Sep 2014 10:51:07 -0700 Subject: vfs: simplify and shrink stack frame of link_path_walk() Commit 9226b5b440f2 ("vfs: avoid non-forwarding large load after small store in path lookup") made link_path_walk() always access the "hash_len" field as a single 64-bit entity, in order to avoid mixed size accesses to the members. However, what I didn't notice was that that effectively means that the whole "struct qstr this" is now basically redundant. We already explicitly track the "const char *name", and if we just use "u64 hash_len" instead of "long len", there is nothing else left of the "struct qstr". We do end up wanting the "struct qstr" if we have a filesystem with a "d_hash()" function, but that's a rare case, and we might as well then just squirrell away the name and hash_len at that point. End result: fewer live variables in the loop, a smaller stack frame, and better code generation. And we don't need to pass in pointers variables to helper functions any more, because the return value contains all the relevant information. So this removes more lines than it adds, and the source code is clearer too. Signed-off-by: Linus Torvalds --- fs/namei.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 215e44254c53..01d03892316c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1674,14 +1674,13 @@ EXPORT_SYMBOL(full_name_hash); /* * Calculate the length and hash of the path component, and - * fill in the qstr. return the "len" as the result. + * return the "hash_len" as the result. */ -static inline unsigned long hash_name(const char *name, struct qstr *res) +static inline u64 hash_name(const char *name) { unsigned long a, b, adata, bdata, mask, hash, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; - res->name = name; hash = a = 0; len = -sizeof(unsigned long); do { @@ -1698,9 +1697,7 @@ static inline unsigned long hash_name(const char *name, struct qstr *res) hash += a & zero_bytemask(mask); len += find_zero(mask); - res->hash_len = hashlen_create(fold_hash(hash), len); - - return len; + return hashlen_create(fold_hash(hash), len); } #else @@ -1718,20 +1715,18 @@ EXPORT_SYMBOL(full_name_hash); * We know there's a real path component here of at least * one character. */ -static inline long hash_name(const char *name, struct qstr *res) +static inline u64 hash_name(const char *name) { unsigned long hash = init_name_hash(); unsigned long len = 0, c; - res->name = name; c = (unsigned char)*name; do { len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } while (c && c != '/'); - res->hash_len = hashlen_create(end_name_hash(hash), len); - return len; + return hashlen_create(end_name_hash(hash), len); } #endif @@ -1756,18 +1751,17 @@ static int link_path_walk(const char *name, struct nameidata *nd) /* At this point we know we have a real path component. */ for(;;) { - struct qstr this; - long len; + u64 hash_len; int type; err = may_lookup(nd); if (err) break; - len = hash_name(name, &this); + hash_len = hash_name(name); type = LAST_NORM; - if (name[0] == '.') switch (len) { + if (name[0] == '.') switch (hashlen_len(hash_len)) { case 2: if (name[1] == '.') { type = LAST_DOTDOT; @@ -1781,29 +1775,32 @@ static int link_path_walk(const char *name, struct nameidata *nd) struct dentry *parent = nd->path.dentry; nd->flags &= ~LOOKUP_JUMPED; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { + struct qstr this = { .hash_len = hash_len, .name = name }; err = parent->d_op->d_hash(parent, &this); if (err < 0) break; + hash_len = this.hash_len; + name = this.name; } } - nd->last = this; + nd->last.hash_len = hash_len; + nd->last.name = name; nd->last_type = type; - if (!name[len]) + name += hashlen_len(hash_len); + if (!*name) return 0; /* * If it wasn't NUL, we know it was '/'. Skip that * slash, and continue until no more slashes. */ do { - len++; - } while (unlikely(name[len] == '/')); - if (!name[len]) + name++; + } while (unlikely(*name == '/')); + if (!*name) return 0; - name += len; - err = walk_component(nd, &next, LOOKUP_FOLLOW); if (err < 0) return err; -- cgit v1.2.3 From a5c3e1c725af9e84deceb3c33939ca4ffe3fefc8 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 16 Sep 2014 04:16:19 -0500 Subject: Revert "cifs: No need to send SIGKILL to demux_thread during umount" This reverts commit 52a36244443eabb594bdb63622ff2dd7a083f0e2. Causes rmmod to fail for at least 7 seconds after unmount which makes automated testing a little harder when reloading cifs.ko between test runs. Signed-off-by: Namjae Jeon CC: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8a9fded7c135..36ca2045009b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -837,6 +837,7 @@ cifs_demultiplex_thread(void *p) struct TCP_Server_Info *server = p; unsigned int pdu_length; char *buf = NULL; + struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; current->flags |= PF_MEMALLOC; @@ -927,7 +928,19 @@ cifs_demultiplex_thread(void *p) if (server->smallbuf) /* no sense logging a debug message if NULL */ cifs_small_buf_release(server->smallbuf); + task_to_wake = xchg(&server->tsk, NULL); clean_demultiplex_info(server); + + /* if server->tsk was NULL then wait for a signal before exiting */ + if (!task_to_wake) { + set_current_state(TASK_INTERRUPTIBLE); + while (!signal_pending(current)) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + set_current_state(TASK_RUNNING); + } + module_put_and_exit(0); } @@ -2050,6 +2063,8 @@ cifs_find_tcp_session(struct smb_vol *vol) static void cifs_put_tcp_session(struct TCP_Server_Info *server) { + struct task_struct *task; + spin_lock(&cifs_tcp_ses_lock); if (--server->srv_count > 0) { spin_unlock(&cifs_tcp_ses_lock); @@ -2073,6 +2088,10 @@ cifs_put_tcp_session(struct TCP_Server_Info *server) kfree(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; + + task = xchg(&server->tsk, NULL); + if (task) + force_sig(SIGKILL, task); } static struct TCP_Server_Info * -- cgit v1.2.3 From 116ae5e2b09f7022281c253a6037a74d0446bfaf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Sep 2014 17:20:56 +0200 Subject: cifs: remove dead code cifs provides two dummy functions 'sess_auth_lanman' and 'sess_auth_kerberos' for the case in which the respective features are not defined. However, the caller is also under an #ifdef, so we just get warnings about unused code: fs/cifs/sess.c:1109:1: warning: 'sess_auth_kerberos' defined but not used [-Wunused-function] sess_auth_kerberos(struct sess_data *sess_data) Removing the dead functions gets rid of the warnings without any downsides that I can see. (Yalin Wang reported the identical problem and fix so added him) Signed-off-by: Arnd Bergmann Signed-off-by: Yalin Wang Signed-off-by: Steve French --- fs/cifs/sess.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'fs') diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 3a5e83317683..57db63ff88da 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -745,14 +745,6 @@ out: sess_free_buffer(sess_data); } -#else - -static void -sess_auth_lanman(struct sess_data *sess_data) -{ - sess_data->result = -EOPNOTSUPP; - sess_data->func = NULL; -} #endif static void @@ -1103,15 +1095,6 @@ out: ses->auth_key.response = NULL; } -#else - -static void -sess_auth_kerberos(struct sess_data *sess_data) -{ - cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); - sess_data->result = -ENOSYS; - sess_data->func = NULL; -} #endif /* ! CONFIG_CIFS_UPCALL */ /* -- cgit v1.2.3 From 69af38dbc5b44e90dde35af4a1df3f5510809a1a Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 16 Sep 2014 04:13:31 -0500 Subject: Update version number displayed by modinfo for cifs.ko Update cifs.ko version to 2.05 Signed-off-by: Steve French w --- fs/cifs/cifsfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index b0fafa499505..002e0c173939 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.04" +#define CIFS_VERSION "2.05" #endif /* _CIFSFS_H */ -- cgit v1.2.3 From 364d42930d96a872b2076deeb9c24f9ff132de34 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 16 Sep 2014 06:40:25 -0500 Subject: Fix mfsymlinks file size check If the mfsymlinks file size has changed (e.g. the file no longer represents an emulated symlink) we were not returning an error properly. Signed-off-by: Steve French Reviewed-by: Stefan Metzmacher --- fs/cifs/link.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/link.c b/fs/cifs/link.c index a5c2812ead68..5657416d3483 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -343,9 +343,11 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) + if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { + rc = -ENOENT; /* it's not a symlink */ goto out; + } io_parms.netfid = fid.netfid; io_parms.pid = current->tgid; -- cgit v1.2.3 From a060dc5010ffa32f3a83e5336f6eeb6551fa137a Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 16 Sep 2014 13:07:35 +0100 Subject: vfs: workaround gcc <4.6 build error in link_path_walk() Commit d6bb3e9075bb ("vfs: simplify and shrink stack frame of link_path_walk()") introduced build problems with GCC versions older than 4.6 due to the initialisation of a member of an anonymous union in struct qstr without enclosing braces. This hits GCC bug 10676 [1] (which was fixed in GCC 4.6 by [2]), and causes the following build error: fs/namei.c: In function 'link_path_walk': fs/namei.c:1778: error: unknown field 'hash_len' specified in initializer This is worked around by adding explicit braces. [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=10676 [2] https://gcc.gnu.org/viewcvs/gcc?view=revision&revision=159206 Fixes: d6bb3e9075bb (vfs: simplify and shrink stack frame of link_path_walk()) Signed-off-by: James Hogan Cc: Linus Torvalds Cc: Alexander Viro Cc: Geert Uytterhoeven Cc: linux-fsdevel@vger.kernel.org Cc: linux-metag@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 01d03892316c..a7b05bf82d31 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1775,7 +1775,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) struct dentry *parent = nd->path.dentry; nd->flags &= ~LOOKUP_JUMPED; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { - struct qstr this = { .hash_len = hash_len, .name = name }; + struct qstr this = { { .hash_len = hash_len }, .name = name }; err = parent->d_op->d_hash(parent, &this); if (err < 0) break; -- cgit v1.2.3 From 125c4cf9f37c98fed2c08229b31358cfec63dcf6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 11 Sep 2014 21:22:14 +0100 Subject: Btrfs: set inode's logged_trans/last_log_commit after ranged fsync When a ranged fsync finishes if there are still extent maps in the modified list, still set the inode's logged_trans and last_log_commit. This is important in case an inode is fsync'ed and unlinked in the same transaction, to ensure its inode ref gets deleted from the log and the respective dentries in its parent are deleted too from the log (if the parent directory was fsync'ed in the same transaction). Instead make btrfs_inode_in_log() return false if the list of modified extent maps isn't empty. This is an incremental on top of the v4 version of the patch: "Btrfs: fix fsync data loss after a ranged fsync" which was added to its v5, but didn't make it on time. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 13 +++++++++++-- fs/btrfs/tree-log.c | 14 ++------------ 2 files changed, 13 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 43527fd78825..56b8522d5767 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -234,8 +234,17 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit && BTRFS_I(inode)->last_sub_trans <= - BTRFS_I(inode)->root->last_log_commit) - return 1; + BTRFS_I(inode)->root->last_log_commit) { + /* + * After a ranged fsync we might have left some extent maps + * (that fall outside the fsync's range). So return false + * here if the list isn't empty, to make sure btrfs_log_inode() + * will be called and process those extent maps. + */ + smp_mb(); + if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents)) + return 1; + } return 0; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d296efe2d3e7..1d1ba083ca6e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4093,18 +4093,8 @@ log_extents: } } - write_lock(&em_tree->lock); - /* - * If we're doing a ranged fsync and there are still modified extents - * in the list, we must run on the next fsync call as it might cover - * those extents (a full fsync or an fsync for other range). - */ - if (list_empty(&em_tree->modified_extents)) { - BTRFS_I(inode)->logged_trans = trans->transid; - BTRFS_I(inode)->last_log_commit = - BTRFS_I(inode)->last_sub_trans; - } - write_unlock(&em_tree->lock); + BTRFS_I(inode)->logged_trans = trans->transid; + BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; out_unlock: if (unlikely(err)) btrfs_put_logged_extents(&logged_list); -- cgit v1.2.3 From 3e1199dcad004a40b55297a4736ccd1b9b81a952 Mon Sep 17 00:00:00 2001 From: Milosz Tanski Date: Wed, 13 Aug 2014 12:58:26 -0400 Subject: FS-Cache: refcount becomes corrupt under vma pressure. In rare cases under heavy VMA pressure the ref count for a fscache cookie becomes corrupt. In this case we decrement ref count even if we fail before incrementing the refcount. FS-Cache: Assertion failed bnode-eca5f9c6/syslog 0 > 0 is false ------------[ cut here ]------------ kernel BUG at fs/fscache/cookie.c:519! invalid opcode: 0000 [#1] SMP Call Trace: [] __fscache_relinquish_cookie+0x50/0x220 [fscache] [] ceph_fscache_unregister_inode_cookie+0x3e/0x50 [ceph] [] ceph_destroy_inode+0x33/0x200 [ceph] [] ? __fsnotify_inode_delete+0xe/0x10 [] destroy_inode+0x3c/0x70 [] evict+0x111/0x180 [] iput+0x103/0x190 [] __dentry_kill+0x1c8/0x220 [] shrink_dentry_list+0xf1/0x250 [] prune_dcache_sb+0x4c/0x60 [] super_cache_scan+0xff/0x170 [] shrink_slab_node+0x140/0x2c0 [] shrink_slab+0x8a/0x130 [] balance_pgdat+0x3e2/0x5d0 [] kswapd+0x16a/0x4a0 [] ? __wake_up_sync+0x20/0x20 [] ? balance_pgdat+0x5d0/0x5d0 [] kthread+0xc9/0xe0 [] ? ftrace_raw_event_xen_mmu_release_ptpage+0x70/0x90 [] ? flush_kthread_worker+0xb0/0xb0 [] ret_from_fork+0x7c/0xb0 [] ? flush_kthread_worker+0xb0/0xb0 RIP [] __fscache_disable_cookie+0x1db/0x210 [fscache] RSP ---[ end trace 254d0d7c74a01f25 ]--- Signed-off-by: Milosz Tanski Signed-off-by: David Howells --- fs/fscache/page.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 781ac7b0b53e..de33b3fccca6 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -198,7 +198,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) { struct fscache_operation *op; struct fscache_object *object; - bool wake_cookie; + bool wake_cookie = false; _enter("%p", cookie); @@ -228,15 +228,16 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) __fscache_use_cookie(cookie); if (fscache_submit_exclusive_op(object, op) < 0) - goto nobufs; + goto nobufs_dec; spin_unlock(&cookie->lock); fscache_stat(&fscache_n_attr_changed_ok); fscache_put_operation(op); _leave(" = 0"); return 0; -nobufs: +nobufs_dec: wake_cookie = __fscache_unuse_cookie(cookie); +nobufs: spin_unlock(&cookie->lock); kfree(op); if (wake_cookie) -- cgit v1.2.3 From 696382f938d22597f4945865ed8e3f25e240cd41 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 27 Aug 2014 15:06:20 +0100 Subject: cachefiles: remove two unused pagevecs. These two have been unused since commit c4d6d8dbf335c7fa47341654a37c53a512b519bb CacheFiles: Fix the marking of cached pages in 3.8. Signed-off-by: NeilBrown Signed-off-by: David Howells --- fs/cachefiles/rdwr.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 4b1fb5ca65b8..25e745b8eb1b 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -151,7 +151,6 @@ static void cachefiles_read_copier(struct fscache_operation *_op) struct cachefiles_one_read *monitor; struct cachefiles_object *object; struct fscache_retrieval *op; - struct pagevec pagevec; int error, max; op = container_of(_op, struct fscache_retrieval, op); @@ -160,8 +159,6 @@ static void cachefiles_read_copier(struct fscache_operation *_op) _enter("{ino=%lu}", object->backer->d_inode->i_ino); - pagevec_init(&pagevec, 0); - max = 8; spin_lock_irq(&object->work_lock); @@ -396,7 +393,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, { struct cachefiles_object *object; struct cachefiles_cache *cache; - struct pagevec pagevec; struct inode *inode; sector_t block0, block; unsigned shift; @@ -427,8 +423,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, op->op.flags |= FSCACHE_OP_ASYNC; op->op.processor = cachefiles_read_copier; - pagevec_init(&pagevec, 0); - /* we assume the absence or presence of the first block is a good * enough indication for the page as a whole * - TODO: don't use bmap() for this as it is _not_ actually good -- cgit v1.2.3 From e2cf1f1cc7636bd860e47cd0ad6194da8975f8b5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 17 Sep 2014 23:28:38 +0100 Subject: CacheFiles: Handle rename2 Not all filesystems now provide the rename i_op - ext4 for one - but rather provide the rename2 i_op. CacheFiles checks that the filesystem has rename and so will reject ext4 now with EPERM: CacheFiles: Failed to register: -1 Fix this by checking for rename2 as an alternative. The call to vfs_rename() actually handles selection of the appropriate function, so we needn't worry about that. Turning on debugging shows: [cachef] ==> cachefiles_get_directory(,,cache) [cachef] subdir -> ffff88000b22b778 positive [cachef] <== cachefiles_get_directory() = -1 [check] where -1 is EPERM. Signed-off-by: David Howells --- fs/cachefiles/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 5bf2b41e66d3..83e9c94ca2cf 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -779,7 +779,8 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, !subdir->d_inode->i_op->lookup || !subdir->d_inode->i_op->mkdir || !subdir->d_inode->i_op->create || - !subdir->d_inode->i_op->rename || + (!subdir->d_inode->i_op->rename && + !subdir->d_inode->i_op->rename2) || !subdir->d_inode->i_op->rmdir || !subdir->d_inode->i_op->unlink) goto check_error; -- cgit v1.2.3 From 0f23ae74f589304bf33233f85737f4fd368549eb Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 18 Sep 2014 07:49:05 -0700 Subject: Revert "Btrfs: device_list_add() should not update list when mounted" This reverts commit b96de000bc8bc9688b3a2abea4332bd57648a49f. This commit is triggering failures to mount by subvolume id in some configurations. The main problem is how many different ways this scanning function is used, both for scanning while mounted and unmounted. A proper cleanup is too big for late rcs. For now, just revert the commit and we'll put a better fix into a later merge window. Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 340a92d08e84..2c2d6d1d8eee 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -529,12 +529,12 @@ static noinline int device_list_add(const char *path, */ /* - * As of now don't allow update to btrfs_fs_device through - * the btrfs dev scan cli, after FS has been mounted. + * For now, we do allow update to btrfs_fs_device through the + * btrfs dev scan cli after FS has been mounted. We're still + * tracking a problem where systems fail mount by subvolume id + * when we reject replacement on a mounted FS. */ - if (fs_devices->opened) { - return -EBUSY; - } else { + if (!fs_devices->opened && found_transid < device->generation) { /* * That is if the FS is _not_ mounted and if you * are here, that means there is more than one @@ -542,8 +542,7 @@ static noinline int device_list_add(const char *path, * with larger generation number or the last-in if * generation are equal. */ - if (found_transid < device->generation) - return -EEXIST; + return -EEXIST; } name = rcu_string_strdup(path, GFP_NOFS); -- cgit v1.2.3 From 080af20cc945d110f9912d01cf6b66f94a375b8d Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Thu, 18 Sep 2014 09:13:17 -0400 Subject: NFSv4: nfs4_state_manager() vs. nfs_server_remove_lists() There is a race between nfs4_state_manager() and nfs_server_remove_lists() that happens during a nfsv3 mount. The v3 mount notices there is already a supper block so nfs_server_remove_lists() called which uses the nfs_client_lock spin lock to synchronize access to the client list. At the same time nfs4_state_manager() is running through the client list looking for work to do, using the same lock. When nfs4_state_manager() wins the race to the list, a v3 client pointer is found and not ignored properly which causes the panic. Moving some protocol checks before the state checking avoids the panic. CC: Stable Tree Signed-off-by: Steve Dickson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 53e435a95260..ffdb28d86cf8 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -482,6 +482,16 @@ int nfs40_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { + + if (pos->rpc_ops != new->rpc_ops) + continue; + + if (pos->cl_proto != new->cl_proto) + continue; + + if (pos->cl_minorversion != new->cl_minorversion) + continue; + /* If "pos" isn't marked ready, we can't trust the * remaining fields in "pos" */ if (pos->cl_cons_state > NFS_CS_READY) { @@ -501,15 +511,6 @@ int nfs40_walk_client_list(struct nfs_client *new, if (pos->cl_cons_state != NFS_CS_READY) continue; - if (pos->rpc_ops != new->rpc_ops) - continue; - - if (pos->cl_proto != new->cl_proto) - continue; - - if (pos->cl_minorversion != new->cl_minorversion) - continue; - if (pos->cl_clientid != new->cl_clientid) continue; @@ -622,6 +623,16 @@ int nfs41_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { + + if (pos->rpc_ops != new->rpc_ops) + continue; + + if (pos->cl_proto != new->cl_proto) + continue; + + if (pos->cl_minorversion != new->cl_minorversion) + continue; + /* If "pos" isn't marked ready, we can't trust the * remaining fields in "pos", especially the client * ID and serverowner fields. Wait for CREATE_SESSION @@ -647,15 +658,6 @@ int nfs41_walk_client_list(struct nfs_client *new, if (pos->cl_cons_state != NFS_CS_READY) continue; - if (pos->rpc_ops != new->rpc_ops) - continue; - - if (pos->cl_proto != new->cl_proto) - continue; - - if (pos->cl_minorversion != new->cl_minorversion) - continue; - if (!nfs4_match_clientids(pos, new)) continue; -- cgit v1.2.3 From cd9288ffaea4359d5cfe2b8d264911506aed26a4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 18 Sep 2014 11:51:32 -0400 Subject: NFSv4: Fix another bug in the close/open_downgrade code James Drew reports another bug whereby the NFS client is now sending an OPEN_DOWNGRADE in a situation where it should really have sent a CLOSE: the client is opening the file for O_RDWR, but then trying to do a downgrade to O_RDONLY, which is not allowed by the NFSv4 spec. Reported-by: James Drews Link: http://lkml.kernel.org/r/541AD7E5.8020409@engr.wisc.edu Fixes: aee7af356e15 (NFSv4: Fix problems with close in the presence...) Cc: stable@vger.kernel.org # 2.6.33+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ac2dd953fc18..6ca0c8e7a945 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2618,23 +2618,23 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); - /* Calculate the current open share mode */ - calldata->arg.fmode = 0; - if (is_rdonly || is_rdwr) - calldata->arg.fmode |= FMODE_READ; - if (is_wronly || is_rdwr) - calldata->arg.fmode |= FMODE_WRITE; /* Calculate the change in open mode */ + calldata->arg.fmode = 0; if (state->n_rdwr == 0) { - if (state->n_rdonly == 0) { - call_close |= is_rdonly || is_rdwr; - calldata->arg.fmode &= ~FMODE_READ; - } - if (state->n_wronly == 0) { - call_close |= is_wronly || is_rdwr; - calldata->arg.fmode &= ~FMODE_WRITE; - } - } + if (state->n_rdonly == 0) + call_close |= is_rdonly; + else if (is_rdonly) + calldata->arg.fmode |= FMODE_READ; + if (state->n_wronly == 0) + call_close |= is_wronly; + else if (is_wronly) + calldata->arg.fmode |= FMODE_WRITE; + } else if (is_rdwr) + calldata->arg.fmode |= FMODE_READ|FMODE_WRITE; + + if (calldata->arg.fmode == 0) + call_close |= is_rdwr; + if (!nfs4_valid_open_stateid(state)) call_close = 0; spin_unlock(&state->owner->so_lock); -- cgit v1.2.3 From f2d5a94436cc7cc0221b9a81bba2276a25187dd3 Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Mon, 22 Sep 2014 01:53:03 +0100 Subject: Fix nasty 32-bit overflow bug in buffer i/o code. On 32-bit architectures, the legacy buffer_head functions are not always handling the sector number with the proper 64-bit types, and will thus fail on 4TB+ disks. Any code that uses __getblk() (and thus bread(), breadahead(), sb_bread(), sb_breadahead(), sb_getblk()), and calls it using a 64-bit block on a 32-bit arch (where "long" is 32-bit) causes an inifinite loop in __getblk_slow() with an infinite stream of errors logged to dmesg like this: __find_get_block_slow() failed. block=6740375944, b_blocknr=2445408648 b_state=0x00000020, b_size=512 device sda1 blocksize: 512 Note how in hex block is 0x191C1F988 and b_blocknr is 0x91C1F988 i.e. the top 32-bits are missing (in this case the 0x1 at the top). This is because grow_dev_page() is broken and has a 32-bit overflow due to shifting the page index value (a pgoff_t - which is just 32 bits on 32-bit architectures) left-shifted as the block number. But the top bits to get lost as the pgoff_t is not type cast to sector_t / 64-bit before the shift. This patch fixes this issue by type casting "index" to sector_t before doing the left shift. Note this is not a theoretical bug but has been seen in the field on a 4TiB hard drive with logical sector size 512 bytes. This patch has been verified to fix the infinite loop problem on 3.17-rc5 kernel using a 4TB disk image mounted using "-o loop". Without this patch doing a "find /nt" where /nt is an NTFS volume causes the inifinite loop 100% reproducibly whilst with the patch it works fine as expected. Signed-off-by: Anton Altaparmakov Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/buffer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 8f05111bbb8b..3588a80854b2 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1022,7 +1022,8 @@ grow_dev_page(struct block_device *bdev, sector_t block, bh = page_buffers(page); if (bh->b_size == size) { end_block = init_page_buffers(page, bdev, - index << sizebits, size); + (sector_t)index << sizebits, + size); goto done; } if (!try_to_free_buffers(page)) @@ -1043,7 +1044,8 @@ grow_dev_page(struct block_device *bdev, sector_t block, */ spin_lock(&inode->i_mapping->private_lock); link_dev_buffers(page, bh); - end_block = init_page_buffers(page, bdev, index << sizebits, size); + end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits, + size); spin_unlock(&inode->i_mapping->private_lock); done: ret = (block < end_block) ? 1 : -ENXIO; -- cgit v1.2.3